From 01d48e55a5aa683828dcb88fee2d811c8262d3e9 Mon Sep 17 00:00:00 2001 From: Jan Philipp Hafer Date: Sat, 5 Feb 2022 03:32:29 +0100 Subject: [PATCH] compiler_rt: optimize mulo - use usize to decide if register size is big enough to store multiplication result or if division is necessary - multiplication routine with check of integer bounds - wrapping multiplication and division routine from Hacker's Delight --- lib/std/special/compiler_rt/mulo.zig | 97 ++++++++++++++-------------- 1 file changed, 49 insertions(+), 48 deletions(-) diff --git a/lib/std/special/compiler_rt/mulo.zig b/lib/std/special/compiler_rt/mulo.zig index 9fa5d3830b..df4c98134c 100644 --- a/lib/std/special/compiler_rt/mulo.zig +++ b/lib/std/special/compiler_rt/mulo.zig @@ -1,67 +1,68 @@ const builtin = @import("builtin"); +const std = @import("std"); +const math = std.math; // mulo - multiplication overflow -// - muloXi4_generic for unoptimized version +// * return a*b. +// * return if a*b overflows => 1 else => 0 +// - muloXi4_genericSmall as default +// - muloXi4_genericFast for 2*bitsize <= usize -// return a*b. -// return if a*b overflows => 1 else => 0 -// see https://stackoverflow.com/a/26320664 for possible implementations - -inline fn muloXi4_generic(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST { +inline fn muloXi4_genericSmall(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST { @setRuntimeSafety(builtin.is_test); - const BSIZE = @bitSizeOf(ST); - comptime var UT = switch (ST) { - i32 => u32, - i64 => u64, - i128 => u128, + overflow.* = 0; + const min = math.minInt(ST); + var res: ST = a *% b; + // Hacker's Delight section Overflow subsection Multiplication + // case a=-2^{31}, b=-1 problem, because + // on some machines a*b = -2^{31} with overflow + // Then -2^{31}/-1 overflows and any result is possible. 
+ // => check with a<0 and b=-2^{31} + if ((a < 0 and b == min) or (a != 0 and @divTrunc(res, a) != b)) + overflow.* = 1; + return res; +} + +inline fn muloXi4_genericFast(comptime ST: type, a: ST, b: ST, overflow: *c_int) ST { + @setRuntimeSafety(builtin.is_test); + overflow.* = 0; + const EST = switch (ST) { + i32 => i64, + i64 => i128, + i128 => i256, else => unreachable, }; - const min = @bitCast(ST, @as(UT, 1 << (BSIZE - 1))); - const max = ~min; - overflow.* = 0; - const result = a *% b; - - // edge cases - if (a == min) { - if (b != 0 and b != 1) overflow.* = 1; - return result; - } - if (b == min) { - if (a != 0 and a != 1) overflow.* = 1; - return result; - } - - // take sign of x sx - const sa = a >> (BSIZE - 1); - const sb = b >> (BSIZE - 1); - // take absolute value of a and b via - // abs(x) = (x^sx)) - sx - const abs_a = (a ^ sa) -% sa; - const abs_b = (b ^ sb) -% sb; - - // unitary magnitude, cannot have overflow - if (abs_a < 2 or abs_b < 2) return result; - - // compare the signs of operands - if ((a ^ b) >> (BSIZE - 1) != 0) { - if (abs_a > @divTrunc(max, abs_b)) overflow.* = 1; - } else { - if (abs_a > @divTrunc(min, -abs_b)) overflow.* = 1; - } - - return result; + const min = math.minInt(ST); + const max = math.maxInt(ST); + var res: EST = @as(EST, a) * @as(EST, b); + //invariant: -2^{bitwidth(EST)} < res < 2^{bitwidth(EST)-1} + if (res < min or max < res) + overflow.* = 1; + return @truncate(ST, res); } pub fn __mulosi4(a: i32, b: i32, overflow: *c_int) callconv(.C) i32 { - return muloXi4_generic(i32, a, b, overflow); + if (2 * @bitSizeOf(i32) <= @bitSizeOf(usize)) { + return muloXi4_genericFast(i32, a, b, overflow); + } else { + return muloXi4_genericSmall(i32, a, b, overflow); + } } pub fn __mulodi4(a: i64, b: i64, overflow: *c_int) callconv(.C) i64 { - return muloXi4_generic(i64, a, b, overflow); + if (2 * @bitSizeOf(i64) <= @bitSizeOf(usize)) { + return muloXi4_genericFast(i64, a, b, overflow); + } else { + return muloXi4_genericSmall(i64, 
a, b, overflow); + } } pub fn __muloti4(a: i128, b: i128, overflow: *c_int) callconv(.C) i128 { - return muloXi4_generic(i128, a, b, overflow); + if (2 * @bitSizeOf(i128) <= @bitSizeOf(usize)) { + return muloXi4_genericFast(i128, a, b, overflow); + } else { + return muloXi4_genericSmall(i128, a, b, overflow); + } } test {