From 947dd36341bdef0cb359a008257e7f1638f60594 Mon Sep 17 00:00:00 2001
From: Marc Tiehuis <marc@tiehu.is>
Date: Thu, 13 Apr 2023 18:20:17 +1200
Subject: [PATCH 1/2] optimize udivmod

See https://reviews.llvm.org/D81809 for upstream description.

In summary this is ~10x improvement for small divisors and similar
performance for equal divisors.

Closes #13523.
---
 lib/compiler_rt/udivmod.zig | 314 ++++++++++++++++--------------------
 1 file changed, 135 insertions(+), 179 deletions(-)

diff --git a/lib/compiler_rt/udivmod.zig b/lib/compiler_rt/udivmod.zig
index bae4365295..55abade733 100644
--- a/lib/compiler_rt/udivmod.zig
+++ b/lib/compiler_rt/udivmod.zig
@@ -1,201 +1,157 @@
+const std = @import("std");
 const builtin = @import("builtin");
 const is_test = builtin.is_test;
-const native_endian = builtin.cpu.arch.endian();
-const std = @import("std");
+const Log2Int = std.math.Log2Int;
 
-const low = switch (native_endian) {
+const lo = switch (builtin.cpu.arch.endian()) {
     .Big => 1,
     .Little => 0,
 };
-const high = 1 - low;
+const hi = 1 - lo;
 
-pub fn udivmod(comptime DoubleInt: type, a: DoubleInt, b: DoubleInt, maybe_rem: ?*DoubleInt) DoubleInt {
+fn HalfInt(comptime T: type) type {
+    std.debug.assert(@typeInfo(T) == .Int);
+    std.debug.assert(@bitSizeOf(T) % 2 == 0);
+    return std.meta.Int(.unsigned, @bitSizeOf(T) / 2);
+}
+
+// Performs division of a double-word specified in its single-word components. Most commonly used
+// for computing u128 bit divisions in terms of 64-bit integers.
+//
+// q = U / v
+// r = U % v
+// where  U = (u1 | u0)
+fn divwide_generic(comptime T: type, _u1: T, _u0: T, v_: T, r: *T) T {
     @setRuntimeSafety(is_test);
+    var v = v_;
 
-    const double_int_bits = @typeInfo(DoubleInt).Int.bits;
-    const single_int_bits = @divExact(double_int_bits, 2);
-    const SingleInt = std.meta.Int(.unsigned, single_int_bits);
-    const SignedDoubleInt = std.meta.Int(.signed, double_int_bits);
-    const Log2SingleInt = std.math.Log2Int(SingleInt);
+    const b = @as(T, 1) << (@bitSizeOf(T) / 2);
+    var un64: T = undefined;
+    var un10: T = undefined;
 
-    const n = @bitCast([2]SingleInt, a);
-    const d = @bitCast([2]SingleInt, b);
-    var q: [2]SingleInt = undefined;
-    var r: [2]SingleInt = undefined;
-    var sr: c_uint = undefined;
-    // special cases, X is unknown, K != 0
-    if (n[high] == 0) {
-        if (d[high] == 0) {
-            // 0 X
-            // ---
-            // 0 X
-            if (maybe_rem) |rem| {
-                rem.* = n[low] % d[low];
-            }
-            return n[low] / d[low];
-        }
-        // 0 X
-        // ---
-        // K X
+    const s = @intCast(Log2Int(T), @clz(v));
+    if (s > 0) {
+        // Normalize divisor
+        v <<= s;
+        un64 = (_u1 << s) | (_u0 >> @intCast(Log2Int(T), (@bitSizeOf(T) - @intCast(T, s))));
+        un10 = _u0 << s;
+    } else {
+        // Avoid undefined behavior of (u0 >> @bitSizeOf(T))
+        un64 = _u1;
+        un10 = _u0;
+    }
+
+    // Break divisor up into two 32-bit digits
+    const vn1 = v >> (@bitSizeOf(T) / 2);
+    const vn0 = v & std.math.maxInt(HalfInt(T));
+
+    // Break right half of dividend into two digits
+    const un1 = un10 >> (@bitSizeOf(T) / 2);
+    const un0 = un10 & std.math.maxInt(HalfInt(T));
+
+    // Compute the first quotient digit, q1
+    var q1 = un64 / vn1;
+    var rhat = un64 -% q1 *% vn1;
+
+    // q1 has at most error 2. No more than 2 iterations
+    while (q1 >= b or q1 * vn0 > b * rhat + un1) {
+        q1 -= 1;
+        rhat += vn1;
+        if (rhat >= b) break;
+    }
+
+    var un21 = un64 *% b +% un1 -% q1 *% v;
+
+    // Compute the second quotient digit
+    var q0 = un21 / vn1;
+    rhat = un21 -% q0 *% vn1;
+
+    // q0 has at most error 2. No more than 2 iterations.
+    while (q0 >= b or q0 * vn0 > b * rhat + un0) {
+        q0 -= 1;
+        rhat += vn1;
+        if (rhat >= b) break;
+    }
+
+    r.* = (un21 *% b +% un0 -% q0 *% v) >> s;
+    return q1 *% b +% q0;
+}
+
+fn divwide(comptime T: type, _u1: T, _u0: T, v: T, r: *T) T {
+    @setRuntimeSafety(is_test);
+    if (T == u64 and builtin.target.cpu.arch == .x86_64) {
+        var rem: T = undefined;
+        const quo = asm (
+            \\divq %[v]
+            : [_] "={rax}" (-> T),
+              [_] "={rdx}" (rem),
+            : [v] "r" (v),
+              [_] "{rax}" (_u0),
+              [_] "{rdx}" (_u1),
+        );
+        r.* = rem;
+        return quo;
+    } else {
+        return divwide_generic(T, _u1, _u0, v, r);
+    }
+}
+
+// return q = a / b, *r = a % b
+pub fn udivmod(comptime T: type, a_: T, b_: T, maybe_rem: ?*T) T {
+    @setRuntimeSafety(is_test);
+    const HalfT = HalfInt(T);
+    const SignedT = std.meta.Int(.signed, @bitSizeOf(T));
+
+    if (b_ > a_) {
         if (maybe_rem) |rem| {
-            rem.* = n[low];
+            rem.* = a_;
         }
         return 0;
     }
-    // n[high] != 0
-    if (d[low] == 0) {
-        if (d[high] == 0) {
-            // K X
-            // ---
-            // 0 0
-            if (maybe_rem) |rem| {
-                rem.* = n[high] % d[low];
-            }
-            return n[high] / d[low];
-        }
-        // d[high] != 0
-        if (n[low] == 0) {
-            // K 0
-            // ---
-            // K 0
-            if (maybe_rem) |rem| {
-                r[high] = n[high] % d[high];
-                r[low] = 0;
-                rem.* = @bitCast(DoubleInt, r);
-            }
-            return n[high] / d[high];
-        }
-        // K K
-        // ---
-        // K 0
-        if ((d[high] & (d[high] - 1)) == 0) {
-            // d is a power of 2
-            if (maybe_rem) |rem| {
-                r[low] = n[low];
-                r[high] = n[high] & (d[high] - 1);
-                rem.* = @bitCast(DoubleInt, r);
-            }
-            return n[high] >> @intCast(Log2SingleInt, @ctz(d[high]));
-        }
-        // K K
-        // ---
-        // K 0
-        sr = @bitCast(c_uint, @as(c_int, @clz(d[high])) - @as(c_int, @clz(n[high])));
-        // 0 <= sr <= single_int_bits - 2 or sr large
-        if (sr > single_int_bits - 2) {
-            if (maybe_rem) |rem| {
-                rem.* = a;
-            }
-            return 0;
-        }
-        sr += 1;
-        // 1 <= sr <= single_int_bits - 1
-        // q.all = a << (double_int_bits - sr);
-        q[low] = 0;
-        q[high] = n[low] << @intCast(Log2SingleInt, single_int_bits - sr);
-        // r.all = a >> sr;
-        r[high] = n[high] >> @intCast(Log2SingleInt, sr);
-        r[low] = (n[high] << @intCast(Log2SingleInt, single_int_bits - sr)) | (n[low] >> @intCast(Log2SingleInt, sr));
-    } else {
-        // d[low] != 0
-        if (d[high] == 0) {
-            // K X
-            // ---
-            // 0 K
-            if ((d[low] & (d[low] - 1)) == 0) {
-                // d is a power of 2
-                if (maybe_rem) |rem| {
-                    rem.* = n[low] & (d[low] - 1);
-                }
-                if (d[low] == 1) {
-                    return a;
-                }
-                sr = @ctz(d[low]);
-                q[high] = n[high] >> @intCast(Log2SingleInt, sr);
-                q[low] = (n[high] << @intCast(Log2SingleInt, single_int_bits - sr)) | (n[low] >> @intCast(Log2SingleInt, sr));
-                return @bitCast(DoubleInt, q);
-            }
-            // K X
-            // ---
-            // 0 K
-            sr = 1 + single_int_bits + @as(c_uint, @clz(d[low])) - @as(c_uint, @clz(n[high]));
-            // 2 <= sr <= double_int_bits - 1
-            // q.all = a << (double_int_bits - sr);
-            // r.all = a >> sr;
-            if (sr == single_int_bits) {
-                q[low] = 0;
-                q[high] = n[low];
-                r[high] = 0;
-                r[low] = n[high];
-            } else if (sr < single_int_bits) {
-                // 2 <= sr <= single_int_bits - 1
-                q[low] = 0;
-                q[high] = n[low] << @intCast(Log2SingleInt, single_int_bits - sr);
-                r[high] = n[high] >> @intCast(Log2SingleInt, sr);
-                r[low] = (n[high] << @intCast(Log2SingleInt, single_int_bits - sr)) | (n[low] >> @intCast(Log2SingleInt, sr));
-            } else {
-                // single_int_bits + 1 <= sr <= double_int_bits - 1
-                q[low] = n[low] << @intCast(Log2SingleInt, double_int_bits - sr);
-                q[high] = (n[high] << @intCast(Log2SingleInt, double_int_bits - sr)) | (n[low] >> @intCast(Log2SingleInt, sr - single_int_bits));
-                r[high] = 0;
-                r[low] = n[high] >> @intCast(Log2SingleInt, sr - single_int_bits);
-            }
+
+    var a = @bitCast([2]HalfT, a_);
+    var b = @bitCast([2]HalfT, b_);
+    var q: [2]HalfT = undefined;
+    var r: [2]HalfT = undefined;
+
+    // When the divisor fits in 64 bits, we can use an optimized path
+    if (b[hi] == 0) {
+        r[hi] = 0;
+        if (a[hi] < b[lo]) {
+            // The result fits in 64 bits
+            q[hi] = 0;
+            q[lo] = divwide(HalfT, a[hi], a[lo], b[lo], &r[lo]);
         } else {
-            // K X
-            // ---
-            // K K
-            sr = @bitCast(c_uint, @as(c_int, @clz(d[high])) - @as(c_int, @clz(n[high])));
-            // 0 <= sr <= single_int_bits - 1 or sr large
-            if (sr > single_int_bits - 1) {
-                if (maybe_rem) |rem| {
-                    rem.* = a;
-                }
-                return 0;
-            }
-            sr += 1;
-            // 1 <= sr <= single_int_bits
-            // q.all = a << (double_int_bits - sr);
-            // r.all = a >> sr;
-            q[low] = 0;
-            if (sr == single_int_bits) {
-                q[high] = n[low];
-                r[high] = 0;
-                r[low] = n[high];
-            } else {
-                r[high] = n[high] >> @intCast(Log2SingleInt, sr);
-                r[low] = (n[high] << @intCast(Log2SingleInt, single_int_bits - sr)) | (n[low] >> @intCast(Log2SingleInt, sr));
-                q[high] = n[low] << @intCast(Log2SingleInt, single_int_bits - sr);
-            }
+            // First, divide with the high part to get the remainder. After that a_hi < b_lo.
+            q[hi] = a[hi] / b[lo];
+            q[lo] = divwide(HalfT, a[hi] % b[lo], a[lo], b[lo], &r[lo]);
         }
+        if (maybe_rem) |rem| {
+            rem.* = @bitCast(T, r);
+        }
+        return @bitCast(T, q);
     }
-    // Not a special case
-    // q and r are initialized with:
-    // q.all = a << (double_int_bits - sr);
-    // r.all = a >> sr;
-    // 1 <= sr <= double_int_bits - 1
-    var carry: u32 = 0;
-    var r_all: DoubleInt = undefined;
-    while (sr > 0) : (sr -= 1) {
-        // r:q = ((r:q)  << 1) | carry
-        r[high] = (r[high] << 1) | (r[low] >> (single_int_bits - 1));
-        r[low] = (r[low] << 1) | (q[high] >> (single_int_bits - 1));
-        q[high] = (q[high] << 1) | (q[low] >> (single_int_bits - 1));
-        q[low] = (q[low] << 1) | carry;
-        // carry = 0;
-        // if (r.all >= b)
-        // {
-        //     r.all -= b;
-        //      carry = 1;
+
+    // 0 <= shift <= 63
+    var shift: Log2Int(T) = @clz(b[hi]) - @clz(a[hi]);
+    var af = @bitCast(T, a);
+    var bf = @bitCast(T, b) << shift;
+    q = @bitCast([2]HalfT, @as(T, 0));
+
+    for (0..shift + 1) |_| {
+        q[lo] <<= 1;
+        // Branchless version of:
+        // if (a >= b) {
+        //     a -= b;
+        //     q[lo] |= 1;
         // }
-        r_all = @bitCast(DoubleInt, r);
-        const s: SignedDoubleInt = @bitCast(SignedDoubleInt, b -% r_all -% 1) >> (double_int_bits - 1);
-        carry = @intCast(u32, s & 1);
-        r_all -= b & @bitCast(DoubleInt, s);
-        r = @bitCast([2]SingleInt, r_all);
+        const s = @bitCast(SignedT, bf -% af -% 1) >> (@bitSizeOf(T) - 1);
+        q[lo] |= @intCast(HalfT, s & 1);
+        af -= bf & @bitCast(T, s);
+        bf >>= 1;
     }
-    const q_all = (@bitCast(DoubleInt, q) << 1) | carry;
     if (maybe_rem) |rem| {
-        rem.* = r_all;
+        rem.* = @bitCast(T, af);
     }
-    return q_all;
+    return @bitCast(T, q);
 }

From 59a6f482e205c2f8ee60771a3d05652fd63e11e6 Mon Sep 17 00:00:00 2001
From: Marc Tiehuis <marc@tiehu.is>
Date: Fri, 14 Apr 2023 18:29:42 +1200
Subject: [PATCH 2/2] disable udivmod asm divq on windows

---
 lib/compiler_rt/udivmod.zig | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/lib/compiler_rt/udivmod.zig b/lib/compiler_rt/udivmod.zig
index 55abade733..a83ece8ada 100644
--- a/lib/compiler_rt/udivmod.zig
+++ b/lib/compiler_rt/udivmod.zig
@@ -2,6 +2,7 @@ const std = @import("std");
 const builtin = @import("builtin");
 const is_test = builtin.is_test;
 const Log2Int = std.math.Log2Int;
+const HalveInt = @import("common.zig").HalveInt;
 
 const lo = switch (builtin.cpu.arch.endian()) {
     .Big => 1,
@@ -9,19 +10,10 @@ const lo = switch (builtin.cpu.arch.endian()) {
 };
 const hi = 1 - lo;
 
-fn HalfInt(comptime T: type) type {
-    std.debug.assert(@typeInfo(T) == .Int);
-    std.debug.assert(@bitSizeOf(T) % 2 == 0);
-    return std.meta.Int(.unsigned, @bitSizeOf(T) / 2);
-}
-
-// Performs division of a double-word specified in its single-word components. Most commonly used
-// for computing u128 bit divisions in terms of 64-bit integers.
-//
-// q = U / v
-// r = U % v
-// where  U = (u1 | u0)
+// Let _u1 and _u0 be the high and low limbs of U respectively.
+// Returns U / v_ and sets r = U % v_.
 fn divwide_generic(comptime T: type, _u1: T, _u0: T, v_: T, r: *T) T {
+    const HalfT = HalveInt(T, false).HalfT;
     @setRuntimeSafety(is_test);
     var v = v_;
 
@@ -43,11 +35,11 @@ fn divwide_generic(comptime T: type, _u1: T, _u0: T, v_: T, r: *T) T {
 
     // Break divisor up into two 32-bit digits
     const vn1 = v >> (@bitSizeOf(T) / 2);
-    const vn0 = v & std.math.maxInt(HalfInt(T));
+    const vn0 = v & std.math.maxInt(HalfT);
 
     // Break right half of dividend into two digits
     const un1 = un10 >> (@bitSizeOf(T) / 2);
-    const un0 = un10 & std.math.maxInt(HalfInt(T));
+    const un0 = un10 & std.math.maxInt(HalfT);
 
     // Compute the first quotient digit, q1
     var q1 = un64 / vn1;
@@ -79,7 +71,7 @@ fn divwide_generic(comptime T: type, _u1: T, _u0: T, v_: T, r: *T) T {
 
 fn divwide(comptime T: type, _u1: T, _u0: T, v: T, r: *T) T {
     @setRuntimeSafety(is_test);
-    if (T == u64 and builtin.target.cpu.arch == .x86_64) {
+    if (T == u64 and builtin.target.cpu.arch == .x86_64 and builtin.target.os.tag != .windows) {
         var rem: T = undefined;
         const quo = asm (
             \\divq %[v]
@@ -96,10 +88,10 @@ fn divwide(comptime T: type, _u1: T, _u0: T, v: T, r: *T) T {
     }
 }
 
-// return q = a / b, *r = a % b
+// Returns a_ / b_ and sets maybe_rem = a_ % b.
 pub fn udivmod(comptime T: type, a_: T, b_: T, maybe_rem: ?*T) T {
     @setRuntimeSafety(is_test);
-    const HalfT = HalfInt(T);
+    const HalfT = HalveInt(T, false).HalfT;
     const SignedT = std.meta.Int(.signed, @bitSizeOf(T));
 
     if (b_ > a_) {
@@ -141,8 +133,8 @@ pub fn udivmod(comptime T: type, a_: T, b_: T, maybe_rem: ?*T) T {
     for (0..shift + 1) |_| {
         q[lo] <<= 1;
         // Branchless version of:
-        // if (a >= b) {
-        //     a -= b;
+        // if (af >= bf) {
+        //     af -= bf;
         //     q[lo] |= 1;
         // }
         const s = @bitCast(SignedT, bf -% af -% 1) >> (@bitSizeOf(T) - 1);