From 53e6c719efe5073307ee58436b17ee80f574b984 Mon Sep 17 00:00:00 2001
From: Marc Tiehuis <marc@tiehu.is>
Date: Sat, 5 Feb 2022 20:36:48 +1300
Subject: [PATCH] std/math: optimize division with divisors less than a
 half-limb

This adds a new path which avoids using compiler_rt generated div
udivmod instructions in the case that a divisor is less than half the
max usize value. Two half-limb divisions are performed instead which
ensures that non-emulated division instructions are actually used. This
does not improve the udivmod code which should still be reviewed
independently of this issue.

Notably this improves the performance of the toString implementation of
non-power-of-two bases considerably.

Division performance is improved ~1000% based on some coarse testing.

The following test code is used to provide a rough comparison between
the old vs. new method.

```
const std = @import("std");
const Managed = std.math.big.int.Managed;

const allocator = std.heap.c_allocator;

fn fib(a: *Managed, n: usize) !void {
    var b = try Managed.initSet(allocator, 1);
    defer b.deinit();
    var c = try Managed.init(allocator);
    defer c.deinit();

    var i: usize = 0;
    while (i < n) : (i += 1) {
        try c.add(a.toConst(), b.toConst());

        a.swap(&b);
        b.swap(&c);
    }
}

pub fn main() !void {
    var a = try Managed.initSet(allocator, 0);
    defer a.deinit();

    try fib(&a, 1_000_000);

    // Note: Next two lines (and printed digit count) omitted on no-print version.
    const as = try a.toString(allocator, 10, .lower);
    defer allocator.free(as);

    std.debug.print("fib: digit count: {}, limb count: {}\n", .{ as.len, a.limbs.len });
}
```

```
==> time.no-print <==
limb count: 10849

________________________________________________________
Executed in   10.60 secs    fish           external
   usr time   10.44 secs    0.00 millis   10.44 secs
   sys time    0.02 secs    1.12 millis    0.02 secs

==> time.old <==
fib: digit count: 208988, limb count: 10849

________________________________________________________
Executed in   22.78 secs    fish           external
   usr time   22.43 secs    1.01 millis   22.43 secs
   sys time    0.03 secs    0.13 millis    0.03 secs

==> time.optimized <==
fib: digit count: 208988, limb count: 10849

________________________________________________________
Executed in   11.59 secs    fish           external
   usr time   11.56 secs    1.03 millis   11.56 secs
   sys time    0.03 secs    0.12 millis    0.03 secs
```

Perf data for non-optimized and optimized, verifying no udivmod is
generated by the new code.

```
$ perf report -i perf.data.old --stdio
- Total Lost Samples: 0
-
- Samples: 90K of event 'cycles:u'
- Event count (approx.): 71603695208
-
- Overhead  Command  Shared Object     Symbol
- ........  .......  ................  ...........................................
-
    52.97%  t        t                 [.] compiler_rt.udivmod.udivmod
    45.97%  t        t                 [.] std.math.big.int.Mutable.addCarry
     0.83%  t        t                 [.] main
     0.08%  t        libc-2.33.so      [.] __memmove_avx_unaligned_erms
     0.08%  t        t                 [.] __udivti3
     0.03%  t        [unknown]         [k] 0xffffffff9a0010a7
     0.02%  t        t                 [.] std.math.big.int.Managed.ensureCapacity
     0.01%  t        libc-2.33.so      [.] _int_malloc
     0.00%  t        libc-2.33.so      [.] __malloc_usable_size
     0.00%  t        libc-2.33.so      [.] _int_free
     0.00%  t        t                 [.] 0x0000000000004a80
     0.00%  t        t                 [.] std.heap.CAllocator.resize
     0.00%  t        libc-2.33.so      [.] _mid_memalign
     0.00%  t        libc-2.33.so      [.] sysmalloc
     0.00%  t        libc-2.33.so      [.] __posix_memalign
     0.00%  t        t                 [.] std.heap.CAllocator.alloc
     0.00%  t        ld-2.33.so        [.] do_lookup_x

$ perf report -i perf.data.optimized --stdio
- Total Lost Samples: 0
-
- Samples: 46K of event 'cycles:u'
- Event count (approx.): 36790112336
-
- Overhead  Command  Shared Object     Symbol
- ........  .......  ................  ...........................................
-
    79.98%  t        t                 [.] std.math.big.int.Mutable.addCarry
    15.14%  t        t                 [.] main
     4.58%  t        t                 [.] std.math.big.int.Managed.ensureCapacity
     0.21%  t        libc-2.33.so      [.] __memmove_avx_unaligned_erms
     0.05%  t        [unknown]         [k] 0xffffffff9a0010a7
     0.02%  t        libc-2.33.so      [.] _int_malloc
     0.01%  t        t                 [.] std.heap.CAllocator.alloc
     0.01%  t        libc-2.33.so      [.] __malloc_usable_size
     0.00%  t        libc-2.33.so      [.] systrim.constprop.0
     0.00%  t        libc-2.33.so      [.] _mid_memalign
     0.00%  t        t                 [.] 0x0000000000000c7d
     0.00%  t        libc-2.33.so      [.] malloc
     0.00%  t        ld-2.33.so        [.] check_match
```

Closes #10630.
---
 lib/std/math/big.zig          |  1 +
 lib/std/math/big/int.zig      | 40 +++++++++++++++++++++++++++++++++--
 lib/std/math/big/int_test.zig | 37 ++++++++++++++++++++++++++++++--
 3 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/lib/std/math/big.zig b/lib/std/math/big.zig
index e7f8a7fb34..c7fc0b17f5 100644
--- a/lib/std/math/big.zig
+++ b/lib/std/math/big.zig
@@ -7,6 +7,7 @@ pub const Limb = usize;
 const limb_info = @typeInfo(Limb).Int;
 pub const SignedLimb = std.meta.Int(.signed, limb_info.bits);
 pub const DoubleLimb = std.meta.Int(.unsigned, 2 * limb_info.bits);
+pub const HalfLimb = std.meta.Int(.unsigned, limb_info.bits / 2);
 pub const SignedDoubleLimb = std.meta.Int(.signed, 2 * limb_info.bits);
 pub const Log2Limb = std.math.Log2Int(Limb);
 
diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig
index ec0143a3d7..87a62bf66c 100644
--- a/lib/std/math/big/int.zig
+++ b/lib/std/math/big/int.zig
@@ -2,6 +2,8 @@ const std = @import("../../std.zig");
 const math = std.math;
 const Limb = std.math.big.Limb;
 const limb_bits = @typeInfo(Limb).Int.bits;
+const HalfLimb = std.math.big.HalfLimb;
+const half_limb_bits = @typeInfo(HalfLimb).Int.bits;
 const DoubleLimb = std.math.big.DoubleLimb;
 const SignedDoubleLimb = std.math.big.SignedDoubleLimb;
 const Log2Limb = std.math.big.Log2Limb;
@@ -1335,7 +1337,16 @@ pub const Mutable = struct {
         const xy_trailing = math.min(x_trailing, y_trailing);
 
         if (y.len - xy_trailing == 1) {
-            lldiv1(q.limbs, &r.limbs[0], x.limbs[xy_trailing..x.len], y.limbs[y.len - 1]);
+            const divisor = y.limbs[y.len - 1];
+
+            // Optimization for small divisor. By using a half limb we can avoid requiring DoubleLimb
+            // divisions in the hot code path. This may often require compiler_rt software-emulation.
+            if (divisor < maxInt(HalfLimb)) {
+                lldiv0p5(q.limbs, &r.limbs[0], x.limbs[xy_trailing..x.len], @intCast(HalfLimb, divisor));
+            } else {
+                lldiv1(q.limbs, &r.limbs[0], x.limbs[xy_trailing..x.len], divisor);
+            }
+
             q.normalize(x.len - xy_trailing);
             q.positive = q_positive;
 
@@ -1939,7 +1950,8 @@ pub const Const = struct {
             }
         } else {
             // Non power-of-two: batch divisions per word size.
-            const digits_per_limb = math.log(Limb, base, maxInt(Limb));
+            // We use a HalfLimb here so the division uses the faster lldiv0p5 over lldiv1 codepath.
+            const digits_per_limb = math.log(HalfLimb, base, maxInt(HalfLimb));
             var limb_base: Limb = 1;
             var j: usize = 0;
             while (j < digits_per_limb) : (j += 1) {
@@ -3208,6 +3220,30 @@ fn lldiv1(quo: []Limb, rem: *Limb, a: []const Limb, b: Limb) void {
     }
 }
 
+fn lldiv0p5(quo: []Limb, rem: *Limb, a: []const Limb, b: HalfLimb) void {
+    @setRuntimeSafety(debug_safety);
+    assert(a.len > 1 or a[0] >= b);
+    assert(quo.len >= a.len);
+
+    rem.* = 0;
+    for (a) |_, ri| {
+        const i = a.len - ri - 1;
+        const ai_high = a[i] >> half_limb_bits;
+        const ai_low = a[i] & ((1 << half_limb_bits) - 1);
+
+        // Split the division into two divisions acting on half a limb each. Carry remainder.
+        const ai_high_with_carry = (rem.* << half_limb_bits) | ai_high;
+        const ai_high_quo = ai_high_with_carry / b;
+        rem.* = ai_high_with_carry % b;
+
+        const ai_low_with_carry = (rem.* << half_limb_bits) | ai_low;
+        const ai_low_quo = ai_low_with_carry / b;
+        rem.* = ai_low_with_carry % b;
+
+        quo[i] = (ai_high_quo << half_limb_bits) | ai_low_quo;
+    }
+}
+
 fn llshl(r: []Limb, a: []const Limb, shift: usize) void {
     @setRuntimeSafety(debug_safety);
     assert(a.len >= 1);
diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig
index 4c1d12116e..70a9b97a38 100644
--- a/lib/std/math/big/int_test.zig
+++ b/lib/std/math/big/int_test.zig
@@ -1064,7 +1064,7 @@ test "big.int mulWrap large" {
     try testing.expect(b.eq(c));
 }
 
-test "big.int div single-single no rem" {
+test "big.int div single-half no rem" {
     var a = try Managed.initSet(testing.allocator, 50);
     defer a.deinit();
     var b = try Managed.initSet(testing.allocator, 5);
@@ -1080,7 +1080,7 @@ test "big.int div single-single no rem" {
     try testing.expect((try r.to(u32)) == 0);
 }
 
-test "big.int div single-single with rem" {
+test "big.int div single-half with rem" {
     var a = try Managed.initSet(testing.allocator, 49);
     defer a.deinit();
     var b = try Managed.initSet(testing.allocator, 5);
@@ -1096,6 +1096,39 @@ test "big.int div single-single with rem" {
     try testing.expect((try r.to(u32)) == 4);
 }
 
+test "big.int div single-single no rem" {
+    // assumes usize is <= 64 bits.
+    var a = try Managed.initSet(testing.allocator, 1 << 52);
+    defer a.deinit();
+    var b = try Managed.initSet(testing.allocator, 1 << 35);
+    defer b.deinit();
+
+    var q = try Managed.init(testing.allocator);
+    defer q.deinit();
+    var r = try Managed.init(testing.allocator);
+    defer r.deinit();
+    try Managed.divTrunc(&q, &r, a.toConst(), b.toConst());
+
+    try testing.expect((try q.to(u32)) == 131072);
+    try testing.expect((try r.to(u32)) == 0);
+}
+
+test "big.int div single-single with rem" {
+    var a = try Managed.initSet(testing.allocator, (1 << 52) | (1 << 33));
+    defer a.deinit();
+    var b = try Managed.initSet(testing.allocator, (1 << 35));
+    defer b.deinit();
+
+    var q = try Managed.init(testing.allocator);
+    defer q.deinit();
+    var r = try Managed.init(testing.allocator);
+    defer r.deinit();
+    try Managed.divTrunc(&q, &r, a.toConst(), b.toConst());
+
+    try testing.expect((try q.to(u64)) == 131072);
+    try testing.expect((try r.to(u64)) == 8589934592);
+}
+
 test "big.int div multi-single no rem" {
     const op1 = 0xffffeeeeddddcccc;
     const op2 = 34;