aarch64: implement mul_with_overflow for <= 32bit ints

Add emitters for `smull`, `umull` and `tst (immediate)` instructions.
2026-02-20 00:08:56 +00:00 · 2022-05-03 12:20:27 +02:00 · 2022-05-03 12:20:27 +02:00 · 8715b01005
commit 8715b01005
parent aaacda4df9
5 changed files with 115 additions and 3 deletions
--- a/src/arch/aarch64/CodeGen.zig
+++ b/src/arch/aarch64/CodeGen.zig
@ -1296,6 +1296,11 @@ fn binOpRegister(

    const dest_reg = switch (mir_tag) {
        .cmp_shifted_register => undefined, // cmp has no destination register
+        .smull, .umull => blk: {
+            // TODO can we reuse anything for smull and umull?
+            const raw_reg = try self.register_manager.allocReg(null);
+            break :blk raw_reg.to64();
+        },
        else => if (maybe_inst) |inst| blk: {
            const bin_op = self.air.instructions.items(.data)[inst].bin_op;

@ -1335,6 +1340,8 @@ fn binOpRegister(
            .shift = .lsl,
        } },
        .mul,
+        .smull,
+        .umull,
        .lsl_register,
        .asr_register,
        .lsr_register,
@ -1883,8 +1890,69 @@ fn airOverflow(self: *Self, inst: Air.Inst.Index) !void {
 }

 fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
-    _ = inst;
-    return self.fail("TODO implement airMulWithOverflow for {}", .{self.target.cpu.arch});
+    const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
+    const extra = self.air.extraData(Air.Bin, ty_pl.payload).data;
+    if (self.liveness.isUnused(inst)) return self.finishAir(inst, .dead, .{ extra.lhs, extra.rhs, .none });
+    const result: MCValue = result: {
+        const lhs = try self.resolveInst(extra.lhs);
+        const rhs = try self.resolveInst(extra.rhs);
+        const lhs_ty = self.air.typeOf(extra.lhs);
+        const rhs_ty = self.air.typeOf(extra.rhs);
+
+        const tuple_ty = self.air.typeOfIndex(inst);
+        const tuple_size = @intCast(u32, tuple_ty.abiSize(self.target.*));
+        const tuple_align = tuple_ty.abiAlignment(self.target.*);
+        const overflow_bit_offset = @intCast(u32, tuple_ty.structFieldOffset(1, self.target.*));
+
+        switch (lhs_ty.zigTypeTag()) {
+            .Vector => return self.fail("TODO implement mul_with_overflow for vectors", .{}),
+            .Int => {
+                const int_info = lhs_ty.intInfo(self.target.*);
+
+                if (int_info.bits <= 32) {
+                    const stack_offset = try self.allocMem(inst, tuple_size, tuple_align);
+
+                    try self.spillCompareFlagsIfOccupied();
+                    self.compare_flags_inst = null;
+
+                    const base_tag: Mir.Inst.Tag = switch (int_info.signedness) {
+                        .signed => .smull,
+                        .unsigned => .umull,
+                    };
+
+                    const dest = try self.binOpRegister(base_tag, null, lhs, rhs, lhs_ty, rhs_ty);
+                    const dest_reg = dest.register;
+                    self.register_manager.freezeRegs(&.{dest_reg});
+                    defer self.register_manager.unfreezeRegs(&.{dest_reg});
+
+                    const truncated_reg = try self.register_manager.allocReg(null);
+                    self.register_manager.freezeRegs(&.{truncated_reg});
+                    defer self.register_manager.unfreezeRegs(&.{truncated_reg});
+
+                    try self.truncRegister(dest_reg, truncated_reg, int_info.signedness, int_info.bits);
+                    _ = try self.binOp(
+                        .cmp_eq,
+                        null,
+                        dest,
+                        .{ .register = truncated_reg },
+                        Type.usize,
+                        Type.usize,
+                    );
+
+                    try self.genSetStack(lhs_ty, stack_offset, .{ .register = truncated_reg });
+                    try self.genSetStack(Type.initTag(.u1), stack_offset - overflow_bit_offset, .{
+                        .compare_flags_unsigned = .neq,
+                    });
+
+                    break :result MCValue{ .stack_offset = stack_offset };
+                } else if (int_info.bits <= 64) {
+                    return self.fail("TODO implement mul_with_overflow for ints", .{});
+                } else return self.fail("TODO implmenet mul_with_overflow for integers > u64/i64", .{});
+            },
+            else => unreachable,
+        }
+    };
+    return self.finishAir(inst, result, .{ extra.lhs, extra.rhs, .none });
 }

 fn airShlWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
--- a/src/arch/aarch64/Emit.zig
+++ b/src/arch/aarch64/Emit.zig
@ -106,6 +106,7 @@ pub fn emitMir(
            .call_extern => try emit.mirCallExtern(inst),

            .eor_immediate => try emit.mirLogicalImmediate(inst),
+            .tst_immediate => try emit.mirLogicalImmediate(inst),

            .add_shifted_register => try emit.mirAddSubtractShiftedRegister(inst),
            .adds_shifted_register => try emit.mirAddSubtractShiftedRegister(inst),
@ -166,6 +167,8 @@ pub fn emitMir(
            .movz => try emit.mirMoveWideImmediate(inst),

            .mul => try emit.mirDataProcessing3Source(inst),
+            .smull => try emit.mirDataProcessing3Source(inst),
+            .umull => try emit.mirDataProcessing3Source(inst),

            .nop => try emit.mirNop(),

@ -674,6 +677,7 @@ fn mirLogicalImmediate(emit: *Emit, inst: Mir.Inst.Index) !void {

    switch (tag) {
        .eor_immediate => try emit.writeInstruction(Instruction.eorImmediate(rd, rn, imms, immr, n)),
+        .tst_immediate => try emit.writeInstruction(Instruction.tstImmediate(rn, imms, immr, n)),
        else => unreachable,
    }
 }
@ -1000,6 +1004,8 @@ fn mirDataProcessing3Source(emit: *Emit, inst: Mir.Inst.Index) !void {

    switch (tag) {
        .mul => try emit.writeInstruction(Instruction.mul(rrr.rd, rrr.rn, rrr.rm)),
+        .smull => try emit.writeInstruction(Instruction.smull(rrr.rd, rrr.rn, rrr.rm)),
+        .umull => try emit.writeInstruction(Instruction.umull(rrr.rd, rrr.rn, rrr.rm)),
        else => unreachable,
    }
 }
--- a/src/arch/aarch64/Mir.zig
+++ b/src/arch/aarch64/Mir.zig
@ -146,6 +146,8 @@ pub const Inst = struct {
        ret,
        /// Signed bitfield extract
        sbfx,
+        /// Signed multiply long
+        smull,
        /// Signed extend byte
        sxtb,
        /// Signed extend halfword
@ -182,8 +184,12 @@ pub const Inst = struct {
        subs_shifted_register,
        /// Supervisor Call
        svc,
+        /// Test bits (immediate)
+        tst_immediate,
        /// Unsigned bitfield extract
        ubfx,
+        /// Unsigned multiply long
+        umull,
        /// Unsigned extend byte
        uxtb,
        /// Unsigned extend halfword
--- a/src/arch/aarch64/bits.zig
+++ b/src/arch/aarch64/bits.zig
@ -1409,6 +1409,10 @@ pub const Instruction = union(enum) {
        return logicalImmediate(0b11, rd, rn, imms, immr, n);
    }

+    /// Encodes `tst rn, #imm` as an `ands` with a logical immediate whose
+    /// destination is the zero register `xzr`.
+    pub fn tstImmediate(rn: Register, imms: u6, immr: u6, n: u1) Instruction {
+        const zero_dest: Register = .xzr;
+        return andsImmediate(zero_dest, rn, imms, immr, n);
+    }
+
    // Bitfield

    pub fn sbfm(rd: Register, rn: Register, immr: u6, imms: u6) Instruction {
@ -1564,6 +1568,15 @@ pub const Instruction = union(enum) {
        return dataProcessing3Source(0b00, 0b000, 0b0, rd, rn, rm, ra);
    }

+    /// Encodes `smaddl rd, rn, rm, ra` (signed multiply-add long).
+    ///
+    /// Asserts that `rd` is a 64-bit register, the same check `umaddl` makes.
+    pub fn smaddl(rd: Register, rn: Register, rm: Register, ra: Register) Instruction {
+        // The long multiply-add forms write a 64-bit result, so a 32-bit
+        // destination is rejected here instead of being passed to the encoder.
+        assert(rd.size() == 64);
+        return dataProcessing3Source(0b00, 0b001, 0b0, rd, rn, rm, ra);
+    }
+
+    /// Encodes `umaddl rd, rn, rm, ra` (unsigned multiply-add long).
+    ///
+    /// Asserts that `rd` is a 64-bit register.
+    pub fn umaddl(rd: Register, rn: Register, rm: Register, ra: Register) Instruction {
+        const dest_bits = rd.size();
+        assert(dest_bits == 64);
+        // 0b101 selects the unsigned long form; `smaddl` passes 0b001 here.
+        return dataProcessing3Source(0b00, 0b101, 0b0, rd, rn, rm, ra);
+    }
+
    pub fn msub(rd: Register, rn: Register, rm: Register, ra: Register) Instruction {
        return dataProcessing3Source(0b00, 0b000, 0b1, rd, rn, rm, ra);
    }
@ -1572,6 +1585,14 @@ pub const Instruction = union(enum) {
        return madd(rd, rn, rm, .xzr);
    }

+    /// Encodes `smull rd, rn, rm` (signed multiply long) as `smaddl` with
+    /// the zero register `xzr` as the addend.
+    pub fn smull(rd: Register, rn: Register, rm: Register) Instruction {
+        const addend: Register = .xzr;
+        return smaddl(rd, rn, rm, addend);
+    }
+
+    /// Encodes `umull rd, rn, rm` (unsigned multiply long) as `umaddl` with
+    /// the zero register `xzr` as the addend.
+    pub fn umull(rd: Register, rn: Register, rm: Register) Instruction {
+        const addend: Register = .xzr;
+        return umaddl(rd, rn, rm, addend);
+    }
+
    pub fn mneg(rd: Register, rn: Register, rm: Register) Instruction {
        return msub(rd, rn, rm, .xzr);
    }
@ -1790,6 +1811,18 @@ test "serialize instructions" {
            .inst = Instruction.lsrImmediate(.x4, .x2, 63),
            .expected = 0b1_10_100110_1_111111_111111_00010_00100,
        },
+        .{ // umull x0, w0, w1
+            .inst = Instruction.umull(.x0, .w0, .w1),
+            .expected = 0b1_00_11011_1_01_00001_0_11111_00000_00000,
+        },
+        .{ // smull x0, w0, w1
+            .inst = Instruction.smull(.x0, .w0, .w1),
+            .expected = 0b1_00_11011_0_01_00001_0_11111_00000_00000,
+        },
+        .{ // tst x0, #0xffffffff00000000
+            .inst = Instruction.tstImmediate(.x0, 0b011111, 0b100000, 0b1),
+            .expected = 0b1_11_100100_1_100000_011111_00000_11111,
+        },
    };

    for (testcases) |case| {
--- a/test/behavior/math.zig
+++ b/test/behavior/math.zig
@ -666,7 +666,6 @@ test "small int addition" {

 test "@mulWithOverflow" {
    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO

    var result: u8 = undefined;
    try expect(@mulWithOverflow(u8, 86, 3, &result));