From 4a5e75245bc60f76a82bb6bcb70553bec7d7c4f2 Mon Sep 17 00:00:00 2001
From: Jakub Konka <kubkon@jakubkonka.com>
Date: Fri, 21 Jan 2022 16:04:26 +0100
Subject: [PATCH 1/4] stage2: clean up preserving callee regs on the stack

Instead of using `push` and `pop` combo, we now re-use our stack
allocation mechanism which means we don't have to worry about
16-byte stack adjustments on macOS as it is handled automatically
for us. Another benefit is that we don't have to backpatch stack
offsets when pulling args from the stack.
---
 src/arch/x86_64/CodeGen.zig  | 104 ++++++++++++++++-------------------
 src/arch/x86_64/Emit.zig     |  34 ++++++------
 src/arch/x86_64/Mir.zig      |   7 ++-
 src/arch/x86_64/PrintMir.zig |  40 +++++++-------
 4 files changed, 90 insertions(+), 95 deletions(-)

diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 522da6e25d..c204bcbad1 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -61,8 +61,6 @@ end_di_column: u32,
 /// which is a relative jump, based on the address following the reloc.
 exitlude_jump_relocs: std.ArrayListUnmanaged(Mir.Inst.Index) = .{},
 
-stack_args_relocs: std.ArrayListUnmanaged(Mir.Inst.Index) = .{},
-
 /// Whenever there is a runtime branch, we push a Branch onto this stack,
 /// and pop it off when the runtime branch joins. This provides an "overlay"
 /// of the table of mappings from instructions to `MCValue` from within the branch.
@@ -286,7 +284,6 @@ pub fn generate(
     defer function.exitlude_jump_relocs.deinit(bin_file.allocator);
     defer function.mir_instructions.deinit(bin_file.allocator);
     defer function.mir_extra.deinit(bin_file.allocator);
-    defer function.stack_args_relocs.deinit(bin_file.allocator);
     defer if (builtin.mode == .Debug) function.mir_to_air_map.deinit();
 
     var call_info = function.resolveCallingConventionValues(fn_type) catch |err| switch (err) {
@@ -378,13 +375,6 @@ pub fn addExtraAssumeCapacity(self: *Self, extra: anytype) u32 {
 fn gen(self: *Self) InnerError!void {
     const cc = self.fn_type.fnCallingConvention();
     if (cc != .Naked) {
-        // push the callee_preserved_regs that were used
-        const backpatch_push_callee_preserved_regs_i = try self.addInst(.{
-            .tag = .push_regs_from_callee_preserved_regs,
-            .ops = undefined,
-            .data = .{ .regs_to_push_or_pop = undefined }, // to be backpatched
-        });
-
         _ = try self.addInst(.{
             .tag = .push,
             .ops = (Mir.Ops{
@@ -416,6 +406,15 @@ fn gen(self: *Self) InnerError!void {
             .data = undefined,
         });
 
+        // push the callee_preserved_regs that were used
+        const backpatch_push_callee_preserved_regs_i = try self.addInst(.{
+            .tag = .push_regs_from_callee_preserved_regs,
+            .ops = (Mir.Ops{
+                .reg1 = .rbp,
+            }).encode(),
+            .data = .{ .payload = undefined }, // to be backpatched
+        });
+
         try self.genBody(self.air.getMainBody());
 
         // TODO can single exitlude jump reloc be elided? What if it is not at the end of the code?
@@ -429,6 +428,33 @@ fn gen(self: *Self) InnerError!void {
             self.mir_instructions.items(.data)[jmp_reloc].inst = @intCast(u32, self.mir_instructions.len);
         }
 
+        // calculate the data for callee_preserved_regs to be pushed and popped
+        const callee_preserved_regs_payload = blk: {
+            var data = Mir.RegsToPushOrPop{
+                .regs = 0,
+                .disp = mem.alignForwardGeneric(u32, self.next_stack_offset, 8),
+            };
+            inline for (callee_preserved_regs) |reg, i| {
+                if (self.register_manager.isRegAllocated(reg)) {
+                    data.regs |= 1 << @intCast(u5, i);
+                    self.max_end_stack += 8;
+                }
+            }
+            break :blk try self.addExtra(data);
+        };
+
+        const data = self.mir_instructions.items(.data);
+        // backpatch the push instruction
+        data[backpatch_push_callee_preserved_regs_i].payload = callee_preserved_regs_payload;
+        // pop the callee_preserved_regs
+        _ = try self.addInst(.{
+            .tag = .pop_regs_from_callee_preserved_regs,
+            .ops = (Mir.Ops{
+                .reg1 = .rbp,
+            }).encode(),
+            .data = .{ .payload = callee_preserved_regs_payload },
+        });
+
         _ = try self.addInst(.{
             .tag = .dbg_epilogue_begin,
             .ops = undefined,
@@ -450,34 +476,6 @@ fn gen(self: *Self) InnerError!void {
             .data = undefined,
         });
 
-        // calculate the data for callee_preserved_regs to be pushed and popped
-        var callee_preserved_regs_push_data: u32 = 0x0;
-        // TODO this is required on macOS since macOS actively checks for stack alignment
-        // at every extern call site. As far as I can tell, macOS accounts for the typical
-        // function prologue first 2 instructions of:
-        // ...
-        // push rbp
-        // mov rsp, rbp
-        // ...
-        // Thus we don't need to adjust the stack for the first push instruction. However,
-        // any subsequent push of values on the stack such as when preserving registers,
-        // needs to be taken into account here.
-        var stack_adjustment: u32 = 0;
-        inline for (callee_preserved_regs) |reg, i| {
-            if (self.register_manager.isRegAllocated(reg)) {
-                callee_preserved_regs_push_data |= 1 << @intCast(u5, i);
-                stack_adjustment += @divExact(reg.size(), 8);
-            }
-        }
-        const data = self.mir_instructions.items(.data);
-        // backpatch the push instruction
-        data[backpatch_push_callee_preserved_regs_i].regs_to_push_or_pop = callee_preserved_regs_push_data;
-        // pop the callee_preserved_regs
-        _ = try self.addInst(.{
-            .tag = .pop_regs_from_callee_preserved_regs,
-            .ops = undefined,
-            .data = .{ .regs_to_push_or_pop = callee_preserved_regs_push_data },
-        });
         _ = try self.addInst(.{
             .tag = .ret,
             .ops = (Mir.Ops{
@@ -488,36 +486,28 @@ fn gen(self: *Self) InnerError!void {
 
         // Adjust the stack
         const stack_end = self.max_end_stack;
-        if (stack_end > math.maxInt(i32) - stack_adjustment) {
+        if (stack_end > math.maxInt(i32)) {
             return self.failSymbol("too much stack used in call parameters", .{});
         }
         // TODO we should reuse this mechanism to align the stack when calling any function even if
         // we do not pass any args on the stack BUT we still push regs to stack with `push` inst.
         const aligned_stack_end = @intCast(u32, mem.alignForward(stack_end, self.stack_align));
-        if (aligned_stack_end > 0 or (stack_adjustment > 0 and self.target.isDarwin())) {
-            const imm = if (self.target.isDarwin()) aligned_stack_end + stack_adjustment else aligned_stack_end;
+        if (aligned_stack_end > 0) {
             self.mir_instructions.set(backpatch_stack_sub, .{
                 .tag = .sub,
                 .ops = (Mir.Ops{
                     .reg1 = .rsp,
                 }).encode(),
-                .data = .{ .imm = imm },
+                .data = .{ .imm = aligned_stack_end },
             });
             self.mir_instructions.set(backpatch_stack_add, .{
                 .tag = .add,
                 .ops = (Mir.Ops{
                     .reg1 = .rsp,
                 }).encode(),
-                .data = .{ .imm = imm },
+                .data = .{ .imm = aligned_stack_end },
             });
         }
-        while (self.stack_args_relocs.popOrNull()) |index| {
-            // TODO like above, gotta figure out the alignment shenanigans for macOS, etc.
-            const adjustment = if (self.target.isDarwin()) 2 * stack_adjustment else stack_adjustment;
-            // +16 bytes to account for saved return address of the `call` instruction and
-            // `push rbp`.
-            self.mir_instructions.items(.data)[index].imm += adjustment + aligned_stack_end + 16;
-        }
     } else {
         _ = try self.addInst(.{
             .tag = .dbg_prologue_end,
@@ -2194,16 +2184,15 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
 
                 if (abi_size <= 8) {
                     const reg = try self.register_manager.allocReg(inst, &.{});
-                    const reloc = try self.addInst(.{
+                    _ = try self.addInst(.{
                         .tag = .mov,
                         .ops = (Mir.Ops{
                             .reg1 = registerAlias(reg, @intCast(u32, abi_size)),
-                            .reg2 = .rsp,
+                            .reg2 = .rbp,
                             .flags = 0b01,
                         }).encode(),
-                        .data = .{ .imm = off },
+                        .data = .{ .imm = off + 16 },
                     });
-                    try self.stack_args_relocs.append(self.bin_file.allocator, reloc);
                     break :blk .{ .register = reg };
                 }
 
@@ -2217,15 +2206,14 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
                 try self.register_manager.getReg(.rax, null);
                 try self.register_manager.getReg(.rcx, null);
 
-                const reloc = try self.addInst(.{
+                _ = try self.addInst(.{
                     .tag = .lea,
                     .ops = (Mir.Ops{
                         .reg1 = addr_reg.to64(),
-                        .reg2 = .rsp,
+                        .reg2 = .rbp,
                     }).encode(),
-                    .data = .{ .imm = off },
+                    .data = .{ .imm = off + 16 },
                 });
-                try self.stack_args_relocs.append(self.bin_file.allocator, reloc);
 
                 // TODO allow for abi_size to be u64
                 try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
diff --git a/src/arch/x86_64/Emit.zig b/src/arch/x86_64/Emit.zig
index 4ec80dd1ba..caaddb2e73 100644
--- a/src/arch/x86_64/Emit.zig
+++ b/src/arch/x86_64/Emit.zig
@@ -251,23 +251,25 @@ fn mirPushPop(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
     }
 }
 fn mirPushPopRegsFromCalleePreservedRegs(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
-    const callee_preserved_regs = bits.callee_preserved_regs;
-    const regs = emit.mir.instructions.items(.data)[inst].regs_to_push_or_pop;
-    if (tag == .push) {
-        for (callee_preserved_regs) |reg, i| {
-            if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
-            lowerToOEnc(.push, reg, emit.code) catch |err|
-                return emit.failWithLoweringError(err);
-        }
-    } else {
-        // pop in the reverse direction
-        var i = callee_preserved_regs.len;
-        while (i > 0) : (i -= 1) {
-            const reg = callee_preserved_regs[i - 1];
-            if ((regs >> @intCast(u5, i - 1)) & 1 == 0) continue;
-            lowerToOEnc(.pop, reg, emit.code) catch |err|
-                return emit.failWithLoweringError(err);
+    const ops = Mir.Ops.decode(emit.mir.instructions.items(.ops)[inst]);
+    const payload = emit.mir.instructions.items(.data)[inst].payload;
+    const data = emit.mir.extraData(Mir.RegsToPushOrPop, payload).data;
+    const regs = data.regs;
+    var disp: u32 = data.disp + 8;
+    for (bits.callee_preserved_regs) |reg, i| {
+        if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
+        if (tag == .push) {
+            lowerToMrEnc(.mov, RegisterOrMemory.mem(.qword_ptr, .{
+                .disp = @bitCast(u32, -@intCast(i32, disp)),
+                .base = ops.reg1,
+            }), reg.to64(), emit.code) catch |err| return emit.failWithLoweringError(err);
+        } else {
+            lowerToRmEnc(.mov, reg.to64(), RegisterOrMemory.mem(.qword_ptr, .{
+                .disp = @bitCast(u32, -@intCast(i32, disp)),
+                .base = ops.reg1,
+            }), emit.code) catch |err| return emit.failWithLoweringError(err);
         }
+        disp += 8;
     }
 }
 
diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig
index 7bab6ce39b..e3b79a8329 100644
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@@ -333,8 +333,6 @@ pub const Inst = struct {
         got_entry: u32,
         /// Index into `extra`. Meaning of what can be found there is context-dependent.
         payload: u32,
-        /// A bitfield of which callee_preserved_regs to push
-        regs_to_push_or_pop: u32,
     };
 
     // Make sure we don't accidentally make instructions bigger than expected.
@@ -346,6 +344,11 @@ pub const Inst = struct {
     }
 };
 
+pub const RegsToPushOrPop = struct {
+    regs: u32,
+    disp: u32,
+};
+
 pub const ImmPair = struct {
     dest_off: u32,
     operand: u32,
diff --git a/src/arch/x86_64/PrintMir.zig b/src/arch/x86_64/PrintMir.zig
index 67ebb2aa58..a7a4666f77 100644
--- a/src/arch/x86_64/PrintMir.zig
+++ b/src/arch/x86_64/PrintMir.zig
@@ -180,26 +180,28 @@ fn mirPushPop(print: *const Print, tag: Mir.Inst.Tag, inst: Mir.Inst.Index, w: a
     try w.writeByte('\n');
 }
 fn mirPushPopRegsFromCalleePreservedRegs(print: *const Print, tag: Mir.Inst.Tag, inst: Mir.Inst.Index, w: anytype) !void {
-    const callee_preserved_regs = bits.callee_preserved_regs;
-    // PUSH/POP reg
-
-    const regs = print.mir.instructions.items(.data)[inst].regs_to_push_or_pop;
-    if (regs == 0) return w.writeAll("push/pop no regs from callee_preserved_regs\n");
-    if (tag == .push) {
-        try w.writeAll("push ");
-        for (callee_preserved_regs) |reg, i| {
-            if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
-            try w.print("{s}, ", .{@tagName(reg)});
-        }
-    } else {
-        // pop in the reverse direction
-        var i = callee_preserved_regs.len;
-        try w.writeAll("pop ");
-        while (i > 0) : (i -= 1) {
-            if ((regs >> @intCast(u5, i - 1)) & 1 == 0) continue;
-            const reg = callee_preserved_regs[i - 1];
-            try w.print("{s}, ", .{@tagName(reg)});
+    const ops = Mir.Ops.decode(print.mir.instructions.items(.ops)[inst]);
+    const payload = print.mir.instructions.items(.data)[inst].payload;
+    const data = print.mir.extraData(Mir.RegsToPushOrPop, payload).data;
+    const regs = data.regs;
+    var disp: u32 = data.disp + 8;
+    if (regs == 0) return w.writeAll("no regs from callee_preserved_regs\n");
+    for (bits.callee_preserved_regs) |reg, i| {
+        if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
+        if (tag == .push) {
+            try w.print("mov qword ptr [{s} + {d}], {s}", .{
+                @tagName(ops.reg1),
+                @bitCast(u32, -@intCast(i32, disp)),
+                @tagName(reg.to64()),
+            });
+        } else {
+            try w.print("mov {s}, qword ptr [{s} + {d}]", .{
+                @tagName(reg.to64()),
+                @tagName(ops.reg1),
+                @bitCast(u32, -@intCast(i32, disp)),
+            });
         }
+        disp += 8;
     }
     try w.writeByte('\n');
 }

From 5fb921539b47693f9b28fb414009395e0c7332d5 Mon Sep 17 00:00:00 2001
From: Jakub Konka <kubkon@jakubkonka.com>
Date: Fri, 21 Jan 2022 17:43:20 +0100
Subject: [PATCH 2/4] stage2: do not copy args passed via stack into functions
 locals

---
 src/arch/x86_64/CodeGen.zig | 262 +++++++++++++++---------------------
 1 file changed, 108 insertions(+), 154 deletions(-)

diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index c204bcbad1..36696fac69 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -43,7 +43,7 @@ err_msg: ?*ErrorMsg,
 args: []MCValue,
 ret_mcv: MCValue,
 fn_type: Type,
-arg_index: usize,
+arg_index: u32,
 src_loc: Module.SrcLoc,
 stack_align: u32,
 
@@ -117,9 +117,9 @@ pub const MCValue = union(enum) {
     memory: u64,
     /// The value is one of the stack variables.
     /// If the type is a pointer, it means the pointer address is in the stack at this offset.
-    stack_offset: u32,
+    stack_offset: i32,
     /// The value is a pointer to one of the stack variables (payload is stack offset).
-    ptr_stack_offset: u32,
+    ptr_stack_offset: i32,
     /// The value is in the compare flags assuming an unsigned operation,
     /// with this operator applied on top of it.
     compare_flags_unsigned: math.CompareOperator,
@@ -485,13 +485,12 @@ fn gen(self: *Self) InnerError!void {
         });
 
         // Adjust the stack
-        const stack_end = self.max_end_stack;
-        if (stack_end > math.maxInt(i32)) {
+        if (self.max_end_stack > math.maxInt(i32)) {
             return self.failSymbol("too much stack used in call parameters", .{});
         }
         // TODO we should reuse this mechanism to align the stack when calling any function even if
         // we do not pass any args on the stack BUT we still push regs to stack with `push` inst.
-        const aligned_stack_end = @intCast(u32, mem.alignForward(stack_end, self.stack_align));
+        const aligned_stack_end = @intCast(u32, mem.alignForward(self.max_end_stack, self.stack_align));
         if (aligned_stack_end > 0) {
             self.mir_instructions.set(backpatch_stack_sub, .{
                 .tag = .sub,
@@ -798,7 +797,7 @@ fn allocRegOrMem(self: *Self, inst: Air.Inst.Index, reg_ok: bool) !MCValue {
         }
     }
     const stack_offset = try self.allocMem(inst, abi_size, abi_align);
-    return MCValue{ .stack_offset = stack_offset };
+    return MCValue{ .stack_offset = @intCast(i32, stack_offset) };
 }
 
 pub fn spillInstruction(self: *Self, reg: Register, inst: Air.Inst.Index) !void {
@@ -844,12 +843,12 @@ fn copyToNewRegisterWithExceptions(
 
 fn airAlloc(self: *Self, inst: Air.Inst.Index) !void {
     const stack_offset = try self.allocMemPtr(inst);
-    return self.finishAir(inst, .{ .ptr_stack_offset = stack_offset }, .{ .none, .none, .none });
+    return self.finishAir(inst, .{ .ptr_stack_offset = @intCast(i32, stack_offset) }, .{ .none, .none, .none });
 }
 
 fn airRetPtr(self: *Self, inst: Air.Inst.Index) !void {
     const stack_offset = try self.allocMemPtr(inst);
-    return self.finishAir(inst, .{ .ptr_stack_offset = stack_offset }, .{ .none, .none, .none });
+    return self.finishAir(inst, .{ .ptr_stack_offset = @intCast(i32, stack_offset) }, .{ .none, .none, .none });
 }
 
 fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
@@ -1409,7 +1408,7 @@ fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) !void {
                         .reg1 = addr_reg.to64(),
                         .reg2 = .rbp,
                     }).encode(),
-                    .data = .{ .imm = @bitCast(u32, -@intCast(i32, off + array_abi_size)) },
+                    .data = .{ .imm = @bitCast(u32, -(off + @intCast(i32, array_abi_size))) },
                 });
             },
             else => return self.fail("TODO implement array_elem_val when array is {}", .{array}),
@@ -1613,7 +1612,7 @@ fn load(self: *Self, dst_mcv: MCValue, ptr: MCValue, ptr_ty: Type) InnerError!vo
                     try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
 
                     return self.genInlineMemcpy(
-                        @bitCast(u32, -@intCast(i32, off + abi_size)),
+                        -(off + @intCast(i32, abi_size)),
                         .rbp,
                         registerAlias(addr_reg, @divExact(reg.size(), 8)),
                         count_reg.to64(),
@@ -1770,10 +1769,10 @@ fn structFieldPtr(self: *Self, inst: Air.Inst.Index, operand: Air.Inst.Ref, inde
     return if (self.liveness.isUnused(inst)) .dead else result: {
         const mcv = try self.resolveInst(operand);
         const struct_ty = self.air.typeOf(operand).childType();
-        const struct_size = @intCast(u32, struct_ty.abiSize(self.target.*));
-        const struct_field_offset = @intCast(u32, struct_ty.structFieldOffset(index, self.target.*));
+        const struct_size = @intCast(i32, struct_ty.abiSize(self.target.*));
+        const struct_field_offset = @intCast(i32, struct_ty.structFieldOffset(index, self.target.*));
         const struct_field_ty = struct_ty.structFieldType(index);
-        const struct_field_size = @intCast(u32, struct_field_ty.abiSize(self.target.*));
+        const struct_field_size = @intCast(i32, struct_field_ty.abiSize(self.target.*));
 
         switch (mcv) {
             .ptr_stack_offset => |off| {
@@ -1793,10 +1792,10 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
     const result: MCValue = if (self.liveness.isUnused(inst)) .dead else result: {
         const mcv = try self.resolveInst(operand);
         const struct_ty = self.air.typeOf(operand);
-        const struct_size = @intCast(u32, struct_ty.abiSize(self.target.*));
-        const struct_field_offset = @intCast(u32, struct_ty.structFieldOffset(index, self.target.*));
+        const struct_size = @intCast(i32, struct_ty.abiSize(self.target.*));
+        const struct_field_offset = @intCast(i32, struct_ty.structFieldOffset(index, self.target.*));
         const struct_field_ty = struct_ty.structFieldType(index);
-        const struct_field_size = @intCast(u32, struct_field_ty.abiSize(self.target.*));
+        const struct_field_size = @intCast(i32, struct_field_ty.abiSize(self.target.*));
 
         switch (mcv) {
             .stack_offset => |off| {
@@ -1960,7 +1959,7 @@ fn genBinMathOpMir(
                         return self.fail("stack offset too large", .{});
                     }
                     const abi_size = dst_ty.abiSize(self.target.*);
-                    const adj_off = off + abi_size;
+                    const adj_off = off + @intCast(i32, abi_size);
                     _ = try self.addInst(.{
                         .tag = mir_tag,
                         .ops = (Mir.Ops{
@@ -1968,7 +1967,7 @@ fn genBinMathOpMir(
                             .reg2 = .rbp,
                             .flags = 0b01,
                         }).encode(),
-                        .data = .{ .imm = @bitCast(u32, -@intCast(i32, adj_off)) },
+                        .data = .{ .imm = @bitCast(u32, -adj_off) },
                     });
                 },
                 .compare_flags_unsigned => {
@@ -1987,7 +1986,7 @@ fn genBinMathOpMir(
             if (abi_size > 8) {
                 return self.fail("TODO implement ADD/SUB/CMP for stack dst with large ABI", .{});
             }
-            const adj_off = off + abi_size;
+            const adj_off = off + @intCast(i32, abi_size);
 
             switch (src_mcv) {
                 .none => unreachable,
@@ -2003,7 +2002,7 @@ fn genBinMathOpMir(
                             .reg2 = registerAlias(src_reg, @intCast(u32, abi_size)),
                             .flags = 0b10,
                         }).encode(),
-                        .data = .{ .imm = @bitCast(u32, -@intCast(i32, adj_off)) },
+                        .data = .{ .imm = @bitCast(u32, -adj_off) },
                     });
                 },
                 .immediate => |imm| {
@@ -2024,7 +2023,7 @@ fn genBinMathOpMir(
                         else => unreachable,
                     };
                     const payload = try self.addExtra(Mir.ImmPair{
-                        .dest_off = @bitCast(u32, -@intCast(i32, adj_off)),
+                        .dest_off = @bitCast(u32, -adj_off),
                         .operand = @truncate(u32, imm),
                     });
                     _ = try self.addInst(.{
@@ -2162,7 +2161,7 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
     const mcv = self.args[arg_index];
     const payload = try self.addExtra(Mir.ArgDbgInfo{
         .air_inst = inst,
-        .arg_index = @truncate(u32, arg_index), // TODO can arg_index: u32?
+        .arg_index = arg_index,
     });
     _ = try self.addInst(.{
         .tag = .arg_dbg_info,
@@ -2178,56 +2177,13 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
                 self.register_manager.getRegAssumeFree(reg.to64(), inst);
                 break :blk mcv;
             },
-            .stack_offset => |off| {
+            .stack_offset => {
                 const ty = self.air.typeOfIndex(inst);
                 const abi_size = ty.abiSize(self.target.*);
-
-                if (abi_size <= 8) {
-                    const reg = try self.register_manager.allocReg(inst, &.{});
-                    _ = try self.addInst(.{
-                        .tag = .mov,
-                        .ops = (Mir.Ops{
-                            .reg1 = registerAlias(reg, @intCast(u32, abi_size)),
-                            .reg2 = .rbp,
-                            .flags = 0b01,
-                        }).encode(),
-                        .data = .{ .imm = off + 16 },
-                    });
-                    break :blk .{ .register = reg };
-                }
-
-                // TODO copy ellision
-                const dst_mcv = try self.allocRegOrMem(inst, false);
-                const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
-                const addr_reg = regs[0];
-                const count_reg = regs[1];
-                const tmp_reg = regs[2];
-
-                try self.register_manager.getReg(.rax, null);
-                try self.register_manager.getReg(.rcx, null);
-
-                _ = try self.addInst(.{
-                    .tag = .lea,
-                    .ops = (Mir.Ops{
-                        .reg1 = addr_reg.to64(),
-                        .reg2 = .rbp,
-                    }).encode(),
-                    .data = .{ .imm = off + 16 },
-                });
-
-                // TODO allow for abi_size to be u64
-                try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
-                try self.genInlineMemcpy(
-                    @bitCast(u32, -@intCast(i32, dst_mcv.stack_offset + abi_size)),
-                    .rbp,
-                    addr_reg.to64(),
-                    count_reg.to64(),
-                    tmp_reg.to8(),
-                );
-
-                break :blk dst_mcv;
+                const off = @intCast(i32, (arg_index + 1) * abi_size) + 16;
+                break :blk MCValue{ .stack_offset = -off };
             },
-            else => unreachable,
+            else => return self.fail("TODO implement arg for {}", .{mcv}),
         }
     };
 
@@ -2252,64 +2208,6 @@ fn airFence(self: *Self) !void {
     //return self.finishAirBookkeeping();
 }
 
-fn genSetStackArg(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerError!void {
-    const abi_size = ty.abiSize(self.target.*);
-    switch (mcv) {
-        .dead => unreachable,
-        .ptr_embedded_in_code => unreachable,
-        .unreach, .none => return,
-        .register => |reg| {
-            _ = try self.addInst(.{
-                .tag = .mov,
-                .ops = (Mir.Ops{
-                    .reg1 = .rsp,
-                    .reg2 = registerAlias(reg, @intCast(u32, abi_size)),
-                    .flags = 0b10,
-                }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, stack_offset + abi_size)) },
-            });
-        },
-        .ptr_stack_offset => {
-            const reg = try self.copyToTmpRegister(ty, mcv);
-            return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
-        },
-        .stack_offset => |unadjusted_off| {
-            if (abi_size <= 8) {
-                const reg = try self.copyToTmpRegister(ty, mcv);
-                return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
-            }
-
-            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
-            const addr_reg = regs[0];
-            const count_reg = regs[1];
-            const tmp_reg = regs[2];
-
-            try self.register_manager.getReg(.rax, null);
-            try self.register_manager.getReg(.rcx, null);
-
-            _ = try self.addInst(.{
-                .tag = .lea,
-                .ops = (Mir.Ops{
-                    .reg1 = addr_reg.to64(),
-                    .reg2 = .rbp,
-                }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, unadjusted_off + abi_size)) },
-            });
-
-            // TODO allow for abi_size to be u64
-            try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
-            try self.genInlineMemcpy(
-                @bitCast(u32, -@intCast(i32, stack_offset + abi_size)),
-                .rsp,
-                addr_reg.to64(),
-                count_reg.to64(),
-                tmp_reg.to8(),
-            );
-        },
-        else => return self.fail("TODO implement args on stack for {}", .{mcv}),
-    }
-}
-
 fn airCall(self: *Self, inst: Air.Inst.Index) !void {
     const pl_op = self.air.instructions.items(.data)[inst].pl_op;
     const callee = pl_op.operand;
@@ -2326,12 +2224,9 @@ fn airCall(self: *Self, inst: Air.Inst.Index) !void {
     var info = try self.resolveCallingConventionValues(fn_ty);
     defer info.deinit(self);
 
-    var count: usize = info.args.len;
     var stack_adjustment: u32 = 0;
-    while (count > 0) : (count -= 1) {
-        const arg_i = count - 1;
+    for (args) |arg, arg_i| {
         const mc_arg = info.args[arg_i];
-        const arg = args[arg_i];
         const arg_ty = self.air.typeOf(arg);
         const arg_mcv = try self.resolveInst(args[arg_i]);
         // Here we do not use setRegOrMem even though the logic is similar, because
@@ -2343,9 +2238,9 @@ fn airCall(self: *Self, inst: Air.Inst.Index) !void {
                 try self.genSetReg(arg_ty, reg, arg_mcv);
             },
             .stack_offset => |off| {
-                const abi_size = arg_ty.abiSize(self.target.*);
+                const abi_size = @intCast(u32, arg_ty.abiSize(self.target.*));
                 try self.genSetStackArg(arg_ty, off, arg_mcv);
-                stack_adjustment += @intCast(u32, abi_size);
+                stack_adjustment += abi_size;
             },
             .ptr_stack_offset => {
                 return self.fail("TODO implement calling with MCValue.ptr_stack_offset arg", .{});
@@ -3257,7 +3152,65 @@ fn setRegOrMem(self: *Self, ty: Type, loc: MCValue, val: MCValue) !void {
     }
 }
 
-fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerError!void {
+fn genSetStackArg(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerError!void {
+    const abi_size = ty.abiSize(self.target.*);
+    switch (mcv) {
+        .dead => unreachable,
+        .ptr_embedded_in_code => unreachable,
+        .unreach, .none => return,
+        .register => |reg| {
+            _ = try self.addInst(.{
+                .tag = .mov,
+                .ops = (Mir.Ops{
+                    .reg1 = .rsp,
+                    .reg2 = registerAlias(reg, @intCast(u32, abi_size)),
+                    .flags = 0b10,
+                }).encode(),
+                .data = .{ .imm = @bitCast(u32, -(stack_offset + @intCast(i32, abi_size))) },
+            });
+        },
+        .ptr_stack_offset => {
+            const reg = try self.copyToTmpRegister(ty, mcv);
+            return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
+        },
+        .stack_offset => |unadjusted_off| {
+            if (abi_size <= 8) {
+                const reg = try self.copyToTmpRegister(ty, mcv);
+                return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
+            }
+
+            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
+            const addr_reg = regs[0];
+            const count_reg = regs[1];
+            const tmp_reg = regs[2];
+
+            try self.register_manager.getReg(.rax, null);
+            try self.register_manager.getReg(.rcx, null);
+
+            _ = try self.addInst(.{
+                .tag = .lea,
+                .ops = (Mir.Ops{
+                    .reg1 = addr_reg.to64(),
+                    .reg2 = .rbp,
+                }).encode(),
+                .data = .{ .imm = @bitCast(u32, -(unadjusted_off + @intCast(i32, abi_size))) },
+            });
+
+            // TODO allow for abi_size to be u64
+            try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
+            try self.genInlineMemcpy(
+                -(stack_offset + @intCast(i32, abi_size)),
+                .rsp,
+                addr_reg.to64(),
+                count_reg.to64(),
+                tmp_reg.to8(),
+            );
+        },
+        else => return self.fail("TODO implement args on stack for {}", .{mcv}),
+    }
+}
+
+fn genSetStack(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerError!void {
     switch (mcv) {
         .dead => unreachable,
         .ptr_embedded_in_code => unreachable,
@@ -3284,7 +3237,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
         },
         .immediate => |x_big| {
             const abi_size = ty.abiSize(self.target.*);
-            const adj_off = stack_offset + abi_size;
+            const adj_off = stack_offset + @intCast(i32, abi_size);
             if (adj_off > 128) {
                 return self.fail("TODO implement set stack variable with large stack offset", .{});
             }
@@ -3294,7 +3247,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                     // offset from rbp, which is at the top of the stack frame.
                     // mov [rbp+offset], immediate
                     const payload = try self.addExtra(Mir.ImmPair{
-                        .dest_off = @bitCast(u32, -@intCast(i32, adj_off)),
+                        .dest_off = @bitCast(u32, -adj_off),
                         .operand = @truncate(u32, x_big),
                     });
                     _ = try self.addInst(.{
@@ -3314,7 +3267,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                 8 => {
                     // We have a positive stack offset value but we want a twos complement negative
                     // offset from rbp, which is at the top of the stack frame.
-                    const negative_offset = -@intCast(i32, adj_off);
+                    const negative_offset = -adj_off;
 
                     // 64 bit write to memory would take two mov's anyways so we
                     // insted just use two 32 bit writes to avoid register allocation
@@ -3357,7 +3310,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                 return self.fail("stack offset too large", .{});
             }
             const abi_size = ty.abiSize(self.target.*);
-            const adj_off = stack_offset + abi_size;
+            const adj_off = stack_offset + @intCast(i32, abi_size);
             _ = try self.addInst(.{
                 .tag = .mov,
                 .ops = (Mir.Ops{
@@ -3365,7 +3318,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                     .reg2 = registerAlias(reg, @intCast(u32, abi_size)),
                     .flags = 0b10,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, adj_off)) },
+                .data = .{ .imm = @bitCast(u32, -adj_off) },
             });
         },
         .memory, .embedded_in_code => {
@@ -3391,7 +3344,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                 return self.genSetStack(ty, stack_offset, MCValue{ .register = reg });
             }
 
-            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
+            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx, .rbp });
             const addr_reg = regs[0];
             const count_reg = regs[1];
             const tmp_reg = regs[2];
@@ -3405,14 +3358,14 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                     .reg1 = addr_reg.to64(),
                     .reg2 = .rbp,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, off + abi_size)) },
+                .data = .{ .imm = @bitCast(u32, -(off + @intCast(i32, abi_size))) },
             });
 
             // TODO allow for abi_size to be u64
             try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
 
             return self.genInlineMemcpy(
-                @bitCast(u32, -@intCast(i32, stack_offset + abi_size)),
+                -(stack_offset + @intCast(i32, abi_size)),
                 .rbp,
                 addr_reg.to64(),
                 count_reg.to64(),
@@ -3424,7 +3377,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
 
 fn genInlineMemcpy(
     self: *Self,
-    stack_offset: u32,
+    stack_offset: i32,
     stack_reg: Register,
     addr_reg: Register,
     count_reg: Register,
@@ -3482,7 +3435,7 @@ fn genInlineMemcpy(
             .reg1 = stack_reg,
             .reg2 = tmp_reg.to8(),
         }).encode(),
-        .data = .{ .imm = stack_offset },
+        .data = .{ .imm = @bitCast(u32, stack_offset) },
     });
 
     // add rcx, 1
@@ -3523,14 +3476,14 @@ fn genInlineMemcpy(
     try self.performReloc(loop_reloc);
 }
 
-fn genInlineMemset(self: *Self, ty: Type, stack_offset: u32, value: MCValue) InnerError!void {
+fn genInlineMemset(self: *Self, ty: Type, stack_offset: i32, value: MCValue) InnerError!void {
     try self.register_manager.getReg(.rax, null);
     const abi_size = ty.abiSize(self.target.*);
-    const adj_off = stack_offset + abi_size;
+    const adj_off = stack_offset + @intCast(i32, abi_size);
     if (adj_off > 128) {
         return self.fail("TODO inline memset with large stack offset", .{});
     }
-    const negative_offset = @bitCast(u32, -@intCast(i32, adj_off));
+    const negative_offset = @bitCast(u32, -adj_off);
 
     // We are actually counting `abi_size` bytes; however, we reuse the index register
     // as both the counter and offset scaler, hence we need to subtract one from `abi_size`
@@ -3621,7 +3574,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
             const ptr_abi_size = ty.abiSize(self.target.*);
             const elem_ty = ty.childType();
             const elem_abi_size = elem_ty.abiSize(self.target.*);
-            const off = unadjusted_off + elem_abi_size;
+            const off = unadjusted_off + @intCast(i32, elem_abi_size);
             if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) {
                 return self.fail("stack offset too large", .{});
             }
@@ -3631,7 +3584,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     .reg1 = registerAlias(reg, @intCast(u32, ptr_abi_size)),
                     .reg2 = .rbp,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, off)) },
+                .data = .{ .imm = @bitCast(u32, -off) },
             });
         },
         .ptr_embedded_in_code => unreachable,
@@ -3818,7 +3771,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
         },
         .stack_offset => |unadjusted_off| {
             const abi_size = ty.abiSize(self.target.*);
-            const off = unadjusted_off + abi_size;
+            const off = unadjusted_off + @intCast(i32, abi_size);
             if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) {
                 return self.fail("stack offset too large", .{});
             }
@@ -3829,7 +3782,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     .reg2 = .rbp,
                     .flags = 0b01,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, off)) },
+                .data = .{ .imm = @bitCast(u32, -off) },
             });
         },
     }
@@ -3854,7 +3807,7 @@ fn airArrayToSlice(self: *Self, inst: Air.Inst.Index) !void {
     const array_ty = ptr_ty.childType();
     const array_len = array_ty.arrayLenIncludingSentinel();
     const result: MCValue = if (self.liveness.isUnused(inst)) .dead else blk: {
-        const stack_offset = try self.allocMem(inst, 16, 16);
+        const stack_offset = @intCast(i32, try self.allocMem(inst, 16, 16));
         try self.genSetStack(ptr_ty, stack_offset + 8, ptr);
         try self.genSetStack(Type.initTag(.u64), stack_offset, .{ .immediate = array_len });
         break :blk .{ .stack_offset = stack_offset };
@@ -4235,6 +4188,7 @@ fn resolveCallingConventionValues(self: *Self, fn_ty: Type) !CallMCValues {
             var next_stack_offset: u32 = 0;
             var count: usize = param_types.len;
             while (count > 0) : (count -= 1) {
+                // for (param_types) |ty, i| {
                 const i = count - 1;
                 const ty = param_types[i];
                 if (!ty.hasCodeGenBits()) {
@@ -4253,7 +4207,7 @@ fn resolveCallingConventionValues(self: *Self, fn_ty: Type) !CallMCValues {
                     // such as ptr and len of slices as separate registers.
                     // TODO: also we need to honor the C ABI for relevant types rather than passing on
                     // the stack here.
-                    result.args[i] = .{ .stack_offset = next_stack_offset };
+                    result.args[i] = .{ .stack_offset = @intCast(i32, next_stack_offset) };
                     next_stack_offset += param_size;
                 }
             }

From 062ddb693f3b060a59bc3881cbc6cea2cc8e2855 Mon Sep 17 00:00:00 2001
From: Jakub Konka <kubkon@jakubkonka.com>
Date: Sat, 22 Jan 2022 00:58:18 +0100
Subject: [PATCH 3/4] stage2: fix improper capacity prealloc in lowerToRm
 encoding

---
 src/arch/x86_64/Emit.zig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/arch/x86_64/Emit.zig b/src/arch/x86_64/Emit.zig
index caaddb2e73..058feb56d7 100644
--- a/src/arch/x86_64/Emit.zig
+++ b/src/arch/x86_64/Emit.zig
@@ -1605,7 +1605,7 @@ fn lowerToRmEnc(
             if (reg.size() != src_reg.size()) {
                 return error.OperandSizeMismatch;
             }
-            const encoder = try Encoder.init(code, 3);
+            const encoder = try Encoder.init(code, 4);
             encoder.rex(.{
                 .w = setRexWRegister(reg) or setRexWRegister(src_reg),
                 .r = reg.isExtended(),

From 406c85f9ba056e10899feed18dae91e20942dc55 Mon Sep 17 00:00:00 2001
From: Jakub Konka <kubkon@jakubkonka.com>
Date: Sat, 22 Jan 2022 08:47:04 +0100
Subject: [PATCH 4/4] macho+elf: fix integer overflow in allocateAtom

If there is a big atom available for re-use in the free list, and
it's the last atom in section, it's ideal capacity might span the
entire section in which case we do not want to calculate the actual
end VM addr of the symbol since it may overflow. Instead, we just take
the max capacity available as end VM addr estimate. In this case,
the max capacity equals `std.math.maxInt(u64)`.
---
 src/link/Elf.zig   | 2 +-
 src/link/MachO.zig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/link/Elf.zig b/src/link/Elf.zig
index 63381d24a4..bfd472161a 100644
--- a/src/link/Elf.zig
+++ b/src/link/Elf.zig
@@ -2118,7 +2118,7 @@ fn allocateTextBlock(self: *Elf, block_list: *TextBlockList, text_block: *TextBl
             const sym = self.local_symbols.items[big_block.local_sym_index];
             const capacity = big_block.capacity(self.*);
             const ideal_capacity = padToIdeal(capacity);
-            const ideal_capacity_end_vaddr = sym.st_value + ideal_capacity;
+            const ideal_capacity_end_vaddr = std.math.add(u64, sym.st_value, ideal_capacity) catch ideal_capacity;
             const capacity_end_vaddr = sym.st_value + capacity;
             const new_start_vaddr_unaligned = capacity_end_vaddr - new_block_ideal_capacity;
             const new_start_vaddr = mem.alignBackwardGeneric(u64, new_start_vaddr_unaligned, alignment);
diff --git a/src/link/MachO.zig b/src/link/MachO.zig
index 23ba1ee4b5..d7385f1f33 100644
--- a/src/link/MachO.zig
+++ b/src/link/MachO.zig
@@ -5064,7 +5064,7 @@ fn allocateAtom(self: *MachO, atom: *Atom, new_atom_size: u64, alignment: u64, m
             const sym = self.locals.items[big_atom.local_sym_index];
             const capacity = big_atom.capacity(self.*);
             const ideal_capacity = if (needs_padding) padToIdeal(capacity) else capacity;
-            const ideal_capacity_end_vaddr = sym.n_value + ideal_capacity;
+            const ideal_capacity_end_vaddr = math.add(u64, sym.n_value, ideal_capacity) catch ideal_capacity;
             const capacity_end_vaddr = sym.n_value + capacity;
             const new_start_vaddr_unaligned = capacity_end_vaddr - new_atom_ideal_capacity;
             const new_start_vaddr = mem.alignBackwardGeneric(u64, new_start_vaddr_unaligned, alignment);