diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 522da6e25d..36696fac69 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -43,7 +43,7 @@ err_msg: ?*ErrorMsg,
 args: []MCValue,
 ret_mcv: MCValue,
 fn_type: Type,
-arg_index: usize,
+arg_index: u32,
 src_loc: Module.SrcLoc,
 stack_align: u32,
 
@@ -61,8 +61,6 @@ end_di_column: u32,
 /// which is a relative jump, based on the address following the reloc.
 exitlude_jump_relocs: std.ArrayListUnmanaged(Mir.Inst.Index) = .{},
 
-stack_args_relocs: std.ArrayListUnmanaged(Mir.Inst.Index) = .{},
-
 /// Whenever there is a runtime branch, we push a Branch onto this stack,
 /// and pop it off when the runtime branch joins. This provides an "overlay"
 /// of the table of mappings from instructions to `MCValue` from within the branch.
@@ -119,9 +117,9 @@ pub const MCValue = union(enum) {
     memory: u64,
     /// The value is one of the stack variables.
     /// If the type is a pointer, it means the pointer address is in the stack at this offset.
-    stack_offset: u32,
+    stack_offset: i32,
     /// The value is a pointer to one of the stack variables (payload is stack offset).
-    ptr_stack_offset: u32,
+    ptr_stack_offset: i32,
     /// The value is in the compare flags assuming an unsigned operation,
     /// with this operator applied on top of it.
     compare_flags_unsigned: math.CompareOperator,
@@ -286,7 +284,6 @@ pub fn generate(
     defer function.exitlude_jump_relocs.deinit(bin_file.allocator);
     defer function.mir_instructions.deinit(bin_file.allocator);
     defer function.mir_extra.deinit(bin_file.allocator);
-    defer function.stack_args_relocs.deinit(bin_file.allocator);
     defer if (builtin.mode == .Debug) function.mir_to_air_map.deinit();
 
     var call_info = function.resolveCallingConventionValues(fn_type) catch |err| switch (err) {
@@ -378,13 +375,6 @@ pub fn addExtraAssumeCapacity(self: *Self, extra: anytype) u32 {
 fn gen(self: *Self) InnerError!void {
     const cc = self.fn_type.fnCallingConvention();
     if (cc != .Naked) {
-        // push the callee_preserved_regs that were used
-        const backpatch_push_callee_preserved_regs_i = try self.addInst(.{
-            .tag = .push_regs_from_callee_preserved_regs,
-            .ops = undefined,
-            .data = .{ .regs_to_push_or_pop = undefined }, // to be backpatched
-        });
-
         _ = try self.addInst(.{
             .tag = .push,
             .ops = (Mir.Ops{
@@ -416,6 +406,15 @@ fn gen(self: *Self) InnerError!void {
             .data = undefined,
         });
 
+        // push the callee_preserved_regs that were used
+        const backpatch_push_callee_preserved_regs_i = try self.addInst(.{
+            .tag = .push_regs_from_callee_preserved_regs,
+            .ops = (Mir.Ops{
+                .reg1 = .rbp,
+            }).encode(),
+            .data = .{ .payload = undefined }, // to be backpatched
+        });
+
         try self.genBody(self.air.getMainBody());
 
         // TODO can single exitlude jump reloc be elided? What if it is not at the end of the code?
@@ -429,6 +428,33 @@ fn gen(self: *Self) InnerError!void {
             self.mir_instructions.items(.data)[jmp_reloc].inst = @intCast(u32, self.mir_instructions.len);
         }
 
+        // calculate the data for callee_preserved_regs to be pushed and popped
+        const callee_preserved_regs_payload = blk: {
+            var data = Mir.RegsToPushOrPop{
+                .regs = 0,
+                .disp = mem.alignForwardGeneric(u32, self.next_stack_offset, 8),
+            };
+            inline for (callee_preserved_regs) |reg, i| {
+                if (self.register_manager.isRegAllocated(reg)) {
+                    data.regs |= 1 << @intCast(u5, i);
+                    self.max_end_stack += 8;
+                }
+            }
+            break :blk try self.addExtra(data);
+        };
+
+        const data = self.mir_instructions.items(.data);
+        // backpatch the push instruction
+        data[backpatch_push_callee_preserved_regs_i].payload = callee_preserved_regs_payload;
+        // pop the callee_preserved_regs
+        _ = try self.addInst(.{
+            .tag = .pop_regs_from_callee_preserved_regs,
+            .ops = (Mir.Ops{
+                .reg1 = .rbp,
+            }).encode(),
+            .data = .{ .payload = callee_preserved_regs_payload },
+        });
+
         _ = try self.addInst(.{
             .tag = .dbg_epilogue_begin,
             .ops = undefined,
@@ -450,34 +476,6 @@ fn gen(self: *Self) InnerError!void {
             .data = undefined,
         });
 
-        // calculate the data for callee_preserved_regs to be pushed and popped
-        var callee_preserved_regs_push_data: u32 = 0x0;
-        // TODO this is required on macOS since macOS actively checks for stack alignment
-        // at every extern call site. As far as I can tell, macOS accounts for the typical
-        // function prologue first 2 instructions of:
-        // ...
-        // push rbp
-        // mov rsp, rbp
-        // ...
-        // Thus we don't need to adjust the stack for the first push instruction. However,
-        // any subsequent push of values on the stack such as when preserving registers,
-        // needs to be taken into account here.
-        var stack_adjustment: u32 = 0;
-        inline for (callee_preserved_regs) |reg, i| {
-            if (self.register_manager.isRegAllocated(reg)) {
-                callee_preserved_regs_push_data |= 1 << @intCast(u5, i);
-                stack_adjustment += @divExact(reg.size(), 8);
-            }
-        }
-        const data = self.mir_instructions.items(.data);
-        // backpatch the push instruction
-        data[backpatch_push_callee_preserved_regs_i].regs_to_push_or_pop = callee_preserved_regs_push_data;
-        // pop the callee_preserved_regs
-        _ = try self.addInst(.{
-            .tag = .pop_regs_from_callee_preserved_regs,
-            .ops = undefined,
-            .data = .{ .regs_to_push_or_pop = callee_preserved_regs_push_data },
-        });
         _ = try self.addInst(.{
             .tag = .ret,
             .ops = (Mir.Ops{
@@ -487,37 +485,28 @@ fn gen(self: *Self) InnerError!void {
         });
 
         // Adjust the stack
-        const stack_end = self.max_end_stack;
-        if (stack_end > math.maxInt(i32) - stack_adjustment) {
+        if (self.max_end_stack > math.maxInt(i32)) {
             return self.failSymbol("too much stack used in call parameters", .{});
         }
         // TODO we should reuse this mechanism to align the stack when calling any function even if
         // we do not pass any args on the stack BUT we still push regs to stack with `push` inst.
-        const aligned_stack_end = @intCast(u32, mem.alignForward(stack_end, self.stack_align));
-        if (aligned_stack_end > 0 or (stack_adjustment > 0 and self.target.isDarwin())) {
-            const imm = if (self.target.isDarwin()) aligned_stack_end + stack_adjustment else aligned_stack_end;
+        const aligned_stack_end = @intCast(u32, mem.alignForward(self.max_end_stack, self.stack_align));
+        if (aligned_stack_end > 0) {
             self.mir_instructions.set(backpatch_stack_sub, .{
                 .tag = .sub,
                 .ops = (Mir.Ops{
                     .reg1 = .rsp,
                 }).encode(),
-                .data = .{ .imm = imm },
+                .data = .{ .imm = aligned_stack_end },
             });
             self.mir_instructions.set(backpatch_stack_add, .{
                 .tag = .add,
                 .ops = (Mir.Ops{
                     .reg1 = .rsp,
                 }).encode(),
-                .data = .{ .imm = imm },
+                .data = .{ .imm = aligned_stack_end },
             });
         }
-        while (self.stack_args_relocs.popOrNull()) |index| {
-            // TODO like above, gotta figure out the alignment shenanigans for macOS, etc.
-            const adjustment = if (self.target.isDarwin()) 2 * stack_adjustment else stack_adjustment;
-            // +16 bytes to account for saved return address of the `call` instruction and
-            // `push rbp`.
-            self.mir_instructions.items(.data)[index].imm += adjustment + aligned_stack_end + 16;
-        }
     } else {
         _ = try self.addInst(.{
             .tag = .dbg_prologue_end,
@@ -808,7 +797,7 @@ fn allocRegOrMem(self: *Self, inst: Air.Inst.Index, reg_ok: bool) !MCValue {
         }
     }
     const stack_offset = try self.allocMem(inst, abi_size, abi_align);
-    return MCValue{ .stack_offset = stack_offset };
+    return MCValue{ .stack_offset = @intCast(i32, stack_offset) };
 }
 
 pub fn spillInstruction(self: *Self, reg: Register, inst: Air.Inst.Index) !void {
@@ -854,12 +843,12 @@ fn copyToNewRegisterWithExceptions(
 
 fn airAlloc(self: *Self, inst: Air.Inst.Index) !void {
     const stack_offset = try self.allocMemPtr(inst);
-    return self.finishAir(inst, .{ .ptr_stack_offset = stack_offset }, .{ .none, .none, .none });
+    return self.finishAir(inst, .{ .ptr_stack_offset = @intCast(i32, stack_offset) }, .{ .none, .none, .none });
 }
 
 fn airRetPtr(self: *Self, inst: Air.Inst.Index) !void {
     const stack_offset = try self.allocMemPtr(inst);
-    return self.finishAir(inst, .{ .ptr_stack_offset = stack_offset }, .{ .none, .none, .none });
+    return self.finishAir(inst, .{ .ptr_stack_offset = @intCast(i32, stack_offset) }, .{ .none, .none, .none });
 }
 
 fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
@@ -1419,7 +1408,7 @@ fn airArrayElemVal(self: *Self, inst: Air.Inst.Index) !void {
                         .reg1 = addr_reg.to64(),
                         .reg2 = .rbp,
                     }).encode(),
-                    .data = .{ .imm = @bitCast(u32, -@intCast(i32, off + array_abi_size)) },
+                    .data = .{ .imm = @bitCast(u32, -(off + @intCast(i32, array_abi_size))) },
                 });
             },
             else => return self.fail("TODO implement array_elem_val when array is {}", .{array}),
@@ -1623,7 +1612,7 @@ fn load(self: *Self, dst_mcv: MCValue, ptr: MCValue, ptr_ty: Type) InnerError!vo
                     try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
 
                     return self.genInlineMemcpy(
-                        @bitCast(u32, -@intCast(i32, off + abi_size)),
+                        -(off + @intCast(i32, abi_size)),
                         .rbp,
                         registerAlias(addr_reg, @divExact(reg.size(), 8)),
                         count_reg.to64(),
@@ -1780,10 +1769,10 @@ fn structFieldPtr(self: *Self, inst: Air.Inst.Index, operand: Air.Inst.Ref, inde
     return if (self.liveness.isUnused(inst)) .dead else result: {
         const mcv = try self.resolveInst(operand);
         const struct_ty = self.air.typeOf(operand).childType();
-        const struct_size = @intCast(u32, struct_ty.abiSize(self.target.*));
-        const struct_field_offset = @intCast(u32, struct_ty.structFieldOffset(index, self.target.*));
+        const struct_size = @intCast(i32, struct_ty.abiSize(self.target.*));
+        const struct_field_offset = @intCast(i32, struct_ty.structFieldOffset(index, self.target.*));
         const struct_field_ty = struct_ty.structFieldType(index);
-        const struct_field_size = @intCast(u32, struct_field_ty.abiSize(self.target.*));
+        const struct_field_size = @intCast(i32, struct_field_ty.abiSize(self.target.*));
 
         switch (mcv) {
             .ptr_stack_offset => |off| {
@@ -1803,10 +1792,10 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void {
     const result: MCValue = if (self.liveness.isUnused(inst)) .dead else result: {
         const mcv = try self.resolveInst(operand);
         const struct_ty = self.air.typeOf(operand);
-        const struct_size = @intCast(u32, struct_ty.abiSize(self.target.*));
-        const struct_field_offset = @intCast(u32, struct_ty.structFieldOffset(index, self.target.*));
+        const struct_size = @intCast(i32, struct_ty.abiSize(self.target.*));
+        const struct_field_offset = @intCast(i32, struct_ty.structFieldOffset(index, self.target.*));
         const struct_field_ty = struct_ty.structFieldType(index);
-        const struct_field_size = @intCast(u32, struct_field_ty.abiSize(self.target.*));
+        const struct_field_size = @intCast(i32, struct_field_ty.abiSize(self.target.*));
 
         switch (mcv) {
             .stack_offset => |off| {
@@ -1970,7 +1959,7 @@ fn genBinMathOpMir(
                         return self.fail("stack offset too large", .{});
                     }
                     const abi_size = dst_ty.abiSize(self.target.*);
-                    const adj_off = off + abi_size;
+                    const adj_off = off + @intCast(i32, abi_size);
                     _ = try self.addInst(.{
                         .tag = mir_tag,
                         .ops = (Mir.Ops{
@@ -1978,7 +1967,7 @@ fn genBinMathOpMir(
                             .reg2 = .rbp,
                             .flags = 0b01,
                         }).encode(),
-                        .data = .{ .imm = @bitCast(u32, -@intCast(i32, adj_off)) },
+                        .data = .{ .imm = @bitCast(u32, -adj_off) },
                     });
                 },
                 .compare_flags_unsigned => {
@@ -1997,7 +1986,7 @@ fn genBinMathOpMir(
             if (abi_size > 8) {
                 return self.fail("TODO implement ADD/SUB/CMP for stack dst with large ABI", .{});
             }
-            const adj_off = off + abi_size;
+            const adj_off = off + @intCast(i32, abi_size);
 
             switch (src_mcv) {
                 .none => unreachable,
@@ -2013,7 +2002,7 @@ fn genBinMathOpMir(
                             .reg2 = registerAlias(src_reg, @intCast(u32, abi_size)),
                             .flags = 0b10,
                         }).encode(),
-                        .data = .{ .imm = @bitCast(u32, -@intCast(i32, adj_off)) },
+                        .data = .{ .imm = @bitCast(u32, -adj_off) },
                     });
                 },
                 .immediate => |imm| {
@@ -2034,7 +2023,7 @@ fn genBinMathOpMir(
                         else => unreachable,
                     };
                     const payload = try self.addExtra(Mir.ImmPair{
-                        .dest_off = @bitCast(u32, -@intCast(i32, adj_off)),
+                        .dest_off = @bitCast(u32, -adj_off),
                         .operand = @truncate(u32, imm),
                     });
                     _ = try self.addInst(.{
@@ -2172,7 +2161,7 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
     const mcv = self.args[arg_index];
     const payload = try self.addExtra(Mir.ArgDbgInfo{
         .air_inst = inst,
-        .arg_index = @truncate(u32, arg_index), // TODO can arg_index: u32?
+        .arg_index = arg_index,
     });
     _ = try self.addInst(.{
         .tag = .arg_dbg_info,
@@ -2188,58 +2177,13 @@ fn airArg(self: *Self, inst: Air.Inst.Index) !void {
                 self.register_manager.getRegAssumeFree(reg.to64(), inst);
                 break :blk mcv;
             },
-            .stack_offset => |off| {
+            .stack_offset => {
                 const ty = self.air.typeOfIndex(inst);
                 const abi_size = ty.abiSize(self.target.*);
-
-                if (abi_size <= 8) {
-                    const reg = try self.register_manager.allocReg(inst, &.{});
-                    const reloc = try self.addInst(.{
-                        .tag = .mov,
-                        .ops = (Mir.Ops{
-                            .reg1 = registerAlias(reg, @intCast(u32, abi_size)),
-                            .reg2 = .rsp,
-                            .flags = 0b01,
-                        }).encode(),
-                        .data = .{ .imm = off },
-                    });
-                    try self.stack_args_relocs.append(self.bin_file.allocator, reloc);
-                    break :blk .{ .register = reg };
-                }
-
-                // TODO copy ellision
-                const dst_mcv = try self.allocRegOrMem(inst, false);
-                const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
-                const addr_reg = regs[0];
-                const count_reg = regs[1];
-                const tmp_reg = regs[2];
-
-                try self.register_manager.getReg(.rax, null);
-                try self.register_manager.getReg(.rcx, null);
-
-                const reloc = try self.addInst(.{
-                    .tag = .lea,
-                    .ops = (Mir.Ops{
-                        .reg1 = addr_reg.to64(),
-                        .reg2 = .rsp,
-                    }).encode(),
-                    .data = .{ .imm = off },
-                });
-                try self.stack_args_relocs.append(self.bin_file.allocator, reloc);
-
-                // TODO allow for abi_size to be u64
-                try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
-                try self.genInlineMemcpy(
-                    @bitCast(u32, -@intCast(i32, dst_mcv.stack_offset + abi_size)),
-                    .rbp,
-                    addr_reg.to64(),
-                    count_reg.to64(),
-                    tmp_reg.to8(),
-                );
-
-                break :blk dst_mcv;
+                const off = @intCast(i32, (arg_index + 1) * abi_size) + 16;
+                break :blk MCValue{ .stack_offset = -off };
             },
-            else => unreachable,
+            else => return self.fail("TODO implement arg for {}", .{mcv}),
         }
     };
 
@@ -2264,64 +2208,6 @@ fn airFence(self: *Self) !void {
     //return self.finishAirBookkeeping();
 }
 
-fn genSetStackArg(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerError!void {
-    const abi_size = ty.abiSize(self.target.*);
-    switch (mcv) {
-        .dead => unreachable,
-        .ptr_embedded_in_code => unreachable,
-        .unreach, .none => return,
-        .register => |reg| {
-            _ = try self.addInst(.{
-                .tag = .mov,
-                .ops = (Mir.Ops{
-                    .reg1 = .rsp,
-                    .reg2 = registerAlias(reg, @intCast(u32, abi_size)),
-                    .flags = 0b10,
-                }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, stack_offset + abi_size)) },
-            });
-        },
-        .ptr_stack_offset => {
-            const reg = try self.copyToTmpRegister(ty, mcv);
-            return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
-        },
-        .stack_offset => |unadjusted_off| {
-            if (abi_size <= 8) {
-                const reg = try self.copyToTmpRegister(ty, mcv);
-                return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
-            }
-
-            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
-            const addr_reg = regs[0];
-            const count_reg = regs[1];
-            const tmp_reg = regs[2];
-
-            try self.register_manager.getReg(.rax, null);
-            try self.register_manager.getReg(.rcx, null);
-
-            _ = try self.addInst(.{
-                .tag = .lea,
-                .ops = (Mir.Ops{
-                    .reg1 = addr_reg.to64(),
-                    .reg2 = .rbp,
-                }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, unadjusted_off + abi_size)) },
-            });
-
-            // TODO allow for abi_size to be u64
-            try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
-            try self.genInlineMemcpy(
-                @bitCast(u32, -@intCast(i32, stack_offset + abi_size)),
-                .rsp,
-                addr_reg.to64(),
-                count_reg.to64(),
-                tmp_reg.to8(),
-            );
-        },
-        else => return self.fail("TODO implement args on stack for {}", .{mcv}),
-    }
-}
-
 fn airCall(self: *Self, inst: Air.Inst.Index) !void {
     const pl_op = self.air.instructions.items(.data)[inst].pl_op;
     const callee = pl_op.operand;
@@ -2338,12 +2224,9 @@ fn airCall(self: *Self, inst: Air.Inst.Index) !void {
     var info = try self.resolveCallingConventionValues(fn_ty);
     defer info.deinit(self);
 
-    var count: usize = info.args.len;
     var stack_adjustment: u32 = 0;
-    while (count > 0) : (count -= 1) {
-        const arg_i = count - 1;
+    for (args) |arg, arg_i| {
         const mc_arg = info.args[arg_i];
-        const arg = args[arg_i];
         const arg_ty = self.air.typeOf(arg);
         const arg_mcv = try self.resolveInst(args[arg_i]);
         // Here we do not use setRegOrMem even though the logic is similar, because
@@ -2355,9 +2238,9 @@ fn airCall(self: *Self, inst: Air.Inst.Index) !void {
                 try self.genSetReg(arg_ty, reg, arg_mcv);
             },
             .stack_offset => |off| {
-                const abi_size = arg_ty.abiSize(self.target.*);
+                const abi_size = @intCast(u32, arg_ty.abiSize(self.target.*));
                 try self.genSetStackArg(arg_ty, off, arg_mcv);
-                stack_adjustment += @intCast(u32, abi_size);
+                stack_adjustment += abi_size;
             },
             .ptr_stack_offset => {
                 return self.fail("TODO implement calling with MCValue.ptr_stack_offset arg", .{});
@@ -3269,7 +3152,65 @@ fn setRegOrMem(self: *Self, ty: Type, loc: MCValue, val: MCValue) !void {
     }
 }
 
-fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerError!void {
+fn genSetStackArg(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerError!void {
+    const abi_size = ty.abiSize(self.target.*);
+    switch (mcv) {
+        .dead => unreachable,
+        .ptr_embedded_in_code => unreachable,
+        .unreach, .none => return,
+        .register => |reg| {
+            _ = try self.addInst(.{
+                .tag = .mov,
+                .ops = (Mir.Ops{
+                    .reg1 = .rsp,
+                    .reg2 = registerAlias(reg, @intCast(u32, abi_size)),
+                    .flags = 0b10,
+                }).encode(),
+                .data = .{ .imm = @bitCast(u32, -(stack_offset + @intCast(i32, abi_size))) },
+            });
+        },
+        .ptr_stack_offset => {
+            const reg = try self.copyToTmpRegister(ty, mcv);
+            return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
+        },
+        .stack_offset => |unadjusted_off| {
+            if (abi_size <= 8) {
+                const reg = try self.copyToTmpRegister(ty, mcv);
+                return self.genSetStackArg(ty, stack_offset, MCValue{ .register = reg });
+            }
+
+            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
+            const addr_reg = regs[0];
+            const count_reg = regs[1];
+            const tmp_reg = regs[2];
+
+            try self.register_manager.getReg(.rax, null);
+            try self.register_manager.getReg(.rcx, null);
+
+            _ = try self.addInst(.{
+                .tag = .lea,
+                .ops = (Mir.Ops{
+                    .reg1 = addr_reg.to64(),
+                    .reg2 = .rbp,
+                }).encode(),
+                .data = .{ .imm = @bitCast(u32, -(unadjusted_off + @intCast(i32, abi_size))) },
+            });
+
+            // TODO allow for abi_size to be u64
+            try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
+            try self.genInlineMemcpy(
+                -(stack_offset + @intCast(i32, abi_size)),
+                .rsp,
+                addr_reg.to64(),
+                count_reg.to64(),
+                tmp_reg.to8(),
+            );
+        },
+        else => return self.fail("TODO implement args on stack for {}", .{mcv}),
+    }
+}
+
+fn genSetStack(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerError!void {
     switch (mcv) {
         .dead => unreachable,
         .ptr_embedded_in_code => unreachable,
@@ -3296,7 +3237,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
         },
         .immediate => |x_big| {
             const abi_size = ty.abiSize(self.target.*);
-            const adj_off = stack_offset + abi_size;
+            const adj_off = stack_offset + @intCast(i32, abi_size);
             if (adj_off > 128) {
                 return self.fail("TODO implement set stack variable with large stack offset", .{});
             }
@@ -3306,7 +3247,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                     // offset from rbp, which is at the top of the stack frame.
                     // mov [rbp+offset], immediate
                     const payload = try self.addExtra(Mir.ImmPair{
-                        .dest_off = @bitCast(u32, -@intCast(i32, adj_off)),
+                        .dest_off = @bitCast(u32, -adj_off),
                         .operand = @truncate(u32, x_big),
                     });
                     _ = try self.addInst(.{
@@ -3326,7 +3267,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                 8 => {
                     // We have a positive stack offset value but we want a twos complement negative
                     // offset from rbp, which is at the top of the stack frame.
-                    const negative_offset = -@intCast(i32, adj_off);
+                    const negative_offset = -adj_off;
 
                     // 64 bit write to memory would take two mov's anyways so we
                     // insted just use two 32 bit writes to avoid register allocation
@@ -3369,7 +3310,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                 return self.fail("stack offset too large", .{});
             }
             const abi_size = ty.abiSize(self.target.*);
-            const adj_off = stack_offset + abi_size;
+            const adj_off = stack_offset + @intCast(i32, abi_size);
             _ = try self.addInst(.{
                 .tag = .mov,
                 .ops = (Mir.Ops{
@@ -3377,7 +3318,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                     .reg2 = registerAlias(reg, @intCast(u32, abi_size)),
                     .flags = 0b10,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, adj_off)) },
+                .data = .{ .imm = @bitCast(u32, -adj_off) },
             });
         },
         .memory, .embedded_in_code => {
@@ -3403,7 +3344,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                 return self.genSetStack(ty, stack_offset, MCValue{ .register = reg });
             }
 
-            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx });
+            const regs = try self.register_manager.allocRegs(3, .{ null, null, null }, &.{ .rax, .rcx, .rbp });
             const addr_reg = regs[0];
             const count_reg = regs[1];
             const tmp_reg = regs[2];
@@ -3417,14 +3358,14 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
                     .reg1 = addr_reg.to64(),
                     .reg2 = .rbp,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, off + abi_size)) },
+                .data = .{ .imm = @bitCast(u32, -(off + @intCast(i32, abi_size))) },
             });
 
             // TODO allow for abi_size to be u64
             try self.genSetReg(Type.initTag(.u32), count_reg, .{ .immediate = @intCast(u32, abi_size) });
 
             return self.genInlineMemcpy(
-                @bitCast(u32, -@intCast(i32, stack_offset + abi_size)),
+                -(stack_offset + @intCast(i32, abi_size)),
                 .rbp,
                 addr_reg.to64(),
                 count_reg.to64(),
@@ -3436,7 +3377,7 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: u32, mcv: MCValue) InnerErro
 
 fn genInlineMemcpy(
     self: *Self,
-    stack_offset: u32,
+    stack_offset: i32,
     stack_reg: Register,
     addr_reg: Register,
     count_reg: Register,
@@ -3494,7 +3435,7 @@ fn genInlineMemcpy(
             .reg1 = stack_reg,
             .reg2 = tmp_reg.to8(),
         }).encode(),
-        .data = .{ .imm = stack_offset },
+        .data = .{ .imm = @bitCast(u32, stack_offset) },
     });
 
     // add rcx, 1
@@ -3535,14 +3476,14 @@ fn genInlineMemcpy(
     try self.performReloc(loop_reloc);
 }
 
-fn genInlineMemset(self: *Self, ty: Type, stack_offset: u32, value: MCValue) InnerError!void {
+fn genInlineMemset(self: *Self, ty: Type, stack_offset: i32, value: MCValue) InnerError!void {
     try self.register_manager.getReg(.rax, null);
     const abi_size = ty.abiSize(self.target.*);
-    const adj_off = stack_offset + abi_size;
+    const adj_off = stack_offset + @intCast(i32, abi_size);
     if (adj_off > 128) {
         return self.fail("TODO inline memset with large stack offset", .{});
     }
-    const negative_offset = @bitCast(u32, -@intCast(i32, adj_off));
+    const negative_offset = @bitCast(u32, -adj_off);
 
     // We are actually counting `abi_size` bytes; however, we reuse the index register
     // as both the counter and offset scaler, hence we need to subtract one from `abi_size`
@@ -3633,7 +3574,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
             const ptr_abi_size = ty.abiSize(self.target.*);
             const elem_ty = ty.childType();
             const elem_abi_size = elem_ty.abiSize(self.target.*);
-            const off = unadjusted_off + elem_abi_size;
+            const off = unadjusted_off + @intCast(i32, elem_abi_size);
             if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) {
                 return self.fail("stack offset too large", .{});
             }
@@ -3643,7 +3584,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     .reg1 = registerAlias(reg, @intCast(u32, ptr_abi_size)),
                     .reg2 = .rbp,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, off)) },
+                .data = .{ .imm = @bitCast(u32, -off) },
             });
         },
         .ptr_embedded_in_code => unreachable,
@@ -3830,7 +3771,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
         },
         .stack_offset => |unadjusted_off| {
             const abi_size = ty.abiSize(self.target.*);
-            const off = unadjusted_off + abi_size;
+            const off = unadjusted_off + @intCast(i32, abi_size);
             if (off < std.math.minInt(i32) or off > std.math.maxInt(i32)) {
                 return self.fail("stack offset too large", .{});
             }
@@ -3841,7 +3782,7 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                     .reg2 = .rbp,
                     .flags = 0b01,
                 }).encode(),
-                .data = .{ .imm = @bitCast(u32, -@intCast(i32, off)) },
+                .data = .{ .imm = @bitCast(u32, -off) },
             });
         },
     }
@@ -3866,7 +3807,7 @@ fn airArrayToSlice(self: *Self, inst: Air.Inst.Index) !void {
     const array_ty = ptr_ty.childType();
     const array_len = array_ty.arrayLenIncludingSentinel();
     const result: MCValue = if (self.liveness.isUnused(inst)) .dead else blk: {
-        const stack_offset = try self.allocMem(inst, 16, 16);
+        const stack_offset = @intCast(i32, try self.allocMem(inst, 16, 16));
         try self.genSetStack(ptr_ty, stack_offset + 8, ptr);
         try self.genSetStack(Type.initTag(.u64), stack_offset, .{ .immediate = array_len });
         break :blk .{ .stack_offset = stack_offset };
@@ -4247,6 +4188,7 @@ fn resolveCallingConventionValues(self: *Self, fn_ty: Type) !CallMCValues {
             var next_stack_offset: u32 = 0;
             var count: usize = param_types.len;
             while (count > 0) : (count -= 1) {
+                // for (param_types) |ty, i| {
                 const i = count - 1;
                 const ty = param_types[i];
                 if (!ty.hasCodeGenBits()) {
@@ -4265,7 +4207,7 @@ fn resolveCallingConventionValues(self: *Self, fn_ty: Type) !CallMCValues {
                     // such as ptr and len of slices as separate registers.
                     // TODO: also we need to honor the C ABI for relevant types rather than passing on
                     // the stack here.
-                    result.args[i] = .{ .stack_offset = next_stack_offset };
+                    result.args[i] = .{ .stack_offset = @intCast(i32, next_stack_offset) };
                     next_stack_offset += param_size;
                 }
             }
diff --git a/src/arch/x86_64/Emit.zig b/src/arch/x86_64/Emit.zig
index 4ec80dd1ba..058feb56d7 100644
--- a/src/arch/x86_64/Emit.zig
+++ b/src/arch/x86_64/Emit.zig
@@ -251,23 +251,25 @@ fn mirPushPop(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
     }
 }
 fn mirPushPopRegsFromCalleePreservedRegs(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
-    const callee_preserved_regs = bits.callee_preserved_regs;
-    const regs = emit.mir.instructions.items(.data)[inst].regs_to_push_or_pop;
-    if (tag == .push) {
-        for (callee_preserved_regs) |reg, i| {
-            if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
-            lowerToOEnc(.push, reg, emit.code) catch |err|
-                return emit.failWithLoweringError(err);
-        }
-    } else {
-        // pop in the reverse direction
-        var i = callee_preserved_regs.len;
-        while (i > 0) : (i -= 1) {
-            const reg = callee_preserved_regs[i - 1];
-            if ((regs >> @intCast(u5, i - 1)) & 1 == 0) continue;
-            lowerToOEnc(.pop, reg, emit.code) catch |err|
-                return emit.failWithLoweringError(err);
+    const ops = Mir.Ops.decode(emit.mir.instructions.items(.ops)[inst]);
+    const payload = emit.mir.instructions.items(.data)[inst].payload;
+    const data = emit.mir.extraData(Mir.RegsToPushOrPop, payload).data;
+    const regs = data.regs;
+    var disp: u32 = data.disp + 8;
+    for (bits.callee_preserved_regs) |reg, i| {
+        if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
+        if (tag == .push) {
+            lowerToMrEnc(.mov, RegisterOrMemory.mem(.qword_ptr, .{
+                .disp = @bitCast(u32, -@intCast(i32, disp)),
+                .base = ops.reg1,
+            }), reg.to64(), emit.code) catch |err| return emit.failWithLoweringError(err);
+        } else {
+            lowerToRmEnc(.mov, reg.to64(), RegisterOrMemory.mem(.qword_ptr, .{
+                .disp = @bitCast(u32, -@intCast(i32, disp)),
+                .base = ops.reg1,
+            }), emit.code) catch |err| return emit.failWithLoweringError(err);
         }
+        disp += 8;
     }
 }
 
@@ -1603,7 +1605,7 @@ fn lowerToRmEnc(
             if (reg.size() != src_reg.size()) {
                 return error.OperandSizeMismatch;
             }
-            const encoder = try Encoder.init(code, 3);
+            const encoder = try Encoder.init(code, 4);
             encoder.rex(.{
                 .w = setRexWRegister(reg) or setRexWRegister(src_reg),
                 .r = reg.isExtended(),
diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig
index 7bab6ce39b..e3b79a8329 100644
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@@ -333,8 +333,6 @@ pub const Inst = struct {
         got_entry: u32,
         /// Index into `extra`. Meaning of what can be found there is context-dependent.
         payload: u32,
-        /// A bitfield of which callee_preserved_regs to push
-        regs_to_push_or_pop: u32,
     };
 
     // Make sure we don't accidentally make instructions bigger than expected.
@@ -346,6 +344,11 @@ pub const Inst = struct {
     }
 };
 
+pub const RegsToPushOrPop = struct {
+    regs: u32,
+    disp: u32,
+};
+
 pub const ImmPair = struct {
     dest_off: u32,
     operand: u32,
diff --git a/src/arch/x86_64/PrintMir.zig b/src/arch/x86_64/PrintMir.zig
index 67ebb2aa58..a7a4666f77 100644
--- a/src/arch/x86_64/PrintMir.zig
+++ b/src/arch/x86_64/PrintMir.zig
@@ -180,26 +180,28 @@ fn mirPushPop(print: *const Print, tag: Mir.Inst.Tag, inst: Mir.Inst.Index, w: a
     try w.writeByte('\n');
 }
 fn mirPushPopRegsFromCalleePreservedRegs(print: *const Print, tag: Mir.Inst.Tag, inst: Mir.Inst.Index, w: anytype) !void {
-    const callee_preserved_regs = bits.callee_preserved_regs;
-    // PUSH/POP reg
-
-    const regs = print.mir.instructions.items(.data)[inst].regs_to_push_or_pop;
-    if (regs == 0) return w.writeAll("push/pop no regs from callee_preserved_regs\n");
-    if (tag == .push) {
-        try w.writeAll("push ");
-        for (callee_preserved_regs) |reg, i| {
-            if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
-            try w.print("{s}, ", .{@tagName(reg)});
-        }
-    } else {
-        // pop in the reverse direction
-        var i = callee_preserved_regs.len;
-        try w.writeAll("pop ");
-        while (i > 0) : (i -= 1) {
-            if ((regs >> @intCast(u5, i - 1)) & 1 == 0) continue;
-            const reg = callee_preserved_regs[i - 1];
-            try w.print("{s}, ", .{@tagName(reg)});
+    const ops = Mir.Ops.decode(print.mir.instructions.items(.ops)[inst]);
+    const payload = print.mir.instructions.items(.data)[inst].payload;
+    const data = print.mir.extraData(Mir.RegsToPushOrPop, payload).data;
+    const regs = data.regs;
+    var disp: u32 = data.disp + 8;
+    if (regs == 0) return w.writeAll("no regs from callee_preserved_regs\n");
+    for (bits.callee_preserved_regs) |reg, i| {
+        if ((regs >> @intCast(u5, i)) & 1 == 0) continue;
+        if (tag == .push) {
+            try w.print("mov qword ptr [{s} + {d}], {s}", .{
+                @tagName(ops.reg1),
+                @bitCast(u32, -@intCast(i32, disp)),
+                @tagName(reg.to64()),
+            });
+        } else {
+            try w.print("mov {s}, qword ptr [{s} + {d}]", .{
+                @tagName(reg.to64()),
+                @tagName(ops.reg1),
+                @bitCast(u32, -@intCast(i32, disp)),
+            });
         }
+        disp += 8;
     }
     try w.writeByte('\n');
 }
diff --git a/src/link/Elf.zig b/src/link/Elf.zig
index 63381d24a4..bfd472161a 100644
--- a/src/link/Elf.zig
+++ b/src/link/Elf.zig
@@ -2118,7 +2118,7 @@ fn allocateTextBlock(self: *Elf, block_list: *TextBlockList, text_block: *TextBl
             const sym = self.local_symbols.items[big_block.local_sym_index];
             const capacity = big_block.capacity(self.*);
             const ideal_capacity = padToIdeal(capacity);
-            const ideal_capacity_end_vaddr = sym.st_value + ideal_capacity;
+            const ideal_capacity_end_vaddr = std.math.add(u64, sym.st_value, ideal_capacity) catch ideal_capacity;
             const capacity_end_vaddr = sym.st_value + capacity;
             const new_start_vaddr_unaligned = capacity_end_vaddr - new_block_ideal_capacity;
             const new_start_vaddr = mem.alignBackwardGeneric(u64, new_start_vaddr_unaligned, alignment);
diff --git a/src/link/MachO.zig b/src/link/MachO.zig
index 23ba1ee4b5..d7385f1f33 100644
--- a/src/link/MachO.zig
+++ b/src/link/MachO.zig
@@ -5064,7 +5064,7 @@ fn allocateAtom(self: *MachO, atom: *Atom, new_atom_size: u64, alignment: u64, m
             const sym = self.locals.items[big_atom.local_sym_index];
             const capacity = big_atom.capacity(self.*);
             const ideal_capacity = if (needs_padding) padToIdeal(capacity) else capacity;
-            const ideal_capacity_end_vaddr = sym.n_value + ideal_capacity;
+            const ideal_capacity_end_vaddr = math.add(u64, sym.n_value, ideal_capacity) catch ideal_capacity;
             const capacity_end_vaddr = sym.n_value + capacity;
             const new_start_vaddr_unaligned = capacity_end_vaddr - new_atom_ideal_capacity;
             const new_start_vaddr = mem.alignBackwardGeneric(u64, new_start_vaddr_unaligned, alignment);