From ef9aeb6ac415348e16f04913839002929064c91e Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 17 Jul 2020 09:33:56 -0700 Subject: [PATCH 1/4] stage2: codegen: refactor to always have comptime arch --- src-self-hosted/codegen.zig | 2220 ++++++++++++++-------------- src-self-hosted/codegen/x86_64.zig | 5 +- 2 files changed, 1116 insertions(+), 1109 deletions(-) diff --git a/src-self-hosted/codegen.zig b/src-self-hosted/codegen.zig index e78ee28b5d..c259eb2595 100644 --- a/src-self-hosted/codegen.zig +++ b/src-self-hosted/codegen.zig @@ -11,6 +11,8 @@ const ErrorMsg = Module.ErrorMsg; const Target = std.Target; const Allocator = mem.Allocator; const trace = @import("tracy.zig").trace; +const x86_64 = @import("codegen/x86_64.zig"); +const x86 = @import("codegen/x86.zig"); /// The codegen-related data that is stored in `ir.Inst.Block` instructions. pub const BlockData = struct { @@ -32,67 +34,75 @@ pub const Result = union(enum) { fail: *Module.ErrorMsg, }; +pub const GenerateSymbolError = error{ + OutOfMemory, + /// A Decl that this symbol depends on had a semantic analysis failure. + AnalysisFail, +}; + pub fn generateSymbol( bin_file: *link.File.Elf, src: usize, typed_value: TypedValue, code: *std.ArrayList(u8), -) error{ - OutOfMemory, - /// A Decl that this symbol depends on had a semantic analysis failure. - AnalysisFail, -}!Result { +) GenerateSymbolError!Result { const tracy = trace(@src()); defer tracy.end(); switch (typed_value.ty.zigTypeTag()) { .Fn => { - const module_fn = typed_value.val.cast(Value.Payload.Function).?.func; - - const fn_type = module_fn.owner_decl.typed_value.most_recent.typed_value.ty; - const param_types = try bin_file.allocator.alloc(Type, fn_type.fnParamLen()); - defer bin_file.allocator.free(param_types); - fn_type.fnParamTypes(param_types); - var mc_args = try bin_file.allocator.alloc(MCValue, param_types.len); - defer bin_file.allocator.free(mc_args); - - var branch_stack = std.ArrayList(Function.Branch).init(bin_file.allocator); - defer { - assert(branch_stack.items.len == 1); - branch_stack.items[0].deinit(bin_file.allocator); - branch_stack.deinit(); - } - const branch = try branch_stack.addOne(); - branch.* = .{}; - - var function = Function{ - .gpa = bin_file.allocator, - .target = &bin_file.options.target, - .bin_file = bin_file, - .mod_fn = module_fn, - .code = code, - .err_msg = null, - .args = mc_args, - .arg_index = 0, - .branch_stack = &branch_stack, - .src = src, - }; - - const cc = fn_type.fnCallingConvention(); - branch.max_end_stack = function.resolveParameters(src, cc, param_types, mc_args) catch |err| switch (err) { - error.CodegenFail => return Result{ .fail = function.err_msg.? }, - else => |e| return e, - }; - - function.gen() catch |err| switch (err) { - error.CodegenFail => return Result{ .fail = function.err_msg.? 
}, - else => |e| return e, - }; - - if (function.err_msg) |em| { - return Result{ .fail = em }; - } else { - return Result{ .appended = {} }; + switch (bin_file.options.target.cpu.arch) { + .arm => return Function(.arm).generateSymbol(bin_file, src, typed_value, code), + .armeb => return Function(.armeb).generateSymbol(bin_file, src, typed_value, code), + .aarch64 => return Function(.aarch64).generateSymbol(bin_file, src, typed_value, code), + .aarch64_be => return Function(.aarch64_be).generateSymbol(bin_file, src, typed_value, code), + .aarch64_32 => return Function(.aarch64_32).generateSymbol(bin_file, src, typed_value, code), + .arc => return Function(.arc).generateSymbol(bin_file, src, typed_value, code), + .avr => return Function(.avr).generateSymbol(bin_file, src, typed_value, code), + .bpfel => return Function(.bpfel).generateSymbol(bin_file, src, typed_value, code), + .bpfeb => return Function(.bpfeb).generateSymbol(bin_file, src, typed_value, code), + .hexagon => return Function(.hexagon).generateSymbol(bin_file, src, typed_value, code), + .mips => return Function(.mips).generateSymbol(bin_file, src, typed_value, code), + .mipsel => return Function(.mipsel).generateSymbol(bin_file, src, typed_value, code), + .mips64 => return Function(.mips64).generateSymbol(bin_file, src, typed_value, code), + .mips64el => return Function(.mips64el).generateSymbol(bin_file, src, typed_value, code), + .msp430 => return Function(.msp430).generateSymbol(bin_file, src, typed_value, code), + .powerpc => return Function(.powerpc).generateSymbol(bin_file, src, typed_value, code), + .powerpc64 => return Function(.powerpc64).generateSymbol(bin_file, src, typed_value, code), + .powerpc64le => return Function(.powerpc64le).generateSymbol(bin_file, src, typed_value, code), + .r600 => return Function(.r600).generateSymbol(bin_file, src, typed_value, code), + .amdgcn => return Function(.amdgcn).generateSymbol(bin_file, src, typed_value, code), + .riscv32 => return Function(.riscv32).generateSymbol(bin_file, src, typed_value, code), + .riscv64 => return Function(.riscv64).generateSymbol(bin_file, src, typed_value, code), + .sparc => return Function(.sparc).generateSymbol(bin_file, src, typed_value, code), + .sparcv9 => return Function(.sparcv9).generateSymbol(bin_file, src, typed_value, code), + .sparcel => return Function(.sparcel).generateSymbol(bin_file, src, typed_value, code), + .s390x => return Function(.s390x).generateSymbol(bin_file, src, typed_value, code), + .tce => return Function(.tce).generateSymbol(bin_file, src, typed_value, code), + .tcele => return Function(.tcele).generateSymbol(bin_file, src, typed_value, code), + .thumb => return Function(.thumb).generateSymbol(bin_file, src, typed_value, code), + .thumbeb => return Function(.thumbeb).generateSymbol(bin_file, src, typed_value, code), + .i386 => return Function(.i386).generateSymbol(bin_file, src, typed_value, code), + .x86_64 => return Function(.x86_64).generateSymbol(bin_file, src, typed_value, code), + .xcore => return Function(.xcore).generateSymbol(bin_file, src, typed_value, code), + .nvptx => return Function(.nvptx).generateSymbol(bin_file, src, typed_value, code), + .nvptx64 => return Function(.nvptx64).generateSymbol(bin_file, src, typed_value, code), + .le32 => return Function(.le32).generateSymbol(bin_file, src, typed_value, code), + .le64 => return Function(.le64).generateSymbol(bin_file, src, typed_value, code), + .amdil => return Function(.amdil).generateSymbol(bin_file, src, typed_value, code), + .amdil64 => return 
Function(.amdil64).generateSymbol(bin_file, src, typed_value, code), + .hsail => return Function(.hsail).generateSymbol(bin_file, src, typed_value, code), + .hsail64 => return Function(.hsail64).generateSymbol(bin_file, src, typed_value, code), + .spir => return Function(.spir).generateSymbol(bin_file, src, typed_value, code), + .spir64 => return Function(.spir64).generateSymbol(bin_file, src, typed_value, code), + .kalimba => return Function(.kalimba).generateSymbol(bin_file, src, typed_value, code), + .shave => return Function(.shave).generateSymbol(bin_file, src, typed_value, code), + .lanai => return Function(.lanai).generateSymbol(bin_file, src, typed_value, code), + .wasm32 => return Function(.wasm32).generateSymbol(bin_file, src, typed_value, code), + .wasm64 => return Function(.wasm64).generateSymbol(bin_file, src, typed_value, code), + .renderscript32 => return Function(.renderscript32).generateSymbol(bin_file, src, typed_value, code), + .renderscript64 => return Function(.renderscript64).generateSymbol(bin_file, src, typed_value, code), + .ve => return Function(.ve).generateSymbol(bin_file, src, typed_value, code), } }, .Array => { @@ -189,1101 +199,1095 @@ const InnerError = error{ CodegenFail, }; -const MCValue = union(enum) { - /// No runtime bits. `void` types, empty structs, u0, enums with 1 tag, etc. - none, - /// Control flow will not allow this value to be observed. - unreach, - /// No more references to this value remain. - dead, - /// A pointer-sized integer that fits in a register. - immediate: u64, - /// The constant was emitted into the code, at this offset. - embedded_in_code: usize, - /// The value is in a target-specific register. The value can - /// be @intToEnum casted to the respective Reg enum. - register: usize, - /// The value is in memory at a hard-coded address. - memory: u64, - /// The value is one of the stack variables. - stack_offset: u64, - /// The value is in the compare flags assuming an unsigned operation, - /// with this operator applied on top of it. - compare_flags_unsigned: std.math.CompareOperator, - /// The value is in the compare flags assuming a signed operation, - /// with this operator applied on top of it. - compare_flags_signed: std.math.CompareOperator, +fn Function(comptime arch: std.Target.Cpu.Arch) type { + return struct { + gpa: *Allocator, + bin_file: *link.File.Elf, + target: *const std.Target, + mod_fn: *const Module.Fn, + code: *std.ArrayList(u8), + err_msg: ?*ErrorMsg, + args: []MCValue, + arg_index: usize, + src: usize, - fn isMemory(mcv: MCValue) bool { - return switch (mcv) { - .embedded_in_code, .memory, .stack_offset => true, - else => false, - }; - } + /// Whenever there is a runtime branch, we push a Branch onto this stack, + /// and pop it off when the runtime branch joins. This provides an "overlay" + /// of the table of mappings from instructions to `MCValue` from within the branch. + /// This way we can modify the `MCValue` for an instruction in different ways + /// within different branches. Special consideration is needed when a branch + /// joins with its parent, to make sure all instructions have the same MCValue + /// across each runtime branch upon joining. + branch_stack: *std.ArrayList(Branch), - fn isImmediate(mcv: MCValue) bool { - return switch (mcv) { - .immediate => true, - else => false, - }; - } + const MCValue = union(enum) { + /// No runtime bits. `void` types, empty structs, u0, enums with 1 tag, etc. + none, + /// Control flow will not allow this value to be observed. 
+ unreach, + /// No more references to this value remain. + dead, + /// A pointer-sized integer that fits in a register. + immediate: u64, + /// The constant was emitted into the code, at this offset. + embedded_in_code: usize, + /// The value is in a target-specific register. + register: Reg, + /// The value is in memory at a hard-coded address. + memory: u64, + /// The value is one of the stack variables. + stack_offset: u64, + /// The value is in the compare flags assuming an unsigned operation, + /// with this operator applied on top of it. + compare_flags_unsigned: std.math.CompareOperator, + /// The value is in the compare flags assuming a signed operation, + /// with this operator applied on top of it. + compare_flags_signed: std.math.CompareOperator, - fn isMutable(mcv: MCValue) bool { - return switch (mcv) { - .none => unreachable, - .unreach => unreachable, - .dead => unreachable, - - .immediate, - .embedded_in_code, - .memory, - .compare_flags_unsigned, - .compare_flags_signed, - => false, - - .register, - .stack_offset, - => true, - }; - } -}; - -const Function = struct { - gpa: *Allocator, - bin_file: *link.File.Elf, - target: *const std.Target, - mod_fn: *const Module.Fn, - code: *std.ArrayList(u8), - err_msg: ?*ErrorMsg, - args: []MCValue, - arg_index: usize, - src: usize, - - /// Whenever there is a runtime branch, we push a Branch onto this stack, - /// and pop it off when the runtime branch joins. This provides an "overlay" - /// of the table of mappings from instructions to `MCValue` from within the branch. - /// This way we can modify the `MCValue` for an instruction in different ways - /// within different branches. Special consideration is needed when a branch - /// joins with its parent, to make sure all instructions have the same MCValue - /// across each runtime branch upon joining. - branch_stack: *std.ArrayList(Branch), - - const Branch = struct { - inst_table: std.AutoHashMapUnmanaged(*ir.Inst, MCValue) = .{}, - - /// The key is an enum value of an arch-specific register. - registers: std.AutoHashMapUnmanaged(usize, RegisterAllocation) = .{}, - - /// Maps offset to what is stored there. - stack: std.AutoHashMapUnmanaged(usize, StackAllocation) = .{}, - /// Offset from the stack base, representing the end of the stack frame. - max_end_stack: u32 = 0, - /// Represents the current end stack offset. If there is no existing slot - /// to place a new stack allocation, it goes here, and then bumps `max_end_stack`. 
- next_stack_offset: u32 = 0, - - fn deinit(self: *Branch, gpa: *Allocator) void { - self.inst_table.deinit(gpa); - self.registers.deinit(gpa); - self.stack.deinit(gpa); - self.* = undefined; - } - }; - - const RegisterAllocation = struct { - inst: *ir.Inst, - }; - - const StackAllocation = struct { - inst: *ir.Inst, - size: u32, - }; - - fn gen(self: *Function) !void { - switch (self.target.cpu.arch) { - .arm => return self.genArch(.arm), - .armeb => return self.genArch(.armeb), - .aarch64 => return self.genArch(.aarch64), - .aarch64_be => return self.genArch(.aarch64_be), - .aarch64_32 => return self.genArch(.aarch64_32), - .arc => return self.genArch(.arc), - .avr => return self.genArch(.avr), - .bpfel => return self.genArch(.bpfel), - .bpfeb => return self.genArch(.bpfeb), - .hexagon => return self.genArch(.hexagon), - .mips => return self.genArch(.mips), - .mipsel => return self.genArch(.mipsel), - .mips64 => return self.genArch(.mips64), - .mips64el => return self.genArch(.mips64el), - .msp430 => return self.genArch(.msp430), - .powerpc => return self.genArch(.powerpc), - .powerpc64 => return self.genArch(.powerpc64), - .powerpc64le => return self.genArch(.powerpc64le), - .r600 => return self.genArch(.r600), - .amdgcn => return self.genArch(.amdgcn), - .riscv32 => return self.genArch(.riscv32), - .riscv64 => return self.genArch(.riscv64), - .sparc => return self.genArch(.sparc), - .sparcv9 => return self.genArch(.sparcv9), - .sparcel => return self.genArch(.sparcel), - .s390x => return self.genArch(.s390x), - .tce => return self.genArch(.tce), - .tcele => return self.genArch(.tcele), - .thumb => return self.genArch(.thumb), - .thumbeb => return self.genArch(.thumbeb), - .i386 => return self.genArch(.i386), - .x86_64 => return self.genArch(.x86_64), - .xcore => return self.genArch(.xcore), - .nvptx => return self.genArch(.nvptx), - .nvptx64 => return self.genArch(.nvptx64), - .le32 => return self.genArch(.le32), - .le64 => return self.genArch(.le64), - .amdil => return self.genArch(.amdil), - .amdil64 => return self.genArch(.amdil64), - .hsail => return self.genArch(.hsail), - .hsail64 => return self.genArch(.hsail64), - .spir => return self.genArch(.spir), - .spir64 => return self.genArch(.spir64), - .kalimba => return self.genArch(.kalimba), - .shave => return self.genArch(.shave), - .lanai => return self.genArch(.lanai), - .wasm32 => return self.genArch(.wasm32), - .wasm64 => return self.genArch(.wasm64), - .renderscript32 => return self.genArch(.renderscript32), - .renderscript64 => return self.genArch(.renderscript64), - .ve => return self.genArch(.ve), - } - } - - fn genArch(self: *Function, comptime arch: std.Target.Cpu.Arch) !void { - try self.code.ensureCapacity(self.code.items.len + 11); - - // push rbp - // mov rbp, rsp - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x55, 0x48, 0x89, 0xe5 }); - - // sub rsp, x - const stack_end = self.branch_stack.items[0].max_end_stack; - if (stack_end > std.math.maxInt(i32)) { - return self.fail(self.src, "too much stack used in call parameters", .{}); - } else if (stack_end > std.math.maxInt(i8)) { - // 48 83 ec xx sub rsp,0x10 - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xec }); - const x = @intCast(u32, stack_end); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x); - } else if (stack_end != 0) { - // 48 81 ec xx xx xx xx sub rsp,0x80 - const x = @intCast(u8, stack_end); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xec, x }); - } - - try self.genBody(self.mod_fn.analysis.success, arch); - 
} - - fn genBody(self: *Function, body: ir.Body, comptime arch: std.Target.Cpu.Arch) InnerError!void { - const inst_table = &self.branch_stack.items[0].inst_table; - for (body.instructions) |inst| { - const new_inst = try self.genFuncInst(inst, arch); - try inst_table.putNoClobber(self.gpa, inst, new_inst); - } - } - - fn genFuncInst(self: *Function, inst: *ir.Inst, comptime arch: std.Target.Cpu.Arch) !MCValue { - switch (inst.tag) { - .add => return self.genAdd(inst.cast(ir.Inst.Add).?, arch), - .arg => return self.genArg(inst.cast(ir.Inst.Arg).?), - .assembly => return self.genAsm(inst.cast(ir.Inst.Assembly).?, arch), - .bitcast => return self.genBitCast(inst.cast(ir.Inst.BitCast).?), - .block => return self.genBlock(inst.cast(ir.Inst.Block).?, arch), - .br => return self.genBr(inst.cast(ir.Inst.Br).?, arch), - .breakpoint => return self.genBreakpoint(inst.src, arch), - .brvoid => return self.genBrVoid(inst.cast(ir.Inst.BrVoid).?, arch), - .call => return self.genCall(inst.cast(ir.Inst.Call).?, arch), - .cmp => return self.genCmp(inst.cast(ir.Inst.Cmp).?, arch), - .condbr => return self.genCondBr(inst.cast(ir.Inst.CondBr).?, arch), - .constant => unreachable, // excluded from function bodies - .isnonnull => return self.genIsNonNull(inst.cast(ir.Inst.IsNonNull).?, arch), - .isnull => return self.genIsNull(inst.cast(ir.Inst.IsNull).?, arch), - .ptrtoint => return self.genPtrToInt(inst.cast(ir.Inst.PtrToInt).?), - .ret => return self.genRet(inst.cast(ir.Inst.Ret).?, arch), - .retvoid => return self.genRetVoid(inst.cast(ir.Inst.RetVoid).?, arch), - .sub => return self.genSub(inst.cast(ir.Inst.Sub).?, arch), - .unreach => return MCValue{ .unreach = {} }, - .not => return self.genNot(inst.cast(ir.Inst.Not).?, arch), - } - } - - fn genNot(self: *Function, inst: *ir.Inst.Not, comptime arch: std.Target.Cpu.Arch) !MCValue { - // No side effects, so if it's unreferenced, do nothing. - if (inst.base.isUnused()) - return MCValue.dead; - const operand = try self.resolveInst(inst.args.operand); - switch (operand) { - .dead => unreachable, - .unreach => unreachable, - .compare_flags_unsigned => |op| return MCValue{ - .compare_flags_unsigned = switch (op) { - .gte => .lt, - .gt => .lte, - .neq => .eq, - .lt => .gte, - .lte => .gt, - .eq => .neq, - }, - }, - .compare_flags_signed => |op| return MCValue{ - .compare_flags_signed = switch (op) { - .gte => .lt, - .gt => .lte, - .neq => .eq, - .lt => .gte, - .lte => .gt, - .eq => .neq, - }, - }, - else => {}, - } - - switch (arch) { - .x86_64 => { - var imm = ir.Inst.Constant{ - .base = .{ - .tag = .constant, - .deaths = 0, - .ty = inst.args.operand.ty, - .src = inst.args.operand.src, - }, - .val = Value.initTag(.bool_true), + fn isMemory(mcv: MCValue) bool { + return switch (mcv) { + .embedded_in_code, .memory, .stack_offset => true, + else => false, }; - return try self.genX8664BinMath(&inst.base, inst.args.operand, &imm.base, 6, 0x30); - }, - else => return self.fail(inst.base.src, "TODO implement NOT for {}", .{self.target.cpu.arch}), - } - } - - fn genAdd(self: *Function, inst: *ir.Inst.Add, comptime arch: std.Target.Cpu.Arch) !MCValue { - // No side effects, so if it's unreferenced, do nothing. 
- if (inst.base.isUnused()) - return MCValue.dead; - switch (arch) { - .x86_64 => { - return try self.genX8664BinMath(&inst.base, inst.args.lhs, inst.args.rhs, 0, 0x00); - }, - else => return self.fail(inst.base.src, "TODO implement add for {}", .{self.target.cpu.arch}), - } - } - - fn genSub(self: *Function, inst: *ir.Inst.Sub, comptime arch: std.Target.Cpu.Arch) !MCValue { - // No side effects, so if it's unreferenced, do nothing. - if (inst.base.isUnused()) - return MCValue.dead; - switch (arch) { - .x86_64 => { - return try self.genX8664BinMath(&inst.base, inst.args.lhs, inst.args.rhs, 5, 0x28); - }, - else => return self.fail(inst.base.src, "TODO implement sub for {}", .{self.target.cpu.arch}), - } - } - - /// ADD, SUB, XOR, OR, AND - fn genX8664BinMath(self: *Function, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst, opx: u8, mr: u8) !MCValue { - try self.code.ensureCapacity(self.code.items.len + 8); - - const lhs = try self.resolveInst(op_lhs); - const rhs = try self.resolveInst(op_rhs); - - // There are 2 operands, destination and source. - // Either one, but not both, can be a memory operand. - // Source operand can be an immediate, 8 bits or 32 bits. - // So, if either one of the operands dies with this instruction, we can use it - // as the result MCValue. - var dst_mcv: MCValue = undefined; - var src_mcv: MCValue = undefined; - var src_inst: *ir.Inst = undefined; - if (inst.operandDies(0) and lhs.isMutable()) { - // LHS dies; use it as the destination. - // Both operands cannot be memory. - src_inst = op_rhs; - if (lhs.isMemory() and rhs.isMemory()) { - dst_mcv = try self.copyToNewRegister(op_lhs); - src_mcv = rhs; - } else { - dst_mcv = lhs; - src_mcv = rhs; } - } else if (inst.operandDies(1) and rhs.isMutable()) { - // RHS dies; use it as the destination. - // Both operands cannot be memory. - src_inst = op_lhs; - if (lhs.isMemory() and rhs.isMemory()) { - dst_mcv = try self.copyToNewRegister(op_rhs); - src_mcv = lhs; - } else { - dst_mcv = rhs; - src_mcv = lhs; + + fn isImmediate(mcv: MCValue) bool { + return switch (mcv) { + .immediate => true, + else => false, + }; } - } else { - if (lhs.isMemory()) { - dst_mcv = try self.copyToNewRegister(op_lhs); - src_mcv = rhs; - src_inst = op_rhs; - } else { - dst_mcv = try self.copyToNewRegister(op_rhs); - src_mcv = lhs; - src_inst = op_lhs; - } - } - // This instruction supports only signed 32-bit immediates at most. If the immediate - // value is larger than this, we put it in a register. - // A potential opportunity for future optimization here would be keeping track - // of the fact that the instruction is available both as an immediate - // and as a register. 
- switch (src_mcv) { - .immediate => |imm| { - if (imm > std.math.maxInt(u31)) { - src_mcv = try self.copyToNewRegister(src_inst); - } - }, - else => {}, - } - try self.genX8664BinMathCode(inst.src, dst_mcv, src_mcv, opx, mr); - - return dst_mcv; - } - - fn genX8664BinMathCode(self: *Function, src: usize, dst_mcv: MCValue, src_mcv: MCValue, opx: u8, mr: u8) !void { - switch (dst_mcv) { - .none => unreachable, - .dead, .unreach, .immediate => unreachable, - .compare_flags_unsigned => unreachable, - .compare_flags_signed => unreachable, - .register => |dst_reg_usize| { - const dst_reg = @intToEnum(Reg(.x86_64), @intCast(u8, dst_reg_usize)); - switch (src_mcv) { + fn isMutable(mcv: MCValue) bool { + return switch (mcv) { .none => unreachable, - .dead, .unreach => unreachable, - .register => |src_reg_usize| { - const src_reg = @intToEnum(Reg(.x86_64), @intCast(u8, src_reg_usize)); - self.rex(.{ .b = dst_reg.isExtended(), .r = src_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ mr + 0x1, 0xC0 | (@as(u8, src_reg.id() & 0b111) << 3) | @as(u8, dst_reg.id() & 0b111) }); + .unreach => unreachable, + .dead => unreachable, + + .immediate, + .embedded_in_code, + .memory, + .compare_flags_unsigned, + .compare_flags_signed, + => false, + + .register, + .stack_offset, + => true, + }; + } + }; + + const Branch = struct { + inst_table: std.AutoHashMapUnmanaged(*ir.Inst, MCValue) = .{}, + + /// The key is an enum value of an arch-specific register. + registers: std.AutoHashMapUnmanaged(usize, RegisterAllocation) = .{}, + + /// Maps offset to what is stored there. + stack: std.AutoHashMapUnmanaged(usize, StackAllocation) = .{}, + /// Offset from the stack base, representing the end of the stack frame. + max_end_stack: u32 = 0, + /// Represents the current end stack offset. If there is no existing slot + /// to place a new stack allocation, it goes here, and then bumps `max_end_stack`. 
+ next_stack_offset: u32 = 0, + + fn deinit(self: *Branch, gpa: *Allocator) void { + self.inst_table.deinit(gpa); + self.registers.deinit(gpa); + self.stack.deinit(gpa); + self.* = undefined; + } + }; + + const RegisterAllocation = struct { + inst: *ir.Inst, + }; + + const StackAllocation = struct { + inst: *ir.Inst, + size: u32, + }; + + const Self = @This(); + + fn generateSymbol( + bin_file: *link.File.Elf, + src: usize, + typed_value: TypedValue, + code: *std.ArrayList(u8), + ) GenerateSymbolError!Result { + const module_fn = typed_value.val.cast(Value.Payload.Function).?.func; + + const fn_type = module_fn.owner_decl.typed_value.most_recent.typed_value.ty; + const param_types = try bin_file.allocator.alloc(Type, fn_type.fnParamLen()); + defer bin_file.allocator.free(param_types); + fn_type.fnParamTypes(param_types); + var mc_args = try bin_file.allocator.alloc(MCValue, param_types.len); + defer bin_file.allocator.free(mc_args); + + var branch_stack = std.ArrayList(Branch).init(bin_file.allocator); + defer { + assert(branch_stack.items.len == 1); + branch_stack.items[0].deinit(bin_file.allocator); + branch_stack.deinit(); + } + const branch = try branch_stack.addOne(); + branch.* = .{}; + + var function = Self{ + .gpa = bin_file.allocator, + .target = &bin_file.options.target, + .bin_file = bin_file, + .mod_fn = module_fn, + .code = code, + .err_msg = null, + .args = mc_args, + .arg_index = 0, + .branch_stack = &branch_stack, + .src = src, + }; + + const cc = fn_type.fnCallingConvention(); + branch.max_end_stack = function.resolveParameters(src, cc, param_types, mc_args) catch |err| switch (err) { + error.CodegenFail => return Result{ .fail = function.err_msg.? }, + else => |e| return e, + }; + + function.gen() catch |err| switch (err) { + error.CodegenFail => return Result{ .fail = function.err_msg.? 
}, + else => |e| return e, + }; + + if (function.err_msg) |em| { + return Result{ .fail = em }; + } else { + return Result{ .appended = {} }; + } + } + + fn gen(self: *Self) !void { + try self.code.ensureCapacity(self.code.items.len + 11); + + // push rbp + // mov rbp, rsp + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x55, 0x48, 0x89, 0xe5 }); + + // sub rsp, x + const stack_end = self.branch_stack.items[0].max_end_stack; + if (stack_end > std.math.maxInt(i32)) { + return self.fail(self.src, "too much stack used in call parameters", .{}); + } else if (stack_end > std.math.maxInt(i8)) { + // 48 83 ec xx sub rsp,0x10 + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x81, 0xec }); + const x = @intCast(u32, stack_end); + mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), x); + } else if (stack_end != 0) { + // 48 81 ec xx xx xx xx sub rsp,0x80 + const x = @intCast(u8, stack_end); + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x48, 0x83, 0xec, x }); + } + + try self.genBody(self.mod_fn.analysis.success); + } + + fn genBody(self: *Self, body: ir.Body) InnerError!void { + const inst_table = &self.branch_stack.items[0].inst_table; + for (body.instructions) |inst| { + const new_inst = try self.genFuncInst(inst); + try inst_table.putNoClobber(self.gpa, inst, new_inst); + } + } + + fn genFuncInst(self: *Self, inst: *ir.Inst) !MCValue { + switch (inst.tag) { + .add => return self.genAdd(inst.cast(ir.Inst.Add).?), + .arg => return self.genArg(inst.cast(ir.Inst.Arg).?), + .assembly => return self.genAsm(inst.cast(ir.Inst.Assembly).?), + .bitcast => return self.genBitCast(inst.cast(ir.Inst.BitCast).?), + .block => return self.genBlock(inst.cast(ir.Inst.Block).?), + .br => return self.genBr(inst.cast(ir.Inst.Br).?), + .breakpoint => return self.genBreakpoint(inst.src), + .brvoid => return self.genBrVoid(inst.cast(ir.Inst.BrVoid).?), + .call => return self.genCall(inst.cast(ir.Inst.Call).?), + .cmp => return self.genCmp(inst.cast(ir.Inst.Cmp).?), + .condbr => return self.genCondBr(inst.cast(ir.Inst.CondBr).?), + .constant => unreachable, // excluded from function bodies + .isnonnull => return self.genIsNonNull(inst.cast(ir.Inst.IsNonNull).?), + .isnull => return self.genIsNull(inst.cast(ir.Inst.IsNull).?), + .ptrtoint => return self.genPtrToInt(inst.cast(ir.Inst.PtrToInt).?), + .ret => return self.genRet(inst.cast(ir.Inst.Ret).?), + .retvoid => return self.genRetVoid(inst.cast(ir.Inst.RetVoid).?), + .sub => return self.genSub(inst.cast(ir.Inst.Sub).?), + .unreach => return MCValue{ .unreach = {} }, + .not => return self.genNot(inst.cast(ir.Inst.Not).?), + } + } + + fn genNot(self: *Self, inst: *ir.Inst.Not) !MCValue { + // No side effects, so if it's unreferenced, do nothing. + if (inst.base.isUnused()) + return MCValue.dead; + const operand = try self.resolveInst(inst.args.operand); + switch (operand) { + .dead => unreachable, + .unreach => unreachable, + .compare_flags_unsigned => |op| return MCValue{ + .compare_flags_unsigned = switch (op) { + .gte => .lt, + .gt => .lte, + .neq => .eq, + .lt => .gte, + .lte => .gt, + .eq => .neq, }, - .immediate => |imm| { - const imm32 = @intCast(u31, imm); // This case must be handled before calling genX8664BinMathCode. 
- // 81 /opx id - if (imm32 <= std.math.maxInt(u7)) { - self.rex(.{ .b = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x83, - 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), - @intCast(u8, imm32), - }); - } else { - self.rex(.{ .r = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x81, - 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), - }); - std.mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), imm32); + }, + .compare_flags_signed => |op| return MCValue{ + .compare_flags_signed = switch (op) { + .gte => .lt, + .gt => .lte, + .neq => .eq, + .lt => .gte, + .lte => .gt, + .eq => .neq, + }, + }, + else => {}, + } + + switch (arch) { + .x86_64 => { + var imm = ir.Inst.Constant{ + .base = .{ + .tag = .constant, + .deaths = 0, + .ty = inst.args.operand.ty, + .src = inst.args.operand.src, + }, + .val = Value.initTag(.bool_true), + }; + return try self.genX8664BinMath(&inst.base, inst.args.operand, &imm.base, 6, 0x30); + }, + else => return self.fail(inst.base.src, "TODO implement NOT for {}", .{self.target.cpu.arch}), + } + } + + fn genAdd(self: *Self, inst: *ir.Inst.Add) !MCValue { + // No side effects, so if it's unreferenced, do nothing. + if (inst.base.isUnused()) + return MCValue.dead; + switch (arch) { + .x86_64 => { + return try self.genX8664BinMath(&inst.base, inst.args.lhs, inst.args.rhs, 0, 0x00); + }, + else => return self.fail(inst.base.src, "TODO implement add for {}", .{self.target.cpu.arch}), + } + } + + fn genSub(self: *Self, inst: *ir.Inst.Sub) !MCValue { + // No side effects, so if it's unreferenced, do nothing. + if (inst.base.isUnused()) + return MCValue.dead; + switch (arch) { + .x86_64 => { + return try self.genX8664BinMath(&inst.base, inst.args.lhs, inst.args.rhs, 5, 0x28); + }, + else => return self.fail(inst.base.src, "TODO implement sub for {}", .{self.target.cpu.arch}), + } + } + + /// ADD, SUB, XOR, OR, AND + fn genX8664BinMath(self: *Self, inst: *ir.Inst, op_lhs: *ir.Inst, op_rhs: *ir.Inst, opx: u8, mr: u8) !MCValue { + try self.code.ensureCapacity(self.code.items.len + 8); + + const lhs = try self.resolveInst(op_lhs); + const rhs = try self.resolveInst(op_rhs); + + // There are 2 operands, destination and source. + // Either one, but not both, can be a memory operand. + // Source operand can be an immediate, 8 bits or 32 bits. + // So, if either one of the operands dies with this instruction, we can use it + // as the result MCValue. + var dst_mcv: MCValue = undefined; + var src_mcv: MCValue = undefined; + var src_inst: *ir.Inst = undefined; + if (inst.operandDies(0) and lhs.isMutable()) { + // LHS dies; use it as the destination. + // Both operands cannot be memory. + src_inst = op_rhs; + if (lhs.isMemory() and rhs.isMemory()) { + dst_mcv = try self.moveToNewRegister(op_lhs); + src_mcv = rhs; + } else { + dst_mcv = lhs; + src_mcv = rhs; + } + } else if (inst.operandDies(1) and rhs.isMutable()) { + // RHS dies; use it as the destination. + // Both operands cannot be memory. + src_inst = op_lhs; + if (lhs.isMemory() and rhs.isMemory()) { + dst_mcv = try self.moveToNewRegister(op_rhs); + src_mcv = lhs; + } else { + dst_mcv = rhs; + src_mcv = lhs; + } + } else { + if (lhs.isMemory()) { + dst_mcv = try self.moveToNewRegister(op_lhs); + src_mcv = rhs; + src_inst = op_rhs; + } else { + dst_mcv = try self.moveToNewRegister(op_rhs); + src_mcv = lhs; + src_inst = op_lhs; + } + } + // This instruction supports only signed 32-bit immediates at most. 
If the immediate + // value is larger than this, we put it in a register. + // A potential opportunity for future optimization here would be keeping track + // of the fact that the instruction is available both as an immediate + // and as a register. + switch (src_mcv) { + .immediate => |imm| { + if (imm > std.math.maxInt(u31)) { + src_mcv = try self.moveToNewRegister(src_inst); + } + }, + else => {}, + } + + try self.genX8664BinMathCode(inst.src, dst_mcv, src_mcv, opx, mr); + + return dst_mcv; + } + + fn genX8664BinMathCode(self: *Self, src: usize, dst_mcv: MCValue, src_mcv: MCValue, opx: u8, mr: u8) !void { + switch (dst_mcv) { + .none => unreachable, + .dead, .unreach, .immediate => unreachable, + .compare_flags_unsigned => unreachable, + .compare_flags_signed => unreachable, + .register => |dst_reg| { + switch (src_mcv) { + .none => unreachable, + .dead, .unreach => unreachable, + .register => |src_reg| { + self.rex(.{ .b = dst_reg.isExtended(), .r = src_reg.isExtended(), .w = dst_reg.size() == 64 }); + self.code.appendSliceAssumeCapacity(&[_]u8{ mr + 0x1, 0xC0 | (@as(u8, src_reg.id() & 0b111) << 3) | @as(u8, dst_reg.id() & 0b111) }); + }, + .immediate => |imm| { + const imm32 = @intCast(u31, imm); // This case must be handled before calling genX8664BinMathCode. + // 81 /opx id + if (imm32 <= std.math.maxInt(u7)) { + self.rex(.{ .b = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); + self.code.appendSliceAssumeCapacity(&[_]u8{ + 0x83, + 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), + @intCast(u8, imm32), + }); + } else { + self.rex(.{ .r = dst_reg.isExtended(), .w = dst_reg.size() == 64 }); + self.code.appendSliceAssumeCapacity(&[_]u8{ + 0x81, + 0xC0 | (opx << 3) | @truncate(u3, dst_reg.id()), + }); + std.mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), imm32); + } + }, + .embedded_in_code, .memory, .stack_offset => { + return self.fail(src, "TODO implement x86 ADD/SUB/CMP source memory", .{}); + }, + .compare_flags_unsigned => { + return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (unsigned)", .{}); + }, + .compare_flags_signed => { + return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (signed)", .{}); + }, + } + }, + .embedded_in_code, .memory, .stack_offset => { + return self.fail(src, "TODO implement x86 ADD/SUB/CMP destination memory", .{}); + }, + } + } + + fn genArg(self: *Self, inst: *ir.Inst.Arg) !MCValue { + const i = self.arg_index; + self.arg_index += 1; + return self.args[i]; + } + + fn genBreakpoint(self: *Self, src: usize) !MCValue { + switch (arch) { + .i386, .x86_64 => { + try self.code.append(0xcc); // int3 + }, + else => return self.fail(src, "TODO implement @breakpoint() for {}", .{self.target.cpu.arch}), + } + return .none; + } + + fn genCall(self: *Self, inst: *ir.Inst.Call) !MCValue { + const fn_ty = inst.args.func.ty; + const cc = fn_ty.fnCallingConvention(); + const param_types = try self.gpa.alloc(Type, fn_ty.fnParamLen()); + defer self.gpa.free(param_types); + fn_ty.fnParamTypes(param_types); + var mc_args = try self.gpa.alloc(MCValue, param_types.len); + defer self.gpa.free(mc_args); + const stack_byte_count = try self.resolveParameters(inst.base.src, cc, param_types, mc_args); + + switch (arch) { + .x86_64 => { + for (mc_args) |mc_arg, arg_i| { + const arg = inst.args.args[arg_i]; + const arg_mcv = try self.resolveInst(inst.args.args[arg_i]); + switch (mc_arg) { + .none => continue, + .register => |reg| { + try self.genSetReg(arg.src, reg, arg_mcv); + // TODO interact with the register 
allocator to mark the instruction as moved. + }, + .stack_offset => { + // Here we need to emit instructions like this: + // mov qword ptr [rsp + stack_offset], x + return self.fail(inst.base.src, "TODO implement calling with parameters in memory", .{}); + }, + .immediate => unreachable, + .unreach => unreachable, + .dead => unreachable, + .embedded_in_code => unreachable, + .memory => unreachable, + .compare_flags_signed => unreachable, + .compare_flags_unsigned => unreachable, } - }, - .embedded_in_code, .memory, .stack_offset => { - return self.fail(src, "TODO implement x86 ADD/SUB/CMP source memory", .{}); - }, - .compare_flags_unsigned => { - return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (unsigned)", .{}); - }, - .compare_flags_signed => { - return self.fail(src, "TODO implement x86 ADD/SUB/CMP source compare flag (signed)", .{}); - }, - } - }, - .embedded_in_code, .memory, .stack_offset => { - return self.fail(src, "TODO implement x86 ADD/SUB/CMP destination memory", .{}); - }, - } - } - - fn genArg(self: *Function, inst: *ir.Inst.Arg) !MCValue { - const i = self.arg_index; - self.arg_index += 1; - return self.args[i]; - } - - fn genBreakpoint(self: *Function, src: usize, comptime arch: std.Target.Cpu.Arch) !MCValue { - switch (arch) { - .i386, .x86_64 => { - try self.code.append(0xcc); // int3 - }, - else => return self.fail(src, "TODO implement @breakpoint() for {}", .{self.target.cpu.arch}), - } - return .none; - } - - fn genCall(self: *Function, inst: *ir.Inst.Call, comptime arch: std.Target.Cpu.Arch) !MCValue { - const fn_ty = inst.args.func.ty; - const cc = fn_ty.fnCallingConvention(); - const param_types = try self.gpa.alloc(Type, fn_ty.fnParamLen()); - defer self.gpa.free(param_types); - fn_ty.fnParamTypes(param_types); - var mc_args = try self.gpa.alloc(MCValue, param_types.len); - defer self.gpa.free(mc_args); - const stack_byte_count = try self.resolveParameters(inst.base.src, cc, param_types, mc_args); - - switch (arch) { - .x86_64 => { - for (mc_args) |mc_arg, arg_i| { - const arg = inst.args.args[arg_i]; - const arg_mcv = try self.resolveInst(inst.args.args[arg_i]); - switch (mc_arg) { - .none => continue, - .register => |reg| { - try self.genSetReg(arg.src, arch, @intToEnum(Reg(arch), @intCast(u8, reg)), arg_mcv); - // TODO interact with the register allocator to mark the instruction as moved. 
- }, - .stack_offset => { - // Here we need to emit instructions like this: - // mov qword ptr [rsp + stack_offset], x - return self.fail(inst.base.src, "TODO implement calling with parameters in memory", .{}); - }, - .immediate => unreachable, - .unreach => unreachable, - .dead => unreachable, - .embedded_in_code => unreachable, - .memory => unreachable, - .compare_flags_signed => unreachable, - .compare_flags_unsigned => unreachable, } - } - if (inst.args.func.cast(ir.Inst.Constant)) |func_inst| { - if (func_inst.val.cast(Value.Payload.Function)) |func_val| { - const func = func_val.func; - const got = &self.bin_file.program_headers.items[self.bin_file.phdr_got_index.?]; - const ptr_bits = self.target.cpu.arch.ptrBitWidth(); - const ptr_bytes: u64 = @divExact(ptr_bits, 8); - const got_addr = @intCast(u32, got.p_vaddr + func.owner_decl.link.offset_table_index * ptr_bytes); - // ff 14 25 xx xx xx xx call [addr] - try self.code.ensureCapacity(self.code.items.len + 7); - self.code.appendSliceAssumeCapacity(&[3]u8{ 0xff, 0x14, 0x25 }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), got_addr); + if (inst.args.func.cast(ir.Inst.Constant)) |func_inst| { + if (func_inst.val.cast(Value.Payload.Function)) |func_val| { + const func = func_val.func; + const got = &self.bin_file.program_headers.items[self.bin_file.phdr_got_index.?]; + const ptr_bits = self.target.cpu.arch.ptrBitWidth(); + const ptr_bytes: u64 = @divExact(ptr_bits, 8); + const got_addr = @intCast(u32, got.p_vaddr + func.owner_decl.link.offset_table_index * ptr_bytes); + // ff 14 25 xx xx xx xx call [addr] + try self.code.ensureCapacity(self.code.items.len + 7); + self.code.appendSliceAssumeCapacity(&[3]u8{ 0xff, 0x14, 0x25 }); + mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), got_addr); + } else { + return self.fail(inst.base.src, "TODO implement calling bitcasted functions", .{}); + } } else { - return self.fail(inst.base.src, "TODO implement calling bitcasted functions", .{}); + return self.fail(inst.base.src, "TODO implement calling runtime known function pointer", .{}); } - } else { - return self.fail(inst.base.src, "TODO implement calling runtime known function pointer", .{}); - } - }, - else => return self.fail(inst.base.src, "TODO implement call for {}", .{self.target.cpu.arch}), - } - - const return_type = fn_ty.fnReturnType(); - switch (return_type.zigTypeTag()) { - .Void => return MCValue{ .none = {} }, - .NoReturn => return MCValue{ .unreach = {} }, - else => return self.fail(inst.base.src, "TODO implement fn call with non-void return value", .{}), - } - } - - fn ret(self: *Function, src: usize, comptime arch: std.Target.Cpu.Arch, mcv: MCValue) !MCValue { - if (mcv != .none) { - return self.fail(src, "TODO implement return with non-void operand", .{}); - } - switch (arch) { - .i386 => { - try self.code.append(0xc3); // ret - }, - .x86_64 => { - try self.code.appendSlice(&[_]u8{ - 0x5d, // pop rbp - 0xc3, // ret - }); - }, - else => return self.fail(src, "TODO implement return for {}", .{self.target.cpu.arch}), - } - return .unreach; - } - - fn genRet(self: *Function, inst: *ir.Inst.Ret, comptime arch: std.Target.Cpu.Arch) !MCValue { - const operand = try self.resolveInst(inst.args.operand); - return self.ret(inst.base.src, arch, operand); - } - - fn genRetVoid(self: *Function, inst: *ir.Inst.RetVoid, comptime arch: std.Target.Cpu.Arch) !MCValue { - return self.ret(inst.base.src, arch, .none); - } - - fn genCmp(self: *Function, inst: *ir.Inst.Cmp, comptime arch: std.Target.Cpu.Arch) 
!MCValue { - // No side effects, so if it's unreferenced, do nothing. - if (inst.base.isUnused()) - return MCValue.dead; - switch (arch) { - .x86_64 => { - try self.code.ensureCapacity(self.code.items.len + 8); - - const lhs = try self.resolveInst(inst.args.lhs); - const rhs = try self.resolveInst(inst.args.rhs); - - // There are 2 operands, destination and source. - // Either one, but not both, can be a memory operand. - // Source operand can be an immediate, 8 bits or 32 bits. - const dst_mcv = if (lhs.isImmediate() or (lhs.isMemory() and rhs.isMemory())) - try self.copyToNewRegister(inst.args.lhs) - else - lhs; - // This instruction supports only signed 32-bit immediates at most. - const src_mcv = try self.limitImmediateType(inst.args.rhs, i32); - - try self.genX8664BinMathCode(inst.base.src, dst_mcv, src_mcv, 7, 0x38); - const info = inst.args.lhs.ty.intInfo(self.target.*); - if (info.signed) { - return MCValue{ .compare_flags_signed = inst.args.op }; - } else { - return MCValue{ .compare_flags_unsigned = inst.args.op }; - } - }, - else => return self.fail(inst.base.src, "TODO implement cmp for {}", .{self.target.cpu.arch}), - } - } - - fn genCondBr(self: *Function, inst: *ir.Inst.CondBr, comptime arch: std.Target.Cpu.Arch) !MCValue { - switch (arch) { - .x86_64 => { - try self.code.ensureCapacity(self.code.items.len + 6); - - const cond = try self.resolveInst(inst.args.condition); - switch (cond) { - .compare_flags_signed => |cmp_op| { - // Here we map to the opposite opcode because the jump is to the false branch. - const opcode: u8 = switch (cmp_op) { - .gte => 0x8c, - .gt => 0x8e, - .neq => 0x84, - .lt => 0x8d, - .lte => 0x8f, - .eq => 0x85, - }; - return self.genX86CondBr(inst, opcode, arch); - }, - .compare_flags_unsigned => |cmp_op| { - // Here we map to the opposite opcode because the jump is to the false branch. 
- const opcode: u8 = switch (cmp_op) { - .gte => 0x82, - .gt => 0x86, - .neq => 0x84, - .lt => 0x83, - .lte => 0x87, - .eq => 0x85, - }; - return self.genX86CondBr(inst, opcode, arch); - }, - .register => |reg_usize| { - const reg = @intToEnum(Reg(arch), @intCast(u8, reg_usize)); - // test reg, 1 - // TODO detect al, ax, eax - try self.code.ensureCapacity(self.code.items.len + 4); - self.rex(.{ .b = reg.isExtended(), .w = reg.size() == 64 }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0xf6, - @as(u8, 0xC0) | (0 << 3) | @truncate(u3, reg.id()), - 0x01, - }); - return self.genX86CondBr(inst, 0x84, arch); - }, - else => return self.fail(inst.base.src, "TODO implement condbr {} when condition is {}", .{ self.target.cpu.arch, @tagName(cond) }), - } - }, - else => return self.fail(inst.base.src, "TODO implement condbr for {}", .{self.target.cpu.arch}), - } - } - - fn genX86CondBr(self: *Function, inst: *ir.Inst.CondBr, opcode: u8, comptime arch: std.Target.Cpu.Arch) !MCValue { - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode }); - const reloc = Reloc{ .rel32 = self.code.items.len }; - self.code.items.len += 4; - try self.genBody(inst.args.true_body, arch); - try self.performReloc(inst.base.src, reloc); - try self.genBody(inst.args.false_body, arch); - return MCValue.unreach; - } - - fn genIsNull(self: *Function, inst: *ir.Inst.IsNull, comptime arch: std.Target.Cpu.Arch) !MCValue { - switch (arch) { - else => return self.fail(inst.base.src, "TODO implement isnull for {}", .{self.target.cpu.arch}), - } - } - - fn genIsNonNull(self: *Function, inst: *ir.Inst.IsNonNull, comptime arch: std.Target.Cpu.Arch) !MCValue { - // Here you can specialize this instruction if it makes sense to, otherwise the default - // will call genIsNull and invert the result. - switch (arch) { - else => return self.fail(inst.base.src, "TODO call genIsNull and invert the result ", .{}), - } - } - - fn genBlock(self: *Function, inst: *ir.Inst.Block, comptime arch: std.Target.Cpu.Arch) !MCValue { - if (inst.base.ty.hasCodeGenBits()) { - return self.fail(inst.base.src, "TODO codegen Block with non-void type", .{}); - } - // A block is nothing but a setup to be able to jump to the end. - defer inst.codegen.relocs.deinit(self.gpa); - try self.genBody(inst.args.body, arch); - - for (inst.codegen.relocs.items) |reloc| try self.performReloc(inst.base.src, reloc); - - return MCValue.none; - } - - fn performReloc(self: *Function, src: usize, reloc: Reloc) !void { - switch (reloc) { - .rel32 => |pos| { - const amt = self.code.items.len - (pos + 4); - const s32_amt = std.math.cast(i32, amt) catch - return self.fail(src, "unable to perform relocation: jump too far", .{}); - mem.writeIntLittle(i32, self.code.items[pos..][0..4], s32_amt); - }, - } - } - - fn genBr(self: *Function, inst: *ir.Inst.Br, comptime arch: std.Target.Cpu.Arch) !MCValue { - if (!inst.args.operand.ty.hasCodeGenBits()) - return self.brVoid(inst.base.src, inst.args.block, arch); - - const operand = try self.resolveInst(inst.args.operand); - switch (arch) { - else => return self.fail(inst.base.src, "TODO implement br for {}", .{self.target.cpu.arch}), - } - } - - fn genBrVoid(self: *Function, inst: *ir.Inst.BrVoid, comptime arch: std.Target.Cpu.Arch) !MCValue { - return self.brVoid(inst.base.src, inst.args.block, arch); - } - - fn brVoid(self: *Function, src: usize, block: *ir.Inst.Block, comptime arch: std.Target.Cpu.Arch) !MCValue { - // Emit a jump with a relocation. It will be patched up after the block ends. 
- try block.codegen.relocs.ensureCapacity(self.gpa, block.codegen.relocs.items.len + 1); - - switch (arch) { - .i386, .x86_64 => { - // TODO optimization opportunity: figure out when we can emit this as a 2 byte instruction - // which is available if the jump is 127 bytes or less forward. - try self.code.resize(self.code.items.len + 5); - self.code.items[self.code.items.len - 5] = 0xe9; // jmp rel32 - // Leave the jump offset undefined - block.codegen.relocs.appendAssumeCapacity(.{ .rel32 = self.code.items.len - 4 }); - }, - else => return self.fail(src, "TODO implement brvoid for {}", .{self.target.cpu.arch}), - } - return .none; - } - - fn genAsm(self: *Function, inst: *ir.Inst.Assembly, comptime arch: Target.Cpu.Arch) !MCValue { - if (!inst.args.is_volatile and inst.base.isUnused()) - return MCValue.dead; - if (arch != .x86_64 and arch != .i386) { - return self.fail(inst.base.src, "TODO implement inline asm support for more architectures", .{}); - } - for (inst.args.inputs) |input, i| { - if (input.len < 3 or input[0] != '{' or input[input.len - 1] != '}') { - return self.fail(inst.base.src, "unrecognized asm input constraint: '{}'", .{input}); + }, + else => return self.fail(inst.base.src, "TODO implement call for {}", .{self.target.cpu.arch}), } - const reg_name = input[1 .. input.len - 1]; - const reg = parseRegName(arch, reg_name) orelse - return self.fail(inst.base.src, "unrecognized register: '{}'", .{reg_name}); - const arg = try self.resolveInst(inst.args.args[i]); - try self.genSetReg(inst.base.src, arch, reg, arg); - } - if (mem.eql(u8, inst.args.asm_source, "syscall")) { - try self.code.appendSlice(&[_]u8{ 0x0f, 0x05 }); - } else { - return self.fail(inst.base.src, "TODO implement support for more x86 assembly instructions", .{}); - } - - if (inst.args.output) |output| { - if (output.len < 4 or output[0] != '=' or output[1] != '{' or output[output.len - 1] != '}') { - return self.fail(inst.base.src, "unrecognized asm output constraint: '{}'", .{output}); + const return_type = fn_ty.fnReturnType(); + switch (return_type.zigTypeTag()) { + .Void => return MCValue{ .none = {} }, + .NoReturn => return MCValue{ .unreach = {} }, + else => return self.fail(inst.base.src, "TODO implement fn call with non-void return value", .{}), } - const reg_name = output[2 .. output.len - 1]; - const reg = parseRegName(arch, reg_name) orelse - return self.fail(inst.base.src, "unrecognized register: '{}'", .{reg_name}); - return MCValue{ .register = @enumToInt(reg) }; - } else { + } + + fn ret(self: *Self, src: usize, mcv: MCValue) !MCValue { + if (mcv != .none) { + return self.fail(src, "TODO implement return with non-void operand", .{}); + } + switch (arch) { + .i386 => { + try self.code.append(0xc3); // ret + }, + .x86_64 => { + try self.code.appendSlice(&[_]u8{ + 0x5d, // pop rbp + 0xc3, // ret + }); + }, + else => return self.fail(src, "TODO implement return for {}", .{self.target.cpu.arch}), + } + return .unreach; + } + + fn genRet(self: *Self, inst: *ir.Inst.Ret) !MCValue { + const operand = try self.resolveInst(inst.args.operand); + return self.ret(inst.base.src, operand); + } + + fn genRetVoid(self: *Self, inst: *ir.Inst.RetVoid) !MCValue { + return self.ret(inst.base.src, .none); + } + + fn genCmp(self: *Self, inst: *ir.Inst.Cmp) !MCValue { + // No side effects, so if it's unreferenced, do nothing. 
+ if (inst.base.isUnused()) + return MCValue.dead; + switch (arch) { + .x86_64 => { + try self.code.ensureCapacity(self.code.items.len + 8); + + const lhs = try self.resolveInst(inst.args.lhs); + const rhs = try self.resolveInst(inst.args.rhs); + + // There are 2 operands, destination and source. + // Either one, but not both, can be a memory operand. + // Source operand can be an immediate, 8 bits or 32 bits. + const dst_mcv = if (lhs.isImmediate() or (lhs.isMemory() and rhs.isMemory())) + try self.moveToNewRegister(inst.args.lhs) + else + lhs; + // This instruction supports only signed 32-bit immediates at most. + const src_mcv = try self.limitImmediateType(inst.args.rhs, i32); + + try self.genX8664BinMathCode(inst.base.src, dst_mcv, src_mcv, 7, 0x38); + const info = inst.args.lhs.ty.intInfo(self.target.*); + if (info.signed) { + return MCValue{ .compare_flags_signed = inst.args.op }; + } else { + return MCValue{ .compare_flags_unsigned = inst.args.op }; + } + }, + else => return self.fail(inst.base.src, "TODO implement cmp for {}", .{self.target.cpu.arch}), + } + } + + fn genCondBr(self: *Self, inst: *ir.Inst.CondBr) !MCValue { + switch (arch) { + .x86_64 => { + try self.code.ensureCapacity(self.code.items.len + 6); + + const cond = try self.resolveInst(inst.args.condition); + switch (cond) { + .compare_flags_signed => |cmp_op| { + // Here we map to the opposite opcode because the jump is to the false branch. + const opcode: u8 = switch (cmp_op) { + .gte => 0x8c, + .gt => 0x8e, + .neq => 0x84, + .lt => 0x8d, + .lte => 0x8f, + .eq => 0x85, + }; + return self.genX86CondBr(inst, opcode); + }, + .compare_flags_unsigned => |cmp_op| { + // Here we map to the opposite opcode because the jump is to the false branch. + const opcode: u8 = switch (cmp_op) { + .gte => 0x82, + .gt => 0x86, + .neq => 0x84, + .lt => 0x83, + .lte => 0x87, + .eq => 0x85, + }; + return self.genX86CondBr(inst, opcode); + }, + .register => |reg| { + // test reg, 1 + // TODO detect al, ax, eax + try self.code.ensureCapacity(self.code.items.len + 4); + self.rex(.{ .b = reg.isExtended(), .w = reg.size() == 64 }); + self.code.appendSliceAssumeCapacity(&[_]u8{ + 0xf6, + @as(u8, 0xC0) | (0 << 3) | @truncate(u3, reg.id()), + 0x01, + }); + return self.genX86CondBr(inst, 0x84); + }, + else => return self.fail(inst.base.src, "TODO implement condbr {} when condition is {}", .{ self.target.cpu.arch, @tagName(cond) }), + } + }, + else => return self.fail(inst.base.src, "TODO implement condbr for {}", .{self.target.cpu.arch}), + } + } + + fn genX86CondBr(self: *Self, inst: *ir.Inst.CondBr, opcode: u8) !MCValue { + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode }); + const reloc = Reloc{ .rel32 = self.code.items.len }; + self.code.items.len += 4; + try self.genBody(inst.args.true_body); + try self.performReloc(inst.base.src, reloc); + try self.genBody(inst.args.false_body); + return MCValue.unreach; + } + + fn genIsNull(self: *Self, inst: *ir.Inst.IsNull) !MCValue { + switch (arch) { + else => return self.fail(inst.base.src, "TODO implement isnull for {}", .{self.target.cpu.arch}), + } + } + + fn genIsNonNull(self: *Self, inst: *ir.Inst.IsNonNull) !MCValue { + // Here you can specialize this instruction if it makes sense to, otherwise the default + // will call genIsNull and invert the result. 
+ switch (arch) { + else => return self.fail(inst.base.src, "TODO call genIsNull and invert the result ", .{}), + } + } + + fn genBlock(self: *Self, inst: *ir.Inst.Block) !MCValue { + if (inst.base.ty.hasCodeGenBits()) { + return self.fail(inst.base.src, "TODO codegen Block with non-void type", .{}); + } + // A block is nothing but a setup to be able to jump to the end. + defer inst.codegen.relocs.deinit(self.gpa); + try self.genBody(inst.args.body); + + for (inst.codegen.relocs.items) |reloc| try self.performReloc(inst.base.src, reloc); + return MCValue.none; } - } - /// Encodes a REX prefix as specified, and appends it to the instruction - /// stream. This only modifies the instruction stream if at least one bit - /// is set true, which has a few implications: - /// - /// * The length of the instruction buffer will be modified *if* the - /// resulting REX is meaningful, but will remain the same if it is not. - /// * Deliberately inserting a "meaningless REX" requires explicit usage of - /// 0x40, and cannot be done via this function. - fn rex(self: *Function, arg: struct { b: bool = false, w: bool = false, x: bool = false, r: bool = false }) void { - // From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB. - var value: u8 = 0x40; - if (arg.b) { - value |= 0x1; + fn performReloc(self: *Self, src: usize, reloc: Reloc) !void { + switch (reloc) { + .rel32 => |pos| { + const amt = self.code.items.len - (pos + 4); + const s32_amt = std.math.cast(i32, amt) catch + return self.fail(src, "unable to perform relocation: jump too far", .{}); + mem.writeIntLittle(i32, self.code.items[pos..][0..4], s32_amt); + }, + } } - if (arg.x) { - value |= 0x2; - } - if (arg.r) { - value |= 0x4; - } - if (arg.w) { - value |= 0x8; - } - if (value != 0x40) { - self.code.appendAssumeCapacity(value); - } - } - fn genSetReg(self: *Function, src: usize, comptime arch: Target.Cpu.Arch, reg: Reg(arch), mcv: MCValue) error{ CodegenFail, OutOfMemory }!void { - switch (arch) { - .x86_64 => switch (mcv) { - .dead => unreachable, - .none => unreachable, - .unreach => unreachable, - .compare_flags_unsigned => |op| { - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .b = reg.isExtended(), .w = reg.size() == 64 }); - const opcode: u8 = switch (op) { - .gte => 0x93, - .gt => 0x97, - .neq => 0x95, - .lt => 0x92, - .lte => 0x96, - .eq => 0x94, - }; - const id = @as(u8, reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode, 0xC0 | id }); + fn genBr(self: *Self, inst: *ir.Inst.Br) !MCValue { + if (!inst.args.operand.ty.hasCodeGenBits()) + return self.brVoid(inst.base.src, inst.args.block); + + const operand = try self.resolveInst(inst.args.operand); + switch (arch) { + else => return self.fail(inst.base.src, "TODO implement br for {}", .{self.target.cpu.arch}), + } + } + + fn genBrVoid(self: *Self, inst: *ir.Inst.BrVoid) !MCValue { + return self.brVoid(inst.base.src, inst.args.block); + } + + fn brVoid(self: *Self, src: usize, block: *ir.Inst.Block) !MCValue { + // Emit a jump with a relocation. It will be patched up after the block ends. + try block.codegen.relocs.ensureCapacity(self.gpa, block.codegen.relocs.items.len + 1); + + switch (arch) { + .i386, .x86_64 => { + // TODO optimization opportunity: figure out when we can emit this as a 2 byte instruction + // which is available if the jump is 127 bytes or less forward. 
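+                    // For now we always emit the 5-byte form: 0xe9 followed by a little-endian
+                    // i32 displacement measured from the end of the instruction. The displacement
+                    // is left undefined here and patched later by performReloc.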
+ try self.code.resize(self.code.items.len + 5); + self.code.items[self.code.items.len - 5] = 0xe9; // jmp rel32 + // Leave the jump offset undefined + block.codegen.relocs.appendAssumeCapacity(.{ .rel32 = self.code.items.len - 4 }); }, - .compare_flags_signed => |op| { - return self.fail(src, "TODO set register with compare flags value (signed)", .{}); - }, - .immediate => |x| { - if (reg.size() != 64) { - return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); - } - // 32-bit moves zero-extend to 64-bit, so xoring the 32-bit - // register is the fastest way to zero a register. - if (x == 0) { - // The encoding for `xor r32, r32` is `0x31 /r`. - // Section 3.1.1.1 of the Intel x64 Manual states that "/r indicates that the - // ModR/M byte of the instruction contains a register operand and an r/m operand." - // - // R/M bytes are composed of two bits for the mode, then three bits for the register, - // then three bits for the operand. Since we're zeroing a register, the two three-bit - // values will be identical, and the mode is three (the raw register value). - // - // If we're accessing e.g. r8d, we need to use a REX prefix before the actual operation. Since - // this is a 32-bit operation, the W flag is set to zero. X is also zero, as we're not using a SIB. - // Both R and B are set, as we're extending, in effect, the register bits *and* the operand. + else => return self.fail(src, "TODO implement brvoid for {}", .{self.target.cpu.arch}), + } + return .none; + } + + fn genAsm(self: *Self, inst: *ir.Inst.Assembly) !MCValue { + if (!inst.args.is_volatile and inst.base.isUnused()) + return MCValue.dead; + if (arch != .x86_64 and arch != .i386) { + return self.fail(inst.base.src, "TODO implement inline asm support for more architectures", .{}); + } + for (inst.args.inputs) |input, i| { + if (input.len < 3 or input[0] != '{' or input[input.len - 1] != '}') { + return self.fail(inst.base.src, "unrecognized asm input constraint: '{}'", .{input}); + } + const reg_name = input[1 .. input.len - 1]; + const reg = parseRegName(reg_name) orelse + return self.fail(inst.base.src, "unrecognized register: '{}'", .{reg_name}); + const arg = try self.resolveInst(inst.args.args[i]); + try self.genSetReg(inst.base.src, reg, arg); + } + + if (mem.eql(u8, inst.args.asm_source, "syscall")) { + try self.code.appendSlice(&[_]u8{ 0x0f, 0x05 }); + } else { + return self.fail(inst.base.src, "TODO implement support for more x86 assembly instructions", .{}); + } + + if (inst.args.output) |output| { + if (output.len < 4 or output[0] != '=' or output[1] != '{' or output[output.len - 1] != '}') { + return self.fail(inst.base.src, "unrecognized asm output constraint: '{}'", .{output}); + } + const reg_name = output[2 .. output.len - 1]; + const reg = parseRegName(reg_name) orelse + return self.fail(inst.base.src, "unrecognized register: '{}'", .{reg_name}); + return MCValue{ .register = reg }; + } else { + return MCValue.none; + } + } + + /// Encodes a REX prefix as specified, and appends it to the instruction + /// stream. This only modifies the instruction stream if at least one bit + /// is set true, which has a few implications: + /// + /// * The length of the instruction buffer will be modified *if* the + /// resulting REX is meaningful, but will remain the same if it is not. + /// * Deliberately inserting a "meaningless REX" requires explicit usage of + /// 0x40, and cannot be done via this function. 
+ fn rex(self: *Self, arg: struct { b: bool = false, w: bool = false, x: bool = false, r: bool = false }) void { + // From section 2.2.1.2 of the manual, REX is encoded as b0100WRXB. + var value: u8 = 0x40; + if (arg.b) { + value |= 0x1; + } + if (arg.x) { + value |= 0x2; + } + if (arg.r) { + value |= 0x4; + } + if (arg.w) { + value |= 0x8; + } + if (value != 0x40) { + self.code.appendAssumeCapacity(value); + } + } + + fn genSetReg(self: *Self, src: usize, reg: Reg, mcv: MCValue) error{ CodegenFail, OutOfMemory }!void { + switch (arch) { + .x86_64 => switch (mcv) { + .dead => unreachable, + .none => unreachable, + .unreach => unreachable, + .compare_flags_unsigned => |op| { try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .r = reg.isExtended(), .b = reg.isExtended() }); + self.rex(.{ .b = reg.isExtended(), .w = reg.size() == 64 }); + const opcode: u8 = switch (op) { + .gte => 0x93, + .gt => 0x97, + .neq => 0x95, + .lt => 0x92, + .lte => 0x96, + .eq => 0x94, + }; const id = @as(u8, reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x31, 0xC0 | id << 3 | id }); - return; - } - if (x <= std.math.maxInt(u32)) { - // Next best case: if we set the lower four bytes, the upper four will be zeroed. - // - // The encoding for `mov IMM32 -> REG` is (0xB8 + R) IMM. - if (reg.isExtended()) { - // Just as with XORing, we need a REX prefix. This time though, we only - // need the B bit set, as we're extending the opcode's register field, - // and there is no Mod R/M byte. - // - // Thus, we need b01000001, or 0x41. - try self.code.resize(self.code.items.len + 6); - self.code.items[self.code.items.len - 6] = 0x41; - } else { - try self.code.resize(self.code.items.len + 5); + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode, 0xC0 | id }); + }, + .compare_flags_signed => |op| { + return self.fail(src, "TODO set register with compare flags value (signed)", .{}); + }, + .immediate => |x| { + if (reg.size() != 64) { + return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); } - self.code.items[self.code.items.len - 5] = 0xB8 | @as(u8, reg.id() & 0b111); - const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; - mem.writeIntLittle(u32, imm_ptr, @intCast(u32, x)); - return; - } - // Worst case: we need to load the 64-bit register with the IMM. GNU's assemblers calls - // this `movabs`, though this is officially just a different variant of the plain `mov` - // instruction. - // - // This encoding is, in fact, the *same* as the one used for 32-bit loads. The only - // difference is that we set REX.W before the instruction, which extends the load to - // 64-bit and uses the full bit-width of the register. - // - // Since we always need a REX here, let's just check if we also need to set REX.B. - // - // In this case, the encoding of the REX byte is 0b0100100B - try self.code.ensureCapacity(self.code.items.len + 10); - self.rex(.{ .w = true, .b = reg.isExtended() }); - self.code.items.len += 9; - self.code.items[self.code.items.len - 9] = 0xB8 | @as(u8, reg.id() & 0b111); - const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; - mem.writeIntLittle(u64, imm_ptr, x); - }, - .embedded_in_code => |code_offset| { - if (reg.size() != 64) { - return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); - } - // We need the offset from RIP in a signed i32 twos complement. - // The instruction is 7 bytes long and RIP points to the next instruction. 
- try self.code.ensureCapacity(self.code.items.len + 7); - // 64-bit LEA is encoded as REX.W 8D /r. If the register is extended, the REX byte is modified, - // but the operation size is unchanged. Since we're using a disp32, we want mode 0 and lower three - // bits as five. - // REX 0x8D 0b00RRR101, where RRR is the lower three bits of the id. - self.rex(.{ .w = true, .b = reg.isExtended() }); - self.code.items.len += 6; - const rip = self.code.items.len; - const big_offset = @intCast(i64, code_offset) - @intCast(i64, rip); - const offset = @intCast(i32, big_offset); - self.code.items[self.code.items.len - 6] = 0x8D; - self.code.items[self.code.items.len - 5] = 0b101 | (@as(u8, reg.id() & 0b111) << 3); - const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; - mem.writeIntLittle(i32, imm_ptr, offset); - }, - .register => |r| { - if (reg.size() != 64) { - return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); - } - const src_reg = @intToEnum(Reg(arch), @intCast(u8, r)); - // This is a variant of 8B /r. Since we're using 64-bit moves, we require a REX. - // This is thus three bytes: REX 0x8B R/M. - // If the destination is extended, the R field must be 1. - // If the *source* is extended, the B field must be 1. - // Since the register is being accessed directly, the R/M mode is three. The reg field (the middle - // three bits) contain the destination, and the R/M field (the lower three bits) contain the source. - try self.code.ensureCapacity(self.code.items.len + 3); - self.rex(.{ .w = true, .r = reg.isExtended(), .b = src_reg.isExtended() }); - const R = 0xC0 | (@as(u8, reg.id() & 0b111) << 3) | @as(u8, src_reg.id() & 0b111); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, R }); - }, - .memory => |x| { - if (reg.size() != 64) { - return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); - } - if (x <= std.math.maxInt(u32)) { - // Moving from memory to a register is a variant of `8B /r`. - // Since we're using 64-bit moves, we require a REX. - // This variant also requires a SIB, as it would otherwise be RIP-relative. - // We want mode zero with the lower three bits set to four to indicate an SIB with no other displacement. - // The SIB must be 0x25, to indicate a disp32 with no scaled index. - // 0b00RRR100, where RRR is the lower three bits of the register ID. - // The instruction is thus eight bytes; REX 0x8B 0b00RRR100 0x25 followed by a four-byte disp32. - try self.code.ensureCapacity(self.code.items.len + 8); - self.rex(.{ .w = true, .b = reg.isExtended() }); - self.code.appendSliceAssumeCapacity(&[_]u8{ - 0x8B, - 0x04 | (@as(u8, reg.id() & 0b111) << 3), // R - 0x25, - }); - mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), @intCast(u32, x)); - } else { - // If this is RAX, we can use a direct load; otherwise, we need to load the address, then indirectly load - // the value. - if (reg.id() == 0) { - // REX.W 0xA1 moffs64* - // moffs64* is a 64-bit offset "relative to segment base", which really just means the - // absolute address for all practical purposes. 
- try self.code.resize(self.code.items.len + 10); - // REX.W == 0x48 - self.code.items[self.code.items.len - 10] = 0x48; - self.code.items[self.code.items.len - 9] = 0xA1; - const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; - mem.writeIntLittle(u64, imm_ptr, x); - } else { - // This requires two instructions; a move imm as used above, followed by an indirect load using the register - // as the address and the register as the destination. + // 32-bit moves zero-extend to 64-bit, so xoring the 32-bit + // register is the fastest way to zero a register. + if (x == 0) { + // The encoding for `xor r32, r32` is `0x31 /r`. + // Section 3.1.1.1 of the Intel x64 Manual states that "/r indicates that the + // ModR/M byte of the instruction contains a register operand and an r/m operand." // - // This cannot be used if the lower three bits of the id are equal to four or five, as there - // is no way to possibly encode it. This means that RSP, RBP, R12, and R13 cannot be used with - // this instruction. - const id3 = @truncate(u3, reg.id()); - std.debug.assert(id3 != 4 and id3 != 5); - - // Rather than duplicate the logic used for the move, we just use a self-call with a new MCValue. - try self.genSetReg(src, arch, reg, MCValue{ .immediate = x }); - - // Now, the register contains the address of the value to load into it - // Currently, we're only allowing 64-bit registers, so we need the `REX.W 8B /r` variant. - // TODO: determine whether to allow other sized registers, and if so, handle them properly. - // This operation requires three bytes: REX 0x8B R/M + // R/M bytes are composed of two bits for the mode, then three bits for the register, + // then three bits for the operand. Since we're zeroing a register, the two three-bit + // values will be identical, and the mode is three (the raw register value). + // + // If we're accessing e.g. r8d, we need to use a REX prefix before the actual operation. Since + // this is a 32-bit operation, the W flag is set to zero. X is also zero, as we're not using a SIB. + // Both R and B are set, as we're extending, in effect, the register bits *and* the operand. try self.code.ensureCapacity(self.code.items.len + 3); - // For this operation, we want R/M mode *zero* (use register indirectly), and the two register - // values must match. Thus, it's 00ABCABC where ABC is the lower three bits of the register ID. - // - // Furthermore, if this is an extended register, both B and R must be set in the REX byte, as *both* - // register operands need to be marked as extended. 
- self.rex(.{ .w = true, .b = reg.isExtended(), .r = reg.isExtended() }); - const RM = (@as(u8, reg.id() & 0b111) << 3) | @truncate(u3, reg.id()); - self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, RM }); + self.rex(.{ .r = reg.isExtended(), .b = reg.isExtended() }); + const id = @as(u8, reg.id() & 0b111); + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x31, 0xC0 | id << 3 | id }); + return; } - } - }, - .stack_offset => |off| { - return self.fail(src, "TODO implement genSetReg for stack variables", .{}); - }, - }, - else => return self.fail(src, "TODO implement genSetReg for more architectures", .{}), - } - } - - fn genPtrToInt(self: *Function, inst: *ir.Inst.PtrToInt) !MCValue { - // no-op - return self.resolveInst(inst.args.ptr); - } - - fn genBitCast(self: *Function, inst: *ir.Inst.BitCast) !MCValue { - const operand = try self.resolveInst(inst.args.operand); - return operand; - } - - fn resolveInst(self: *Function, inst: *ir.Inst) !MCValue { - // Constants have static lifetimes, so they are always memoized in the outer most table. - if (inst.cast(ir.Inst.Constant)) |const_inst| { - const branch = &self.branch_stack.items[0]; - const gop = try branch.inst_table.getOrPut(self.gpa, inst); - if (!gop.found_existing) { - gop.entry.value = try self.genTypedValue(inst.src, .{ .ty = inst.ty, .val = const_inst.val }); - } - return gop.entry.value; - } - - // Treat each stack item as a "layer" on top of the previous one. - var i: usize = self.branch_stack.items.len; - while (true) { - i -= 1; - if (self.branch_stack.items[i].inst_table.get(inst)) |mcv| { - return mcv; - } - } - } - - fn copyToNewRegister(self: *Function, inst: *ir.Inst) !MCValue { - return self.fail(inst.src, "TODO implement copyToNewRegister", .{}); - } - - /// If the MCValue is an immediate, and it does not fit within this type, - /// we put it in a register. - /// A potential opportunity for future optimization here would be keeping track - /// of the fact that the instruction is available both as an immediate - /// and as a register. - fn limitImmediateType(self: *Function, inst: *ir.Inst, comptime T: type) !MCValue { - const mcv = try self.resolveInst(inst); - const ti = @typeInfo(T).Int; - switch (mcv) { - .immediate => |imm| { - // This immediate is unsigned. - const U = @Type(.{ - .Int = .{ - .bits = ti.bits - @boolToInt(ti.is_signed), - .is_signed = false, + if (x <= std.math.maxInt(u32)) { + // Next best case: if we set the lower four bytes, the upper four will be zeroed. + // + // The encoding for `mov IMM32 -> REG` is (0xB8 + R) IMM. + if (reg.isExtended()) { + // Just as with XORing, we need a REX prefix. This time though, we only + // need the B bit set, as we're extending the opcode's register field, + // and there is no Mod R/M byte. + // + // Thus, we need b01000001, or 0x41. + try self.code.resize(self.code.items.len + 6); + self.code.items[self.code.items.len - 6] = 0x41; + } else { + try self.code.resize(self.code.items.len + 5); + } + self.code.items[self.code.items.len - 5] = 0xB8 | @as(u8, reg.id() & 0b111); + const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; + mem.writeIntLittle(u32, imm_ptr, @intCast(u32, x)); + return; + } + // Worst case: we need to load the 64-bit register with the IMM. GNU's assemblers calls + // this `movabs`, though this is officially just a different variant of the plain `mov` + // instruction. + // + // This encoding is, in fact, the *same* as the one used for 32-bit loads. 
The only + // difference is that we set REX.W before the instruction, which extends the load to + // 64-bit and uses the full bit-width of the register. + // + // Since we always need a REX here, let's just check if we also need to set REX.B. + // + // In this case, the encoding of the REX byte is 0b0100100B + try self.code.ensureCapacity(self.code.items.len + 10); + self.rex(.{ .w = true, .b = reg.isExtended() }); + self.code.items.len += 9; + self.code.items[self.code.items.len - 9] = 0xB8 | @as(u8, reg.id() & 0b111); + const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; + mem.writeIntLittle(u64, imm_ptr, x); }, - }); - if (imm >= std.math.maxInt(U)) { - return self.copyToNewRegister(inst); - } - }, - else => {}, - } - return mcv; - } - - fn genTypedValue(self: *Function, src: usize, typed_value: TypedValue) !MCValue { - const ptr_bits = self.target.cpu.arch.ptrBitWidth(); - const ptr_bytes: u64 = @divExact(ptr_bits, 8); - switch (typed_value.ty.zigTypeTag()) { - .Pointer => { - if (typed_value.val.cast(Value.Payload.DeclRef)) |payload| { - const got = &self.bin_file.program_headers.items[self.bin_file.phdr_got_index.?]; - const decl = payload.decl; - const got_addr = got.p_vaddr + decl.link.offset_table_index * ptr_bytes; - return MCValue{ .memory = got_addr }; - } - return self.fail(src, "TODO codegen more kinds of const pointers", .{}); - }, - .Int => { - const info = typed_value.ty.intInfo(self.target.*); - if (info.bits > ptr_bits or info.signed) { - return self.fail(src, "TODO const int bigger than ptr and signed int", .{}); - } - return MCValue{ .immediate = typed_value.val.toUnsignedInt() }; - }, - .Bool => { - return MCValue{ .immediate = @boolToInt(typed_value.val.toBool()) }; - }, - .ComptimeInt => unreachable, // semantic analysis prevents this - .ComptimeFloat => unreachable, // semantic analysis prevents this - else => return self.fail(src, "TODO implement const of type '{}'", .{typed_value.ty}), - } - } - - fn resolveParameters( - self: *Function, - src: usize, - cc: std.builtin.CallingConvention, - param_types: []const Type, - results: []MCValue, - ) !u32 { - switch (self.target.cpu.arch) { - .x86_64 => { - switch (cc) { - .Naked => { - assert(results.len == 0); - return 0; + .embedded_in_code => |code_offset| { + if (reg.size() != 64) { + return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); + } + // We need the offset from RIP in a signed i32 twos complement. + // The instruction is 7 bytes long and RIP points to the next instruction. + try self.code.ensureCapacity(self.code.items.len + 7); + // 64-bit LEA is encoded as REX.W 8D /r. If the register is extended, the REX byte is modified, + // but the operation size is unchanged. Since we're using a disp32, we want mode 0 and lower three + // bits as five. + // REX 0x8D 0b00RRR101, where RRR is the lower three bits of the id. 
+ self.rex(.{ .w = true, .b = reg.isExtended() }); + self.code.items.len += 6; + const rip = self.code.items.len; + const big_offset = @intCast(i64, code_offset) - @intCast(i64, rip); + const offset = @intCast(i32, big_offset); + self.code.items[self.code.items.len - 6] = 0x8D; + self.code.items[self.code.items.len - 5] = 0b101 | (@as(u8, reg.id() & 0b111) << 3); + const imm_ptr = self.code.items[self.code.items.len - 4 ..][0..4]; + mem.writeIntLittle(i32, imm_ptr, offset); }, - .Unspecified, .C => { - var next_int_reg: usize = 0; - var next_stack_offset: u32 = 0; + .register => |src_reg| { + if (reg.size() != 64) { + return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); + } + // This is a variant of 8B /r. Since we're using 64-bit moves, we require a REX. + // This is thus three bytes: REX 0x8B R/M. + // If the destination is extended, the R field must be 1. + // If the *source* is extended, the B field must be 1. + // Since the register is being accessed directly, the R/M mode is three. The reg field (the middle + // three bits) contain the destination, and the R/M field (the lower three bits) contain the source. + try self.code.ensureCapacity(self.code.items.len + 3); + self.rex(.{ .w = true, .r = reg.isExtended(), .b = src_reg.isExtended() }); + const R = 0xC0 | (@as(u8, reg.id() & 0b111) << 3) | @as(u8, src_reg.id() & 0b111); + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, R }); + }, + .memory => |x| { + if (reg.size() != 64) { + return self.fail(src, "TODO decide whether to implement non-64-bit loads", .{}); + } + if (x <= std.math.maxInt(u32)) { + // Moving from memory to a register is a variant of `8B /r`. + // Since we're using 64-bit moves, we require a REX. + // This variant also requires a SIB, as it would otherwise be RIP-relative. + // We want mode zero with the lower three bits set to four to indicate an SIB with no other displacement. + // The SIB must be 0x25, to indicate a disp32 with no scaled index. + // 0b00RRR100, where RRR is the lower three bits of the register ID. + // The instruction is thus eight bytes; REX 0x8B 0b00RRR100 0x25 followed by a four-byte disp32. + try self.code.ensureCapacity(self.code.items.len + 8); + self.rex(.{ .w = true, .b = reg.isExtended() }); + self.code.appendSliceAssumeCapacity(&[_]u8{ + 0x8B, + 0x04 | (@as(u8, reg.id() & 0b111) << 3), // R + 0x25, + }); + mem.writeIntLittle(u32, self.code.addManyAsArrayAssumeCapacity(4), @intCast(u32, x)); + } else { + // If this is RAX, we can use a direct load; otherwise, we need to load the address, then indirectly load + // the value. + if (reg.id() == 0) { + // REX.W 0xA1 moffs64* + // moffs64* is a 64-bit offset "relative to segment base", which really just means the + // absolute address for all practical purposes. + try self.code.resize(self.code.items.len + 10); + // REX.W == 0x48 + self.code.items[self.code.items.len - 10] = 0x48; + self.code.items[self.code.items.len - 9] = 0xA1; + const imm_ptr = self.code.items[self.code.items.len - 8 ..][0..8]; + mem.writeIntLittle(u64, imm_ptr, x); + } else { + // This requires two instructions; a move imm as used above, followed by an indirect load using the register + // as the address and the register as the destination. + // + // This cannot be used if the lower three bits of the id are equal to four or five, as there + // is no way to possibly encode it. This means that RSP, RBP, R12, and R13 cannot be used with + // this instruction. 
+ const id3 = @truncate(u3, reg.id()); + std.debug.assert(id3 != 4 and id3 != 5); - const integer_registers = [_]Reg(.x86_64){ .rdi, .rsi, .rdx, .rcx, .r8, .r9 }; - for (param_types) |ty, i| { - switch (ty.zigTypeTag()) { - .Bool, .Int => { - if (next_int_reg >= integer_registers.len) { - results[i] = .{ .stack_offset = next_stack_offset }; - next_stack_offset += @intCast(u32, ty.abiSize(self.target.*)); - } else { - results[i] = .{ .register = @enumToInt(integer_registers[next_int_reg]) }; - next_int_reg += 1; - } - }, - else => return self.fail(src, "TODO implement function parameters of type {}", .{@tagName(ty.zigTypeTag())}), + // Rather than duplicate the logic used for the move, we just use a self-call with a new MCValue. + try self.genSetReg(src, reg, MCValue{ .immediate = x }); + + // Now, the register contains the address of the value to load into it + // Currently, we're only allowing 64-bit registers, so we need the `REX.W 8B /r` variant. + // TODO: determine whether to allow other sized registers, and if so, handle them properly. + // This operation requires three bytes: REX 0x8B R/M + try self.code.ensureCapacity(self.code.items.len + 3); + // For this operation, we want R/M mode *zero* (use register indirectly), and the two register + // values must match. Thus, it's 00ABCABC where ABC is the lower three bits of the register ID. + // + // Furthermore, if this is an extended register, both B and R must be set in the REX byte, as *both* + // register operands need to be marked as extended. + self.rex(.{ .w = true, .b = reg.isExtended(), .r = reg.isExtended() }); + const RM = (@as(u8, reg.id() & 0b111) << 3) | @truncate(u3, reg.id()); + self.code.appendSliceAssumeCapacity(&[_]u8{ 0x8B, RM }); } } - return next_stack_offset; }, - else => return self.fail(src, "TODO implement function parameters for {}", .{cc}), - } - }, - else => return self.fail(src, "TODO implement C ABI support for {}", .{self.target.cpu.arch}), + .stack_offset => |off| { + return self.fail(src, "TODO implement genSetReg for stack variables", .{}); + }, + }, + else => return self.fail(src, "TODO implement genSetReg for more architectures", .{}), + } } - } - fn fail(self: *Function, src: usize, comptime format: []const u8, args: anytype) error{ CodegenFail, OutOfMemory } { - @setCold(true); - assert(self.err_msg == null); - self.err_msg = try ErrorMsg.create(self.bin_file.allocator, src, format, args); - return error.CodegenFail; - } -}; + fn genPtrToInt(self: *Self, inst: *ir.Inst.PtrToInt) !MCValue { + // no-op + return self.resolveInst(inst.args.ptr); + } -const x86_64 = @import("codegen/x86_64.zig"); -const x86 = @import("codegen/x86.zig"); + fn genBitCast(self: *Self, inst: *ir.Inst.BitCast) !MCValue { + const operand = try self.resolveInst(inst.args.operand); + return operand; + } -fn Reg(comptime arch: Target.Cpu.Arch) type { - return switch (arch) { - .i386 => x86.Register, - .x86_64 => x86_64.Register, - else => @compileError("TODO add more register enums"), + fn resolveInst(self: *Self, inst: *ir.Inst) !MCValue { + // Constants have static lifetimes, so they are always memoized in the outer most table. + if (inst.cast(ir.Inst.Constant)) |const_inst| { + const branch = &self.branch_stack.items[0]; + const gop = try branch.inst_table.getOrPut(self.gpa, inst); + if (!gop.found_existing) { + gop.entry.value = try self.genTypedValue(inst.src, .{ .ty = inst.ty, .val = const_inst.val }); + } + return gop.entry.value; + } + + // Treat each stack item as a "layer" on top of the previous one. 
+ var i: usize = self.branch_stack.items.len; + while (true) { + i -= 1; + if (self.branch_stack.items[i].inst_table.get(inst)) |mcv| { + return mcv; + } + } + } + + fn moveToNewRegister(self: *Self, inst: *ir.Inst) !MCValue { + const branch = &self.branch_stack.items[self.branch_stack.items.len - 1]; + return self.fail(inst.src, "TODO implement moveToNewRegister", .{}); + } + + /// If the MCValue is an immediate, and it does not fit within this type, + /// we put it in a register. + /// A potential opportunity for future optimization here would be keeping track + /// of the fact that the instruction is available both as an immediate + /// and as a register. + fn limitImmediateType(self: *Self, inst: *ir.Inst, comptime T: type) !MCValue { + const mcv = try self.resolveInst(inst); + const ti = @typeInfo(T).Int; + switch (mcv) { + .immediate => |imm| { + // This immediate is unsigned. + const U = @Type(.{ + .Int = .{ + .bits = ti.bits - @boolToInt(ti.is_signed), + .is_signed = false, + }, + }); + if (imm >= std.math.maxInt(U)) { + return self.moveToNewRegister(inst); + } + }, + else => {}, + } + return mcv; + } + + fn genTypedValue(self: *Self, src: usize, typed_value: TypedValue) !MCValue { + const ptr_bits = self.target.cpu.arch.ptrBitWidth(); + const ptr_bytes: u64 = @divExact(ptr_bits, 8); + switch (typed_value.ty.zigTypeTag()) { + .Pointer => { + if (typed_value.val.cast(Value.Payload.DeclRef)) |payload| { + const got = &self.bin_file.program_headers.items[self.bin_file.phdr_got_index.?]; + const decl = payload.decl; + const got_addr = got.p_vaddr + decl.link.offset_table_index * ptr_bytes; + return MCValue{ .memory = got_addr }; + } + return self.fail(src, "TODO codegen more kinds of const pointers", .{}); + }, + .Int => { + const info = typed_value.ty.intInfo(self.target.*); + if (info.bits > ptr_bits or info.signed) { + return self.fail(src, "TODO const int bigger than ptr and signed int", .{}); + } + return MCValue{ .immediate = typed_value.val.toUnsignedInt() }; + }, + .Bool => { + return MCValue{ .immediate = @boolToInt(typed_value.val.toBool()) }; + }, + .ComptimeInt => unreachable, // semantic analysis prevents this + .ComptimeFloat => unreachable, // semantic analysis prevents this + else => return self.fail(src, "TODO implement const of type '{}'", .{typed_value.ty}), + } + } + + fn resolveParameters( + self: *Self, + src: usize, + cc: std.builtin.CallingConvention, + param_types: []const Type, + results: []MCValue, + ) !u32 { + switch (arch) { + .x86_64 => { + switch (cc) { + .Naked => { + assert(results.len == 0); + return 0; + }, + .Unspecified, .C => { + var next_int_reg: usize = 0; + var next_stack_offset: u32 = 0; + + const integer_registers = [_]Reg{ .rdi, .rsi, .rdx, .rcx, .r8, .r9 }; + for (param_types) |ty, i| { + switch (ty.zigTypeTag()) { + .Bool, .Int => { + if (next_int_reg >= integer_registers.len) { + results[i] = .{ .stack_offset = next_stack_offset }; + next_stack_offset += @intCast(u32, ty.abiSize(self.target.*)); + } else { + results[i] = .{ .register = integer_registers[next_int_reg] }; + next_int_reg += 1; + } + }, + else => return self.fail(src, "TODO implement function parameters of type {}", .{@tagName(ty.zigTypeTag())}), + } + } + return next_stack_offset; + }, + else => return self.fail(src, "TODO implement function parameters for {}", .{cc}), + } + }, + else => return self.fail(src, "TODO implement C ABI support for {}", .{self.target.cpu.arch}), + } + } + + fn fail(self: *Self, src: usize, comptime format: []const u8, args: anytype) error{ 
CodegenFail, OutOfMemory } { + @setCold(true); + assert(self.err_msg == null); + self.err_msg = try ErrorMsg.create(self.bin_file.allocator, src, format, args); + return error.CodegenFail; + } + + const Reg = switch (arch) { + .i386 => x86.Register, + .x86_64 => x86_64.Register, + else => enum { dummy }, + }; + + fn parseRegName(name: []const u8) ?Reg { + return std.meta.stringToEnum(Reg, name); + } }; } - -fn parseRegName(comptime arch: Target.Cpu.Arch, name: []const u8) ?Reg(arch) { - return std.meta.stringToEnum(Reg(arch), name); -} diff --git a/src-self-hosted/codegen/x86_64.zig b/src-self-hosted/codegen/x86_64.zig index ddcbd5320e..df8895275c 100644 --- a/src-self-hosted/codegen/x86_64.zig +++ b/src-self-hosted/codegen/x86_64.zig @@ -67,4 +67,7 @@ pub const Register = enum(u8) { } }; -// zig fmt: on \ No newline at end of file +// zig fmt: on + +/// These registers belong to the called function. +pub const callee_preserved = [_]Register{ rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11 }; From 896472c20e33c81a010b21a6f900e721a2cf0839 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 17 Jul 2020 15:51:15 -0700 Subject: [PATCH 2/4] stage2: implement register copying --- src-self-hosted/codegen.zig | 106 +++++++++++++++++++++-------- src-self-hosted/codegen/x86.zig | 14 ++++ src-self-hosted/codegen/x86_64.zig | 25 +++++-- test/stage2/compare_output.zig | 15 ++-- 4 files changed, 124 insertions(+), 36 deletions(-) diff --git a/src-self-hosted/codegen.zig b/src-self-hosted/codegen.zig index c259eb2595..6e1686fd3e 100644 --- a/src-self-hosted/codegen.zig +++ b/src-self-hosted/codegen.zig @@ -11,8 +11,6 @@ const ErrorMsg = Module.ErrorMsg; const Target = std.Target; const Allocator = mem.Allocator; const trace = @import("tracy.zig").trace; -const x86_64 = @import("codegen/x86_64.zig"); -const x86 = @import("codegen/x86.zig"); /// The codegen-related data that is stored in `ir.Inst.Block` instructions. pub const BlockData = struct { @@ -232,7 +230,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { /// The constant was emitted into the code, at this offset. embedded_in_code: usize, /// The value is in a target-specific register. - register: Reg, + register: Register, /// The value is in memory at a hard-coded address. memory: u64, /// The value is one of the stack variables. @@ -280,9 +278,8 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { const Branch = struct { inst_table: std.AutoHashMapUnmanaged(*ir.Inst, MCValue) = .{}, - - /// The key is an enum value of an arch-specific register. - registers: std.AutoHashMapUnmanaged(usize, RegisterAllocation) = .{}, + registers: std.AutoHashMapUnmanaged(Register, RegisterAllocation) = .{}, + free_registers: FreeRegInt = std.math.maxInt(FreeRegInt), /// Maps offset to what is stored there. stack: std.AutoHashMapUnmanaged(usize, StackAllocation) = .{}, @@ -292,6 +289,20 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { /// to place a new stack allocation, it goes here, and then bumps `max_end_stack`. 
next_stack_offset: u32 = 0, + fn markRegUsed(self: *Branch, reg: Register) void { + const index = reg.allocIndex() orelse return; + const ShiftInt = std.math.Log2Int(FreeRegInt); + const shift = @intCast(ShiftInt, index); + self.free_registers &= ~(@as(FreeRegInt, 1) << shift); + } + + fn markRegFree(self: *Branch, reg: Register) void { + const index = reg.allocIndex() orelse return; + const ShiftInt = std.math.Log2Int(FreeRegInt); + const shift = @intCast(ShiftInt, index); + self.free_registers |= @as(FreeRegInt, 1) << shift; + } + fn deinit(self: *Branch, gpa: *Allocator) void { self.inst_table.deinit(gpa); self.registers.deinit(gpa); @@ -516,7 +527,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Both operands cannot be memory. src_inst = op_rhs; if (lhs.isMemory() and rhs.isMemory()) { - dst_mcv = try self.moveToNewRegister(op_lhs); + dst_mcv = try self.copyToNewRegister(op_lhs); src_mcv = rhs; } else { dst_mcv = lhs; @@ -527,7 +538,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Both operands cannot be memory. src_inst = op_lhs; if (lhs.isMemory() and rhs.isMemory()) { - dst_mcv = try self.moveToNewRegister(op_rhs); + dst_mcv = try self.copyToNewRegister(op_rhs); src_mcv = lhs; } else { dst_mcv = rhs; @@ -535,11 +546,11 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } else { if (lhs.isMemory()) { - dst_mcv = try self.moveToNewRegister(op_lhs); + dst_mcv = try self.copyToNewRegister(op_lhs); src_mcv = rhs; src_inst = op_rhs; } else { - dst_mcv = try self.moveToNewRegister(op_rhs); + dst_mcv = try self.copyToNewRegister(op_rhs); src_mcv = lhs; src_inst = op_lhs; } @@ -552,7 +563,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { switch (src_mcv) { .immediate => |imm| { if (imm > std.math.maxInt(u31)) { - src_mcv = try self.moveToNewRegister(src_inst); + src_mcv = try self.copyToNewRegister(src_inst); } }, else => {}, @@ -614,9 +625,26 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } fn genArg(self: *Self, inst: *ir.Inst.Arg) !MCValue { - const i = self.arg_index; + if (FreeRegInt == u0) { + return self.fail(inst.base.src, "TODO implement Register enum for {}", .{self.target.cpu.arch}); + } + if (inst.base.isUnused()) + return MCValue.dead; + + const branch = &self.branch_stack.items[self.branch_stack.items.len - 1]; + try branch.registers.ensureCapacity(self.gpa, branch.registers.items().len + 1); + + const result = self.args[self.arg_index]; self.arg_index += 1; - return self.args[i]; + + switch (result) { + .register => |reg| { + branch.registers.putAssumeCapacityNoClobber(reg, .{ .inst = &inst.base }); + branch.markRegUsed(reg); + }, + else => {}, + } + return result; } fn genBreakpoint(self: *Self, src: usize) !MCValue { @@ -737,7 +765,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { // Either one, but not both, can be a memory operand. // Source operand can be an immediate, 8 bits or 32 bits. const dst_mcv = if (lhs.isImmediate() or (lhs.isMemory() and rhs.isMemory())) - try self.moveToNewRegister(inst.args.lhs) + try self.copyToNewRegister(inst.args.lhs) else lhs; // This instruction supports only signed 32-bit immediates at most. 
@@ -949,7 +977,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - fn genSetReg(self: *Self, src: usize, reg: Reg, mcv: MCValue) error{ CodegenFail, OutOfMemory }!void { + fn genSetReg(self: *Self, src: usize, reg: Register, mcv: MCValue) error{ CodegenFail, OutOfMemory }!void { switch (arch) { .x86_64 => switch (mcv) { .dead => unreachable, @@ -1171,9 +1199,22 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - fn moveToNewRegister(self: *Self, inst: *ir.Inst) !MCValue { + /// Does not "move" the instruction. + fn copyToNewRegister(self: *Self, inst: *ir.Inst) !MCValue { const branch = &self.branch_stack.items[self.branch_stack.items.len - 1]; - return self.fail(inst.src, "TODO implement moveToNewRegister", .{}); + try branch.registers.ensureCapacity(self.gpa, branch.registers.items().len + 1); + try branch.inst_table.ensureCapacity(self.gpa, branch.inst_table.items().len + 1); + + const free_index = @ctz(FreeRegInt, branch.free_registers); + if (free_index >= callee_preserved_regs.len) + return self.fail(inst.src, "TODO implement spilling register to stack", .{}); + branch.free_registers &= ~(@as(FreeRegInt, 1) << free_index); + const reg = callee_preserved_regs[free_index]; + branch.registers.putAssumeCapacityNoClobber(reg, .{ .inst = inst }); + const old_mcv = branch.inst_table.get(inst).?; + const new_mcv: MCValue = .{ .register = reg }; + try self.genSetReg(inst.src, reg, old_mcv); + return new_mcv; } /// If the MCValue is an immediate, and it does not fit within this type, @@ -1194,7 +1235,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { }, }); if (imm >= std.math.maxInt(U)) { - return self.moveToNewRegister(inst); + return self.copyToNewRegister(inst); } }, else => {}, @@ -1249,15 +1290,14 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { var next_int_reg: usize = 0; var next_stack_offset: u32 = 0; - const integer_registers = [_]Reg{ .rdi, .rsi, .rdx, .rcx, .r8, .r9 }; for (param_types) |ty, i| { switch (ty.zigTypeTag()) { .Bool, .Int => { - if (next_int_reg >= integer_registers.len) { + if (next_int_reg >= c_abi_int_param_regs.len) { results[i] = .{ .stack_offset = next_stack_offset }; next_stack_offset += @intCast(u32, ty.abiSize(self.target.*)); } else { - results[i] = .{ .register = integer_registers[next_int_reg] }; + results[i] = .{ .register = c_abi_int_param_regs[next_int_reg] }; next_int_reg += 1; } }, @@ -1280,14 +1320,26 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return error.CodegenFail; } - const Reg = switch (arch) { - .i386 => x86.Register, - .x86_64 => x86_64.Register, - else => enum { dummy }, + usingnamespace switch (arch) { + .i386 => @import("codegen/x86.zig"), + .x86_64 => @import("codegen/x86_64.zig"), + else => struct { + pub const Register = enum { + dummy, + + pub fn allocIndex(self: Register) ?u4 { + return null; + } + }; + pub const callee_preserved_regs = [_]Register{}; + }, }; - fn parseRegName(name: []const u8) ?Reg { - return std.meta.stringToEnum(Reg, name); + /// An integer whose bits represent all the registers and whether they are free. 
+ const FreeRegInt = @Type(.{ .Int = .{ .is_signed = false, .bits = callee_preserved_regs.len } }); + + fn parseRegName(name: []const u8) ?Register { + return std.meta.stringToEnum(Register, name); } }; } diff --git a/src-self-hosted/codegen/x86.zig b/src-self-hosted/codegen/x86.zig index da0f4e722a..e0d0848bf5 100644 --- a/src-self-hosted/codegen/x86.zig +++ b/src-self-hosted/codegen/x86.zig @@ -25,6 +25,20 @@ pub const Register = enum(u8) { pub fn id(self: @This()) u3 { return @truncate(u3, @enumToInt(self)); } + + /// Returns the index into `callee_preserved_regs`. + pub fn allocIndex(self: Register) ?u4 { + return switch (self) { + .eax, .ax, .al => 0, + .ecx, .cx, .cl => 1, + .edx, .dx, .dl => 2, + .esi, .si => 3, + .edi, .di => 4, + else => null, + }; + } }; // zig fmt: on + +pub const callee_preserved_regs = [_]Register{ .eax, .ecx, .edx, .esi, .edi }; diff --git a/src-self-hosted/codegen/x86_64.zig b/src-self-hosted/codegen/x86_64.zig index df8895275c..2c0937d28d 100644 --- a/src-self-hosted/codegen/x86_64.zig +++ b/src-self-hosted/codegen/x86_64.zig @@ -38,7 +38,7 @@ pub const Register = enum(u8) { r8b, r9b, r10b, r11b, r12b, r13b, r14b, r15b, /// Returns the bit-width of the register. - pub fn size(self: @This()) u7 { + pub fn size(self: Register) u7 { return switch (@enumToInt(self)) { 0...15 => 64, 16...31 => 32, @@ -53,7 +53,7 @@ pub const Register = enum(u8) { /// other variant of access to those registers, such as r8b, r15d, and so /// on. This is needed because access to these registers requires special /// handling via the REX prefix, via the B or R bits, depending on context. - pub fn isExtended(self: @This()) bool { + pub fn isExtended(self: Register) bool { return @enumToInt(self) & 0x08 != 0; } @@ -62,12 +62,29 @@ pub const Register = enum(u8) { /// an instruction (@see isExtended), and requires special handling. The /// lower three bits are often embedded directly in instructions (such as /// the B8 variant of moves), or used in R/M bytes. - pub fn id(self: @This()) u4 { + pub fn id(self: Register) u4 { return @truncate(u4, @enumToInt(self)); } + + /// Returns the index into `callee_preserved_regs`. + pub fn allocIndex(self: Register) ?u4 { + return switch (self) { + .rax, .eax, .ax, .al => 0, + .rcx, .ecx, .cx, .cl => 1, + .rdx, .edx, .dx, .dl => 2, + .rsi, .esi, .si => 3, + .rdi, .edi, .di => 4, + .r8, .r8d, .r8w, .r8b => 5, + .r9, .r9d, .r9w, .r9b => 6, + .r10, .r10d, .r10w, .r10b => 7, + .r11, .r11d, .r11w, .r11b => 8, + else => null, + }; + } }; // zig fmt: on /// These registers belong to the called function. -pub const callee_preserved = [_]Register{ rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11 }; +pub const callee_preserved_regs = [_]Register{ .rax, .rcx, .rdx, .rsi, .rdi, .r8, .r9, .r10, .r11 }; +pub const c_abi_int_param_regs = [_]Register{ .rdi, .rsi, .rdx, .rcx, .r8, .r9 }; diff --git a/test/stage2/compare_output.zig b/test/stage2/compare_output.zig index c4f85bfba4..4c8d23f3c6 100644 --- a/test/stage2/compare_output.zig +++ b/test/stage2/compare_output.zig @@ -169,9 +169,8 @@ pub fn addCases(ctx: *TestContext) !void { , "", ); - } - { - var case = ctx.exe("assert function", linux_x64); + + // Tests the assert() function. case.addCompareOutput( \\export fn _start() noreturn { \\ add(3, 4); @@ -199,15 +198,21 @@ pub fn addCases(ctx: *TestContext) !void { , "", ); + + // Tests copying a register. For the `c = a + b`, it has to + // preserve both a and b, because they are both used later. 
case.addCompareOutput( \\export fn _start() noreturn { - \\ add(100, 200); + \\ add(3, 4); \\ \\ exit(); \\} \\ \\fn add(a: u32, b: u32) void { - \\ assert(a + b == 300); + \\ const c = a + b; // 7 + \\ const d = a + c; // 10 + \\ const e = d + b; // 14 + \\ assert(e == 14); \\} \\ \\pub fn assert(ok: bool) void { From a8065a05a5bc3df4036f1d7abe0928901cf7f5df Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 17 Jul 2020 17:03:24 -0700 Subject: [PATCH 3/4] stage2: fix implementation of liveness operandDies() --- src-self-hosted/codegen.zig | 2 ++ src-self-hosted/ir.zig | 2 +- test/stage2/compare_output.zig | 36 ++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src-self-hosted/codegen.zig b/src-self-hosted/codegen.zig index 6e1686fd3e..2cc471a07d 100644 --- a/src-self-hosted/codegen.zig +++ b/src-self-hosted/codegen.zig @@ -407,6 +407,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { for (body.instructions) |inst| { const new_inst = try self.genFuncInst(inst); try inst_table.putNoClobber(self.gpa, inst, new_inst); + // TODO process operand deaths } } @@ -1194,6 +1195,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { while (true) { i -= 1; if (self.branch_stack.items[i].inst_table.get(inst)) |mcv| { + assert(mcv != .dead); return mcv; } } diff --git a/src-self-hosted/ir.zig b/src-self-hosted/ir.zig index a150957de0..9902bd70aa 100644 --- a/src-self-hosted/ir.zig +++ b/src-self-hosted/ir.zig @@ -38,7 +38,7 @@ pub const Inst = struct { pub fn operandDies(self: Inst, index: DeathsBitIndex) bool { assert(index < deaths_bits); - return @truncate(u1, self.deaths << index) != 0; + return @truncate(u1, self.deaths >> index) != 0; } pub fn specialOperandDeaths(self: Inst) bool { diff --git a/test/stage2/compare_output.zig b/test/stage2/compare_output.zig index 4c8d23f3c6..6a6772f935 100644 --- a/test/stage2/compare_output.zig +++ b/test/stage2/compare_output.zig @@ -231,5 +231,41 @@ pub fn addCases(ctx: *TestContext) !void { , "", ); + + // More stress on the liveness detection. 
+ case.addCompareOutput( + \\export fn _start() noreturn { + \\ add(3, 4); + \\ + \\ exit(); + \\} + \\ + \\fn add(a: u32, b: u32) void { + \\ const c = a + b; // 7 + \\ const d = a + c; // 10 + \\ const e = d + b; // 14 + \\ const f = d + e; // 24 + \\ const g = e + f; // 38 + \\ const h = f + g; // 62 + \\ const i = g + h; // 100 + \\ assert(i == 100); + \\} + \\ + \\pub fn assert(ok: bool) void { + \\ if (!ok) unreachable; // assertion failure + \\} + \\ + \\fn exit() noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (0) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); } } From ef91b11295a549a8173c488d9fd5b3f69b419829 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Mon, 20 Jul 2020 13:11:07 -0700 Subject: [PATCH 4/4] stage2: register allocator processes operand deaths also rework the IR data structures --- src-self-hosted/Module.zig | 223 ++++++++++++++------ src-self-hosted/astgen.zig | 8 +- src-self-hosted/codegen.zig | 150 ++++++++------ src-self-hosted/codegen/c.zig | 21 +- src-self-hosted/ir.zig | 361 +++++++++++++++++++++------------ src-self-hosted/liveness.zig | 90 +++----- src-self-hosted/zir.zig | 345 ++++++++++++++----------------- test/stage2/compare_output.zig | 37 ++++ 8 files changed, 708 insertions(+), 527 deletions(-) diff --git a/src-self-hosted/Module.zig b/src-self-hosted/Module.zig index 72e5f6cd63..25136b5289 100644 --- a/src-self-hosted/Module.zig +++ b/src-self-hosted/Module.zig @@ -1349,8 +1349,8 @@ fn astGenAndAnalyzeDecl(self: *Module, decl: *Decl) !bool { fn analyzeBodyValueAsType(self: *Module, block_scope: *Scope.Block, body: zir.Module.Body) !Type { try self.analyzeBody(&block_scope.base, body); for (block_scope.instructions.items) |inst| { - if (inst.cast(Inst.Ret)) |ret| { - const val = try self.resolveConstValue(&block_scope.base, ret.args.operand); + if (inst.castTag(.ret)) |ret| { + const val = try self.resolveConstValue(&block_scope.base, ret.operand); return val.toType(); } else { return self.fail(&block_scope.base, inst.src, "unable to resolve comptime value", .{}); @@ -1938,16 +1938,132 @@ fn analyzeExport(self: *Module, scope: *Scope, src: usize, symbol_name: []const }; } -fn addNewInstArgs( +fn addNoOp( self: *Module, block: *Scope.Block, src: usize, ty: Type, - comptime T: type, - args: Inst.Args(T), + comptime tag: Inst.Tag, ) !*Inst { - const inst = try self.addNewInst(block, src, ty, T); - inst.args = args; + const inst = try block.arena.create(tag.Type()); + inst.* = .{ + .base = .{ + .tag = tag, + .ty = ty, + .src = src, + }, + }; + try block.instructions.append(self.gpa, &inst.base); + return &inst.base; +} + +fn addUnOp( + self: *Module, + block: *Scope.Block, + src: usize, + ty: Type, + tag: Inst.Tag, + operand: *Inst, +) !*Inst { + const inst = try block.arena.create(Inst.UnOp); + inst.* = .{ + .base = .{ + .tag = tag, + .ty = ty, + .src = src, + }, + .operand = operand, + }; + try block.instructions.append(self.gpa, &inst.base); + return &inst.base; +} + +fn addBinOp( + self: *Module, + block: *Scope.Block, + src: usize, + ty: Type, + tag: Inst.Tag, + lhs: *Inst, + rhs: *Inst, +) !*Inst { + const inst = try block.arena.create(Inst.BinOp); + inst.* = .{ + .base = .{ + .tag = tag, + .ty = ty, + .src = src, + }, + .lhs = lhs, + .rhs = rhs, + }; + try block.instructions.append(self.gpa, &inst.base); + return &inst.base; +} + +fn addBr( + self: *Module, + scope_block: *Scope.Block, + src: usize, + target_block: *Inst.Block, + operand: *Inst, +) !*Inst { + const 
inst = try scope_block.arena.create(Inst.Br); + inst.* = .{ + .base = .{ + .tag = .br, + .ty = Type.initTag(.noreturn), + .src = src, + }, + .operand = operand, + .block = target_block, + }; + try scope_block.instructions.append(self.gpa, &inst.base); + return &inst.base; +} + +fn addCondBr( + self: *Module, + block: *Scope.Block, + src: usize, + condition: *Inst, + then_body: ir.Body, + else_body: ir.Body, +) !*Inst { + const inst = try block.arena.create(Inst.CondBr); + inst.* = .{ + .base = .{ + .tag = .condbr, + .ty = Type.initTag(.noreturn), + .src = src, + }, + .condition = condition, + .then_body = then_body, + .else_body = else_body, + }; + try block.instructions.append(self.gpa, &inst.base); + return &inst.base; +} + +fn addCall( + self: *Module, + block: *Scope.Block, + src: usize, + ty: Type, + func: *Inst, + args: []const *Inst, +) !*Inst { + const inst = try block.arena.create(Inst.Call); + inst.* = .{ + .base = .{ + .tag = .call, + .ty = ty, + .src = src, + }, + .func = func, + .args = args, + }; + try block.instructions.append(self.gpa, &inst.base); return &inst.base; } @@ -2017,7 +2133,6 @@ fn addNewInst(self: *Module, block: *Scope.Block, src: usize, ty: Type, comptime .ty = ty, .src = src, }, - .args = undefined, }; try block.instructions.append(self.gpa, &inst.base); return inst; @@ -2269,7 +2384,7 @@ fn analyzeInstArg(self: *Module, scope: *Scope, inst: *zir.Inst.Arg) InnerError! }); } const param_type = fn_ty.fnParamType(param_index); - return self.addNewInstArgs(b, inst.base.src, param_type, Inst.Arg, {}); + return self.addNoOp(b, inst.base.src, param_type, .arg); } fn analyzeInstBlock(self: *Module, scope: *Scope, inst: *zir.Inst.Block) InnerError!*Inst { @@ -2285,7 +2400,7 @@ fn analyzeInstBlock(self: *Module, scope: *Scope, inst: *zir.Inst.Block) InnerEr .ty = undefined, // Set after analysis. .src = inst.base.src, }, - .args = undefined, + .body = undefined, }; var child_block: Scope.Block = .{ @@ -2316,13 +2431,13 @@ fn analyzeInstBlock(self: *Module, scope: *Scope, inst: *zir.Inst.Block) InnerEr // to emit a jump instruction to after the block when it encounters the break. 
try parent_block.instructions.append(self.gpa, &block_inst.base); block_inst.base.ty = try self.resolvePeerTypes(scope, label.results.items); - block_inst.args.body = .{ .instructions = try parent_block.arena.dupe(*Inst, child_block.instructions.items) }; + block_inst.body = .{ .instructions = try parent_block.arena.dupe(*Inst, child_block.instructions.items) }; return &block_inst.base; } fn analyzeInstBreakpoint(self: *Module, scope: *Scope, inst: *zir.Inst.Breakpoint) InnerError!*Inst { const b = try self.requireRuntimeBlock(scope, inst.base.src); - return self.addNewInstArgs(b, inst.base.src, Type.initTag(.void), Inst.Breakpoint, {}); + return self.addNoOp(b, inst.base.src, Type.initTag(.void), .breakpoint); } fn analyzeInstBreak(self: *Module, scope: *Scope, inst: *zir.Inst.Break) InnerError!*Inst { @@ -2350,10 +2465,7 @@ fn analyzeBreak( if (label.zir_block == zir_block) { try label.results.append(self.gpa, operand); const b = try self.requireRuntimeBlock(scope, src); - return self.addNewInstArgs(b, src, Type.initTag(.noreturn), Inst.Br, .{ - .block = label.block_inst, - .operand = operand, - }); + return self.addBr(b, src, label.block_inst, operand); } } opt_block = block.parent; @@ -2484,10 +2596,7 @@ fn analyzeInstCall(self: *Module, scope: *Scope, inst: *zir.Inst.Call) InnerErro } const b = try self.requireRuntimeBlock(scope, inst.base.src); - return self.addNewInstArgs(b, inst.base.src, Type.initTag(.void), Inst.Call, .{ - .func = func, - .args = casted_args, - }); + return self.addCall(b, inst.base.src, Type.initTag(.void), func, casted_args); } fn analyzeInstFn(self: *Module, scope: *Scope, fn_inst: *zir.Inst.Fn) InnerError!*Inst { @@ -2570,14 +2679,14 @@ fn analyzeInstAs(self: *Module, scope: *Scope, as: *zir.Inst.As) InnerError!*Ins } fn analyzeInstPtrToInt(self: *Module, scope: *Scope, ptrtoint: *zir.Inst.PtrToInt) InnerError!*Inst { - const ptr = try self.resolveInst(scope, ptrtoint.positionals.ptr); + const ptr = try self.resolveInst(scope, ptrtoint.positionals.operand); if (ptr.ty.zigTypeTag() != .Pointer) { - return self.fail(scope, ptrtoint.positionals.ptr.src, "expected pointer, found '{}'", .{ptr.ty}); + return self.fail(scope, ptrtoint.positionals.operand.src, "expected pointer, found '{}'", .{ptr.ty}); } // TODO handle known-pointer-address const b = try self.requireRuntimeBlock(scope, ptrtoint.base.src); const ty = Type.initTag(.usize); - return self.addNewInstArgs(b, ptrtoint.base.src, ty, Inst.PtrToInt, .{ .ptr = ptr }); + return self.addUnOp(b, ptrtoint.base.src, ty, .ptrtoint, ptr); } fn analyzeInstFieldPtr(self: *Module, scope: *Scope, fieldptr: *zir.Inst.FieldPtr) InnerError!*Inst { @@ -2734,10 +2843,7 @@ fn analyzeInstAdd(self: *Module, scope: *Scope, inst: *zir.Inst.Add) InnerError! 
} const b = try self.requireRuntimeBlock(scope, inst.base.src); - return self.addNewInstArgs(b, inst.base.src, lhs.ty, Inst.Add, .{ - .lhs = lhs, - .rhs = rhs, - }); + return self.addBinOp(b, inst.base.src, lhs.ty, .add, lhs, rhs); } return self.fail(scope, inst.base.src, "TODO analyze add for {} + {}", .{ lhs.ty.zigTypeTag(), rhs.ty.zigTypeTag() }); } @@ -2783,14 +2889,22 @@ fn analyzeInstAsm(self: *Module, scope: *Scope, assembly: *zir.Inst.Asm) InnerEr } const b = try self.requireRuntimeBlock(scope, assembly.base.src); - return self.addNewInstArgs(b, assembly.base.src, return_type, Inst.Assembly, .{ + const inst = try b.arena.create(Inst.Assembly); + inst.* = .{ + .base = .{ + .tag = .assembly, + .ty = return_type, + .src = assembly.base.src, + }, .asm_source = asm_source, .is_volatile = assembly.kw_args.@"volatile", .output = output, .inputs = inputs, .clobbers = clobbers, .args = args, - }); + }; + try b.instructions.append(self.gpa, &inst.base); + return &inst.base; } fn analyzeInstCmp(self: *Module, scope: *Scope, inst: *zir.Inst.Cmp) InnerError!*Inst { @@ -2818,15 +2932,12 @@ fn analyzeInstCmp(self: *Module, scope: *Scope, inst: *zir.Inst.Cmp) InnerError! return self.constBool(scope, inst.base.src, if (op == .eq) is_null else !is_null); } const b = try self.requireRuntimeBlock(scope, inst.base.src); - switch (op) { - .eq => return self.addNewInstArgs(b, inst.base.src, Type.initTag(.bool), Inst.IsNull, .{ - .operand = opt_operand, - }), - .neq => return self.addNewInstArgs(b, inst.base.src, Type.initTag(.bool), Inst.IsNonNull, .{ - .operand = opt_operand, - }), + const inst_tag: Inst.Tag = switch (op) { + .eq => .isnull, + .neq => .isnonnull, else => unreachable, - } + }; + return self.addUnOp(b, inst.base.src, Type.initTag(.bool), inst_tag, opt_operand); } else if (is_equality_cmp and ((lhs_ty_tag == .Null and rhs.ty.isCPtr()) or (rhs_ty_tag == .Null and lhs.ty.isCPtr()))) { @@ -2861,7 +2972,7 @@ fn analyzeInstBoolNot(self: *Module, scope: *Scope, inst: *zir.Inst.BoolNot) Inn return self.constBool(scope, inst.base.src, !val.toBool()); } const b = try self.requireRuntimeBlock(scope, inst.base.src); - return self.addNewInstArgs(b, inst.base.src, bool_type, Inst.Not, .{ .operand = operand }); + return self.addUnOp(b, inst.base.src, bool_type, .not, operand); } fn analyzeInstIsNull(self: *Module, scope: *Scope, inst: *zir.Inst.IsNull) InnerError!*Inst { @@ -2879,7 +2990,7 @@ fn analyzeInstCondBr(self: *Module, scope: *Scope, inst: *zir.Inst.CondBr) Inner const cond = try self.coerce(scope, Type.initTag(.bool), uncasted_cond); if (try self.resolveDefinedValue(scope, cond)) |cond_val| { - const body = if (cond_val.toBool()) &inst.positionals.true_body else &inst.positionals.false_body; + const body = if (cond_val.toBool()) &inst.positionals.then_body else &inst.positionals.else_body; try self.analyzeBody(scope, body.*); return self.constVoid(scope, inst.base.src); } @@ -2894,7 +3005,7 @@ fn analyzeInstCondBr(self: *Module, scope: *Scope, inst: *zir.Inst.CondBr) Inner .arena = parent_block.arena, }; defer true_block.instructions.deinit(self.gpa); - try self.analyzeBody(&true_block.base, inst.positionals.true_body); + try self.analyzeBody(&true_block.base, inst.positionals.then_body); var false_block: Scope.Block = .{ .parent = parent_block, @@ -2904,13 +3015,11 @@ fn analyzeInstCondBr(self: *Module, scope: *Scope, inst: *zir.Inst.CondBr) Inner .arena = parent_block.arena, }; defer false_block.instructions.deinit(self.gpa); - try self.analyzeBody(&false_block.base, 
inst.positionals.false_body); + try self.analyzeBody(&false_block.base, inst.positionals.else_body); - return self.addNewInstArgs(parent_block, inst.base.src, Type.initTag(.noreturn), Inst.CondBr, Inst.Args(Inst.CondBr){ - .condition = cond, - .true_body = .{ .instructions = try scope.arena().dupe(*Inst, true_block.instructions.items) }, - .false_body = .{ .instructions = try scope.arena().dupe(*Inst, false_block.instructions.items) }, - }); + const then_body: ir.Body = .{ .instructions = try scope.arena().dupe(*Inst, true_block.instructions.items) }; + const else_body: ir.Body = .{ .instructions = try scope.arena().dupe(*Inst, false_block.instructions.items) }; + return self.addCondBr(parent_block, inst.base.src, cond, then_body, else_body); } fn wantSafety(self: *Module, scope: *Scope) bool { @@ -2926,20 +3035,20 @@ fn analyzeInstUnreachable(self: *Module, scope: *Scope, unreach: *zir.Inst.Unrea const b = try self.requireRuntimeBlock(scope, unreach.base.src); if (self.wantSafety(scope)) { // TODO Once we have a panic function to call, call it here instead of this. - _ = try self.addNewInstArgs(b, unreach.base.src, Type.initTag(.void), Inst.Breakpoint, {}); + _ = try self.addNoOp(b, unreach.base.src, Type.initTag(.void), .breakpoint); } - return self.addNewInstArgs(b, unreach.base.src, Type.initTag(.noreturn), Inst.Unreach, {}); + return self.addNoOp(b, unreach.base.src, Type.initTag(.noreturn), .unreach); } fn analyzeInstRet(self: *Module, scope: *Scope, inst: *zir.Inst.Return) InnerError!*Inst { const operand = try self.resolveInst(scope, inst.positionals.operand); const b = try self.requireRuntimeBlock(scope, inst.base.src); - return self.addNewInstArgs(b, inst.base.src, Type.initTag(.noreturn), Inst.Ret, .{ .operand = operand }); + return self.addUnOp(b, inst.base.src, Type.initTag(.noreturn), .ret, operand); } fn analyzeInstRetVoid(self: *Module, scope: *Scope, inst: *zir.Inst.ReturnVoid) InnerError!*Inst { const b = try self.requireRuntimeBlock(scope, inst.base.src); - return self.addNewInstArgs(b, inst.base.src, Type.initTag(.noreturn), Inst.RetVoid, {}); + return self.addNoOp(b, inst.base.src, Type.initTag(.noreturn), .retvoid); } fn analyzeBody(self: *Module, scope: *Scope, body: zir.Module.Body) !void { @@ -3027,11 +3136,7 @@ fn cmpNumeric( }; const casted_lhs = try self.coerce(scope, dest_type, lhs); const casted_rhs = try self.coerce(scope, dest_type, rhs); - return self.addNewInstArgs(b, src, dest_type, Inst.Cmp, .{ - .lhs = casted_lhs, - .rhs = casted_rhs, - .op = op, - }); + return self.addBinOp(b, src, dest_type, Inst.Tag.fromCmpOp(op), casted_lhs, casted_rhs); } // For mixed unsigned integer sizes, implicit cast both operands to the larger integer. 
// For mixed signed and unsigned integers, implicit cast both operands to a signed @@ -3131,11 +3236,7 @@ fn cmpNumeric( const casted_lhs = try self.coerce(scope, dest_type, lhs); const casted_rhs = try self.coerce(scope, dest_type, rhs); - return self.addNewInstArgs(b, src, Type.initTag(.bool), Inst.Cmp, .{ - .lhs = casted_lhs, - .rhs = casted_rhs, - .op = op, - }); + return self.addBinOp(b, src, Type.initTag(.bool), Inst.Tag.fromCmpOp(op), casted_lhs, casted_rhs); } fn makeIntType(self: *Module, scope: *Scope, signed: bool, bits: u16) !Type { @@ -3236,7 +3337,7 @@ fn bitcast(self: *Module, scope: *Scope, dest_type: Type, inst: *Inst) !*Inst { } // TODO validate the type size and other compile errors const b = try self.requireRuntimeBlock(scope, inst.src); - return self.addNewInstArgs(b, inst.src, dest_type, Inst.BitCast, .{ .operand = inst }); + return self.addUnOp(b, inst.src, dest_type, .bitcast, inst); } fn coerceArrayPtrToSlice(self: *Module, scope: *Scope, dest_type: Type, inst: *Inst) !*Inst { diff --git a/src-self-hosted/astgen.zig b/src-self-hosted/astgen.zig index be70a724c2..813d4d8dca 100644 --- a/src-self-hosted/astgen.zig +++ b/src-self-hosted/astgen.zig @@ -173,8 +173,8 @@ fn ifExpr(mod: *Module, scope: *Scope, if_node: *ast.Node.If) InnerError!*zir.In const if_src = tree.token_locs[if_node.if_token].start; const condbr = try mod.addZIRInstSpecial(&block_scope.base, if_src, zir.Inst.CondBr, .{ .condition = cond, - .true_body = undefined, // populated below - .false_body = undefined, // populated below + .then_body = undefined, // populated below + .else_body = undefined, // populated below }, .{}); const block = try mod.addZIRInstBlock(scope, if_src, .{ @@ -196,7 +196,7 @@ fn ifExpr(mod: *Module, scope: *Scope, if_node: *ast.Node.If) InnerError!*zir.In .operand = then_result, }, .{}); } - condbr.positionals.true_body = .{ + condbr.positionals.then_body = .{ .instructions = try then_scope.arena.dupe(*zir.Inst, then_scope.instructions.items), }; @@ -225,7 +225,7 @@ fn ifExpr(mod: *Module, scope: *Scope, if_node: *ast.Node.If) InnerError!*zir.In .block = block, }, .{}); } - condbr.positionals.false_body = .{ + condbr.positionals.else_body = .{ .instructions = try else_scope.arena.dupe(*zir.Inst, else_scope.instructions.items), }; diff --git a/src-self-hosted/codegen.zig b/src-self-hosted/codegen.zig index 2cc471a07d..d64c1824cf 100644 --- a/src-self-hosted/codegen.zig +++ b/src-self-hosted/codegen.zig @@ -290,6 +290,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { next_stack_offset: u32 = 0, fn markRegUsed(self: *Branch, reg: Register) void { + if (FreeRegInt == u0) return; const index = reg.allocIndex() orelse return; const ShiftInt = std.math.Log2Int(FreeRegInt); const shift = @intCast(ShiftInt, index); @@ -297,6 +298,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } fn markRegFree(self: *Branch, reg: Register) void { + if (FreeRegInt == u0) return; const index = reg.allocIndex() orelse return; const ShiftInt = std.math.Log2Int(FreeRegInt); const shift = @intCast(ShiftInt, index); @@ -407,40 +409,64 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { for (body.instructions) |inst| { const new_inst = try self.genFuncInst(inst); try inst_table.putNoClobber(self.gpa, inst, new_inst); - // TODO process operand deaths + + var i: ir.Inst.DeathsBitIndex = 0; + while (inst.getOperand(i)) |operand| : (i += 1) { + if (inst.operandDies(i)) + self.processDeath(operand); + } + } + } + + fn processDeath(self: *Self, inst: *ir.Inst) void { + const branch = 
&self.branch_stack.items[self.branch_stack.items.len - 1]; + const entry = branch.inst_table.getEntry(inst) orelse return; + const prev_value = entry.value; + entry.value = .dead; + switch (prev_value) { + .register => |reg| { + _ = branch.registers.remove(reg); + branch.markRegFree(reg); + }, + else => {}, // TODO process stack allocation death } } fn genFuncInst(self: *Self, inst: *ir.Inst) !MCValue { switch (inst.tag) { - .add => return self.genAdd(inst.cast(ir.Inst.Add).?), - .arg => return self.genArg(inst.cast(ir.Inst.Arg).?), - .assembly => return self.genAsm(inst.cast(ir.Inst.Assembly).?), - .bitcast => return self.genBitCast(inst.cast(ir.Inst.BitCast).?), - .block => return self.genBlock(inst.cast(ir.Inst.Block).?), - .br => return self.genBr(inst.cast(ir.Inst.Br).?), + .add => return self.genAdd(inst.castTag(.add).?), + .arg => return self.genArg(inst.castTag(.arg).?), + .assembly => return self.genAsm(inst.castTag(.assembly).?), + .bitcast => return self.genBitCast(inst.castTag(.bitcast).?), + .block => return self.genBlock(inst.castTag(.block).?), + .br => return self.genBr(inst.castTag(.br).?), .breakpoint => return self.genBreakpoint(inst.src), - .brvoid => return self.genBrVoid(inst.cast(ir.Inst.BrVoid).?), - .call => return self.genCall(inst.cast(ir.Inst.Call).?), - .cmp => return self.genCmp(inst.cast(ir.Inst.Cmp).?), - .condbr => return self.genCondBr(inst.cast(ir.Inst.CondBr).?), + .brvoid => return self.genBrVoid(inst.castTag(.brvoid).?), + .call => return self.genCall(inst.castTag(.call).?), + .cmp_lt => return self.genCmp(inst.castTag(.cmp_lt).?, .lt), + .cmp_lte => return self.genCmp(inst.castTag(.cmp_lte).?, .lte), + .cmp_eq => return self.genCmp(inst.castTag(.cmp_eq).?, .eq), + .cmp_gte => return self.genCmp(inst.castTag(.cmp_gte).?, .gte), + .cmp_gt => return self.genCmp(inst.castTag(.cmp_gt).?, .gt), + .cmp_neq => return self.genCmp(inst.castTag(.cmp_neq).?, .neq), + .condbr => return self.genCondBr(inst.castTag(.condbr).?), .constant => unreachable, // excluded from function bodies - .isnonnull => return self.genIsNonNull(inst.cast(ir.Inst.IsNonNull).?), - .isnull => return self.genIsNull(inst.cast(ir.Inst.IsNull).?), - .ptrtoint => return self.genPtrToInt(inst.cast(ir.Inst.PtrToInt).?), - .ret => return self.genRet(inst.cast(ir.Inst.Ret).?), - .retvoid => return self.genRetVoid(inst.cast(ir.Inst.RetVoid).?), - .sub => return self.genSub(inst.cast(ir.Inst.Sub).?), + .isnonnull => return self.genIsNonNull(inst.castTag(.isnonnull).?), + .isnull => return self.genIsNull(inst.castTag(.isnull).?), + .ptrtoint => return self.genPtrToInt(inst.castTag(.ptrtoint).?), + .ret => return self.genRet(inst.castTag(.ret).?), + .retvoid => return self.genRetVoid(inst.castTag(.retvoid).?), + .sub => return self.genSub(inst.castTag(.sub).?), .unreach => return MCValue{ .unreach = {} }, - .not => return self.genNot(inst.cast(ir.Inst.Not).?), + .not => return self.genNot(inst.castTag(.not).?), } } - fn genNot(self: *Self, inst: *ir.Inst.Not) !MCValue { + fn genNot(self: *Self, inst: *ir.Inst.UnOp) !MCValue { // No side effects, so if it's unreferenced, do nothing. 
if (inst.base.isUnused()) return MCValue.dead; - const operand = try self.resolveInst(inst.args.operand); + const operand = try self.resolveInst(inst.operand); switch (operand) { .dead => unreachable, .unreach => unreachable, @@ -473,36 +499,36 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .base = .{ .tag = .constant, .deaths = 0, - .ty = inst.args.operand.ty, - .src = inst.args.operand.src, + .ty = inst.operand.ty, + .src = inst.operand.src, }, .val = Value.initTag(.bool_true), }; - return try self.genX8664BinMath(&inst.base, inst.args.operand, &imm.base, 6, 0x30); + return try self.genX8664BinMath(&inst.base, inst.operand, &imm.base, 6, 0x30); }, else => return self.fail(inst.base.src, "TODO implement NOT for {}", .{self.target.cpu.arch}), } } - fn genAdd(self: *Self, inst: *ir.Inst.Add) !MCValue { + fn genAdd(self: *Self, inst: *ir.Inst.BinOp) !MCValue { // No side effects, so if it's unreferenced, do nothing. if (inst.base.isUnused()) return MCValue.dead; switch (arch) { .x86_64 => { - return try self.genX8664BinMath(&inst.base, inst.args.lhs, inst.args.rhs, 0, 0x00); + return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 0, 0x00); }, else => return self.fail(inst.base.src, "TODO implement add for {}", .{self.target.cpu.arch}), } } - fn genSub(self: *Self, inst: *ir.Inst.Sub) !MCValue { + fn genSub(self: *Self, inst: *ir.Inst.BinOp) !MCValue { // No side effects, so if it's unreferenced, do nothing. if (inst.base.isUnused()) return MCValue.dead; switch (arch) { .x86_64 => { - return try self.genX8664BinMath(&inst.base, inst.args.lhs, inst.args.rhs, 5, 0x28); + return try self.genX8664BinMath(&inst.base, inst.lhs, inst.rhs, 5, 0x28); }, else => return self.fail(inst.base.src, "TODO implement sub for {}", .{self.target.cpu.arch}), } @@ -625,7 +651,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - fn genArg(self: *Self, inst: *ir.Inst.Arg) !MCValue { + fn genArg(self: *Self, inst: *ir.Inst.NoOp) !MCValue { if (FreeRegInt == u0) { return self.fail(inst.base.src, "TODO implement Register enum for {}", .{self.target.cpu.arch}); } @@ -659,7 +685,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } fn genCall(self: *Self, inst: *ir.Inst.Call) !MCValue { - const fn_ty = inst.args.func.ty; + const fn_ty = inst.func.ty; const cc = fn_ty.fnCallingConvention(); const param_types = try self.gpa.alloc(Type, fn_ty.fnParamLen()); defer self.gpa.free(param_types); @@ -671,8 +697,8 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { switch (arch) { .x86_64 => { for (mc_args) |mc_arg, arg_i| { - const arg = inst.args.args[arg_i]; - const arg_mcv = try self.resolveInst(inst.args.args[arg_i]); + const arg = inst.args[arg_i]; + const arg_mcv = try self.resolveInst(inst.args[arg_i]); switch (mc_arg) { .none => continue, .register => |reg| { @@ -694,7 +720,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - if (inst.args.func.cast(ir.Inst.Constant)) |func_inst| { + if (inst.func.cast(ir.Inst.Constant)) |func_inst| { if (func_inst.val.cast(Value.Payload.Function)) |func_val| { const func = func_val.func; const got = &self.bin_file.program_headers.items[self.bin_file.phdr_got_index.?]; @@ -742,16 +768,16 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { return .unreach; } - fn genRet(self: *Self, inst: *ir.Inst.Ret) !MCValue { - const operand = try self.resolveInst(inst.args.operand); + fn genRet(self: *Self, inst: *ir.Inst.UnOp) !MCValue { + const operand = try self.resolveInst(inst.operand); return self.ret(inst.base.src, operand); } - 
fn genRetVoid(self: *Self, inst: *ir.Inst.RetVoid) !MCValue { + fn genRetVoid(self: *Self, inst: *ir.Inst.NoOp) !MCValue { return self.ret(inst.base.src, .none); } - fn genCmp(self: *Self, inst: *ir.Inst.Cmp) !MCValue { + fn genCmp(self: *Self, inst: *ir.Inst.BinOp, op: std.math.CompareOperator) !MCValue { // No side effects, so if it's unreferenced, do nothing. if (inst.base.isUnused()) return MCValue.dead; @@ -759,25 +785,25 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .x86_64 => { try self.code.ensureCapacity(self.code.items.len + 8); - const lhs = try self.resolveInst(inst.args.lhs); - const rhs = try self.resolveInst(inst.args.rhs); + const lhs = try self.resolveInst(inst.lhs); + const rhs = try self.resolveInst(inst.rhs); // There are 2 operands, destination and source. // Either one, but not both, can be a memory operand. // Source operand can be an immediate, 8 bits or 32 bits. const dst_mcv = if (lhs.isImmediate() or (lhs.isMemory() and rhs.isMemory())) - try self.copyToNewRegister(inst.args.lhs) + try self.copyToNewRegister(inst.lhs) else lhs; // This instruction supports only signed 32-bit immediates at most. - const src_mcv = try self.limitImmediateType(inst.args.rhs, i32); + const src_mcv = try self.limitImmediateType(inst.rhs, i32); try self.genX8664BinMathCode(inst.base.src, dst_mcv, src_mcv, 7, 0x38); - const info = inst.args.lhs.ty.intInfo(self.target.*); + const info = inst.lhs.ty.intInfo(self.target.*); if (info.signed) { - return MCValue{ .compare_flags_signed = inst.args.op }; + return MCValue{ .compare_flags_signed = op }; } else { - return MCValue{ .compare_flags_unsigned = inst.args.op }; + return MCValue{ .compare_flags_unsigned = op }; } }, else => return self.fail(inst.base.src, "TODO implement cmp for {}", .{self.target.cpu.arch}), @@ -789,7 +815,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { .x86_64 => { try self.code.ensureCapacity(self.code.items.len + 6); - const cond = try self.resolveInst(inst.args.condition); + const cond = try self.resolveInst(inst.condition); switch (cond) { .compare_flags_signed => |cmp_op| { // Here we map to the opposite opcode because the jump is to the false branch. @@ -838,19 +864,19 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { self.code.appendSliceAssumeCapacity(&[_]u8{ 0x0f, opcode }); const reloc = Reloc{ .rel32 = self.code.items.len }; self.code.items.len += 4; - try self.genBody(inst.args.true_body); + try self.genBody(inst.then_body); try self.performReloc(inst.base.src, reloc); - try self.genBody(inst.args.false_body); + try self.genBody(inst.else_body); return MCValue.unreach; } - fn genIsNull(self: *Self, inst: *ir.Inst.IsNull) !MCValue { + fn genIsNull(self: *Self, inst: *ir.Inst.UnOp) !MCValue { switch (arch) { else => return self.fail(inst.base.src, "TODO implement isnull for {}", .{self.target.cpu.arch}), } } - fn genIsNonNull(self: *Self, inst: *ir.Inst.IsNonNull) !MCValue { + fn genIsNonNull(self: *Self, inst: *ir.Inst.UnOp) !MCValue { // Here you can specialize this instruction if it makes sense to, otherwise the default // will call genIsNull and invert the result. switch (arch) { @@ -864,7 +890,7 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } // A block is nothing but a setup to be able to jump to the end. 
defer inst.codegen.relocs.deinit(self.gpa); - try self.genBody(inst.args.body); + try self.genBody(inst.body); for (inst.codegen.relocs.items) |reloc| try self.performReloc(inst.base.src, reloc); @@ -883,17 +909,17 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } fn genBr(self: *Self, inst: *ir.Inst.Br) !MCValue { - if (!inst.args.operand.ty.hasCodeGenBits()) - return self.brVoid(inst.base.src, inst.args.block); + if (!inst.operand.ty.hasCodeGenBits()) + return self.brVoid(inst.base.src, inst.block); - const operand = try self.resolveInst(inst.args.operand); + const operand = try self.resolveInst(inst.operand); switch (arch) { else => return self.fail(inst.base.src, "TODO implement br for {}", .{self.target.cpu.arch}), } } fn genBrVoid(self: *Self, inst: *ir.Inst.BrVoid) !MCValue { - return self.brVoid(inst.base.src, inst.args.block); + return self.brVoid(inst.base.src, inst.block); } fn brVoid(self: *Self, src: usize, block: *ir.Inst.Block) !MCValue { @@ -915,29 +941,29 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } fn genAsm(self: *Self, inst: *ir.Inst.Assembly) !MCValue { - if (!inst.args.is_volatile and inst.base.isUnused()) + if (!inst.is_volatile and inst.base.isUnused()) return MCValue.dead; if (arch != .x86_64 and arch != .i386) { return self.fail(inst.base.src, "TODO implement inline asm support for more architectures", .{}); } - for (inst.args.inputs) |input, i| { + for (inst.inputs) |input, i| { if (input.len < 3 or input[0] != '{' or input[input.len - 1] != '}') { return self.fail(inst.base.src, "unrecognized asm input constraint: '{}'", .{input}); } const reg_name = input[1 .. input.len - 1]; const reg = parseRegName(reg_name) orelse return self.fail(inst.base.src, "unrecognized register: '{}'", .{reg_name}); - const arg = try self.resolveInst(inst.args.args[i]); + const arg = try self.resolveInst(inst.args[i]); try self.genSetReg(inst.base.src, reg, arg); } - if (mem.eql(u8, inst.args.asm_source, "syscall")) { + if (mem.eql(u8, inst.asm_source, "syscall")) { try self.code.appendSlice(&[_]u8{ 0x0f, 0x05 }); } else { return self.fail(inst.base.src, "TODO implement support for more x86 assembly instructions", .{}); } - if (inst.args.output) |output| { + if (inst.output) |output| { if (output.len < 4 or output[0] != '=' or output[1] != '{' or output[output.len - 1] != '}') { return self.fail(inst.base.src, "unrecognized asm output constraint: '{}'", .{output}); } @@ -1169,13 +1195,13 @@ fn Function(comptime arch: std.Target.Cpu.Arch) type { } } - fn genPtrToInt(self: *Self, inst: *ir.Inst.PtrToInt) !MCValue { + fn genPtrToInt(self: *Self, inst: *ir.Inst.UnOp) !MCValue { // no-op - return self.resolveInst(inst.args.ptr); + return self.resolveInst(inst.operand); } - fn genBitCast(self: *Self, inst: *ir.Inst.BitCast) !MCValue { - const operand = try self.resolveInst(inst.args.operand); + fn genBitCast(self: *Self, inst: *ir.Inst.UnOp) !MCValue { + const operand = try self.resolveInst(inst.operand); return operand; } diff --git a/src-self-hosted/codegen/c.zig b/src-self-hosted/codegen/c.zig index ebc4ff7e1a..ed3a5f73b4 100644 --- a/src-self-hosted/codegen/c.zig +++ b/src-self-hosted/codegen/c.zig @@ -92,9 +92,9 @@ fn genFn(file: *C, decl: *Decl) !void { for (instructions) |inst| { try writer.writeAll("\n\t"); switch (inst.tag) { - .assembly => try genAsm(file, inst.cast(Inst.Assembly).?, decl), - .call => try genCall(file, inst.cast(Inst.Call).?, decl), - .ret => try genRet(file, inst.cast(Inst.Ret).?, decl, tv.ty.fnReturnType()), + .assembly => try 
genAsm(file, inst.castTag(.assembly).?, decl), + .call => try genCall(file, inst.castTag(.call).?, decl), + .ret => try genRet(file, inst.castTag(.ret).?, decl, tv.ty.fnReturnType()), .retvoid => try file.main.writer().print("return;", .{}), else => |e| return file.fail(decl.src(), "TODO implement C codegen for {}", .{e}), } @@ -105,9 +105,9 @@ fn genFn(file: *C, decl: *Decl) !void { try writer.writeAll("}\n\n"); } -fn genRet(file: *C, inst: *Inst.Ret, decl: *Decl, expected_return_type: Type) !void { +fn genRet(file: *C, inst: *Inst.UnOp, decl: *Decl, expected_return_type: Type) !void { const writer = file.main.writer(); - const ret_value = inst.args.operand; + const ret_value = inst.operand; const value = ret_value.value().?; if (expected_return_type.eql(ret_value.ty)) return file.fail(decl.src(), "TODO return {}", .{expected_return_type}) @@ -126,7 +126,7 @@ fn genRet(file: *C, inst: *Inst.Ret, decl: *Decl, expected_return_type: Type) !v fn genCall(file: *C, inst: *Inst.Call, decl: *Decl) !void { const writer = file.main.writer(); const header = file.header.writer(); - if (inst.args.func.cast(Inst.Constant)) |func_inst| { + if (inst.func.castTag(.constant)) |func_inst| { if (func_inst.val.cast(Value.Payload.Function)) |func_val| { const target = func_val.func.owner_decl; const target_ty = target.typed_value.most_recent.typed_value.ty; @@ -144,7 +144,7 @@ fn genCall(file: *C, inst: *Inst.Call, decl: *Decl) !void { } else { return file.fail(decl.src(), "TODO non-function call target?", .{}); } - if (inst.args.args.len != 0) { + if (inst.args.len != 0) { return file.fail(decl.src(), "TODO function arguments", .{}); } } else { @@ -152,14 +152,13 @@ fn genCall(file: *C, inst: *Inst.Call, decl: *Decl) !void { } } -fn genAsm(file: *C, inst: *Inst.Assembly, decl: *Decl) !void { - const as = inst.args; +fn genAsm(file: *C, as: *Inst.Assembly, decl: *Decl) !void { const writer = file.main.writer(); for (as.inputs) |i, index| { if (i[0] == '{' and i[i.len - 1] == '}') { const reg = i[1 .. i.len - 1]; const arg = as.args[index]; - if (arg.cast(Inst.Constant)) |c| { + if (arg.castTag(.constant)) |c| { if (c.val.tag() == .int_u64) { try writer.writeAll("register "); try renderType(file, writer, arg.ty, decl.src()); @@ -190,7 +189,7 @@ fn genAsm(file: *C, inst: *Inst.Assembly, decl: *Decl) !void { if (index > 0) { try writer.writeAll(", "); } - if (arg.cast(Inst.Constant)) |c| { + if (arg.castTag(.constant)) |c| { try writer.print("\"\"({}_constant)", .{reg}); } else { // This is blocked by the earlier test diff --git a/src-self-hosted/ir.zig b/src-self-hosted/ir.zig index 9902bd70aa..53a73dbf6c 100644 --- a/src-self-hosted/ir.zig +++ b/src-self-hosted/ir.zig @@ -55,7 +55,12 @@ pub const Inst = struct { breakpoint, brvoid, call, - cmp, + cmp_lt, + cmp_lte, + cmp_eq, + cmp_gte, + cmp_gt, + cmp_neq, condbr, constant, isnonnull, @@ -66,13 +71,80 @@ pub const Inst = struct { sub, unreach, not, + + /// There is one-to-one correspondence between tag and type for now, + /// but this will not always be the case. For example, binary operations + /// such as + and - will have different tags but the same type. 
+ pub fn Type(tag: Tag) type { + return switch (tag) { + .retvoid, + .unreach, + .arg, + .breakpoint, + => NoOp, + + .ret, + .bitcast, + .not, + .isnonnull, + .isnull, + .ptrtoint, + => UnOp, + + .add, + .sub, + .cmp_lt, + .cmp_lte, + .cmp_eq, + .cmp_gte, + .cmp_gt, + .cmp_neq, + => BinOp, + + .assembly => Assembly, + .block => Block, + .br => Br, + .brvoid => BrVoid, + .call => Call, + .condbr => CondBr, + .constant => Constant, + }; + } + + pub fn fromCmpOp(op: std.math.CompareOperator) Tag { + return switch (op) { + .lt => .cmp_lt, + .lte => .cmp_lte, + .eq => .cmp_eq, + .gte => .cmp_gte, + .gt => .cmp_gt, + .neq => .cmp_neq, + }; + } }; + /// Prefer `castTag` to this. pub fn cast(base: *Inst, comptime T: type) ?*T { - if (base.tag != T.base_tag) - return null; + if (@hasDecl(T, "base_tag")) { + return base.castTag(T.base_tag); + } + inline for (@typeInfo(Tag).Enum.fields) |field| { + const tag = @intToEnum(Tag, field.value); + if (base.tag == tag) { + if (T == tag.Type()) { + return @fieldParentPtr(T, "base", base); + } + return null; + } + } + unreachable; + } - return @fieldParentPtr(T, "base", base); + pub fn castTag(base: *Inst, comptime tag: Tag) ?*tag.Type() { + if (base.tag == tag) { + return @fieldParentPtr(tag.Type(), "base", base); + } + return null; } pub fn Args(comptime T: type) type { @@ -88,186 +160,219 @@ pub const Inst = struct { return inst.val; } - pub const Add = struct { - pub const base_tag = Tag.add; + pub fn cmpOperator(base: *Inst) ?std.math.CompareOperator { + return switch (base.tag) { + .cmp_lt => .lt, + .cmp_lte => .lte, + .cmp_eq => .eq, + .cmp_gte => .gte, + .cmp_gt => .gt, + .cmp_neq => .neq, + else => null, + }; + } + + pub fn operandCount(base: *Inst) usize { + inline for (@typeInfo(Tag).Enum.fields) |field| { + const tag = @intToEnum(Tag, field.value); + if (tag == base.tag) { + return @fieldParentPtr(tag.Type(), "base", base).operandCount(); + } + } + unreachable; + } + + pub fn getOperand(base: *Inst, index: usize) ?*Inst { + inline for (@typeInfo(Tag).Enum.fields) |field| { + const tag = @intToEnum(Tag, field.value); + if (tag == base.tag) { + return @fieldParentPtr(tag.Type(), "base", base).getOperand(index); + } + } + unreachable; + } + + pub const NoOp = struct { base: Inst, - args: struct { - lhs: *Inst, - rhs: *Inst, - }, + pub fn operandCount(self: *const NoOp) usize { + return 0; + } + pub fn getOperand(self: *const NoOp, index: usize) ?*Inst { + return null; + } }; - pub const Arg = struct { - pub const base_tag = Tag.arg; + pub const UnOp = struct { base: Inst, - args: void, + operand: *Inst, + + pub fn operandCount(self: *const UnOp) usize { + return 1; + } + pub fn getOperand(self: *const UnOp, index: usize) ?*Inst { + if (index == 0) + return self.operand; + return null; + } + }; + + pub const BinOp = struct { + base: Inst, + lhs: *Inst, + rhs: *Inst, + + pub fn operandCount(self: *const BinOp) usize { + return 2; + } + pub fn getOperand(self: *const BinOp, index: usize) ?*Inst { + var i = index; + + if (i < 1) + return self.lhs; + i -= 1; + + if (i < 1) + return self.rhs; + i -= 1; + + return null; + } }; pub const Assembly = struct { pub const base_tag = Tag.assembly; - base: Inst, - - args: struct { - asm_source: []const u8, - is_volatile: bool, - output: ?[]const u8, - inputs: []const []const u8, - clobbers: []const []const u8, - args: []const *Inst, - }, - }; - - pub const BitCast = struct { - pub const base_tag = Tag.bitcast; base: Inst, - args: struct { - operand: *Inst, - }, + asm_source: []const u8, + is_volatile: bool, +
output: ?[]const u8, + inputs: []const []const u8, + clobbers: []const []const u8, + args: []const *Inst, + + pub fn operandCount(self: *const Assembly) usize { + return self.args.len; + } + pub fn getOperand(self: *const Assembly, index: usize) ?*Inst { + if (index < self.args.len) + return self.args[index]; + return null; + } }; pub const Block = struct { pub const base_tag = Tag.block; + base: Inst, - args: struct { - body: Body, - }, + body: Body, /// This memory is reserved for codegen code to do whatever it needs to here. codegen: codegen.BlockData = .{}, + + pub fn operandCount(self: *const Block) usize { + return 0; + } + pub fn getOperand(self: *const Block, index: usize) ?*Inst { + return null; + } }; pub const Br = struct { pub const base_tag = Tag.br; - base: Inst, - args: struct { - block: *Block, - operand: *Inst, - }, - }; - pub const Breakpoint = struct { - pub const base_tag = Tag.breakpoint; base: Inst, - args: void, + block: *Block, + operand: *Inst, + + pub fn operandCount(self: *const Br) usize { + return 0; + } + pub fn getOperand(self: *const Br, index: usize) ?*Inst { + if (index == 0) + return self.operand; + return null; + } }; pub const BrVoid = struct { pub const base_tag = Tag.brvoid; + base: Inst, - args: struct { - block: *Block, - }, + block: *Block, + + pub fn operandCount(self: *const BrVoid) usize { + return 0; + } + pub fn getOperand(self: *const BrVoid, index: usize) ?*Inst { + return null; + } }; pub const Call = struct { pub const base_tag = Tag.call; - base: Inst, - args: struct { - func: *Inst, - args: []const *Inst, - }, - }; - - pub const Cmp = struct { - pub const base_tag = Tag.cmp; base: Inst, - args: struct { - lhs: *Inst, - op: std.math.CompareOperator, - rhs: *Inst, - }, + func: *Inst, + args: []const *Inst, + + pub fn operandCount(self: *const Call) usize { + return self.args.len + 1; + } + pub fn getOperand(self: *const Call, index: usize) ?*Inst { + var i = index; + + if (i < 1) + return self.func; + i -= 1; + + if (i < self.args.len) + return self.args[i]; + i -= self.args.len; + + return null; + } }; pub const CondBr = struct { pub const base_tag = Tag.condbr; base: Inst, - args: struct { - condition: *Inst, - true_body: Body, - false_body: Body, - }, + condition: *Inst, + then_body: Body, + else_body: Body, /// Set of instructions whose lifetimes end at the start of one of the branches. /// The `true` branch is first: `deaths[0..true_death_count]`. /// The `false` branch is next: `(deaths + true_death_count)[..false_death_count]`. 
deaths: [*]*Inst = undefined, true_death_count: u32 = 0, false_death_count: u32 = 0, - }; - pub const Not = struct { - pub const base_tag = Tag.not; + pub fn operandCount(self: *const CondBr) usize { + return 1; + } + pub fn getOperand(self: *const CondBr, index: usize) ?*Inst { + var i = index; - base: Inst, - args: struct { - operand: *Inst, - }, + if (i < 1) + return self.condition; + i -= 1; + + return null; + } }; pub const Constant = struct { pub const base_tag = Tag.constant; - base: Inst, + base: Inst, val: Value, - }; - pub const IsNonNull = struct { - pub const base_tag = Tag.isnonnull; - - base: Inst, - args: struct { - operand: *Inst, - }, - }; - - pub const IsNull = struct { - pub const base_tag = Tag.isnull; - - base: Inst, - args: struct { - operand: *Inst, - }, - }; - - pub const PtrToInt = struct { - pub const base_tag = Tag.ptrtoint; - - base: Inst, - args: struct { - ptr: *Inst, - }, - }; - - pub const Ret = struct { - pub const base_tag = Tag.ret; - base: Inst, - args: struct { - operand: *Inst, - }, - }; - - pub const RetVoid = struct { - pub const base_tag = Tag.retvoid; - base: Inst, - args: void, - }; - - pub const Sub = struct { - pub const base_tag = Tag.sub; - base: Inst, - - args: struct { - lhs: *Inst, - rhs: *Inst, - }, - }; - - pub const Unreach = struct { - pub const base_tag = Tag.unreach; - base: Inst, - args: void, + pub fn operandCount(self: *const Constant) usize { + return 0; + } + pub fn getOperand(self: *const Constant, index: usize) ?*Inst { + return null; + } }; }; diff --git a/src-self-hosted/liveness.zig b/src-self-hosted/liveness.zig index a06a4dd1d1..e8f80f30d5 100644 --- a/src-self-hosted/liveness.zig +++ b/src-self-hosted/liveness.zig @@ -25,53 +25,38 @@ fn analyzeWithTable(arena: *std.mem.Allocator, table: *std.AutoHashMap(*ir.Inst, while (i != 0) { i -= 1; const base = body.instructions[i]; - try analyzeInstGeneric(arena, table, base); + try analyzeInst(arena, table, base); } } -fn analyzeInstGeneric(arena: *std.mem.Allocator, table: *std.AutoHashMap(*ir.Inst, void), base: *ir.Inst) error{OutOfMemory}!void { - // Obtain the corresponding instruction type based on the tag type. - inline for (std.meta.declarations(ir.Inst)) |decl| { - switch (decl.data) { - .Type => |T| { - if (@typeInfo(T) == .Struct and @hasDecl(T, "base_tag")) { - if (T.base_tag == base.tag) { - return analyzeInst(arena, table, T, @fieldParentPtr(T, "base", base)); - } - } - }, - else => {}, - } - } - unreachable; -} - -fn analyzeInst(arena: *std.mem.Allocator, table: *std.AutoHashMap(*ir.Inst, void), comptime T: type, inst: *T) error{OutOfMemory}!void { - if (table.contains(&inst.base)) { - inst.base.deaths = 0; +fn analyzeInst(arena: *std.mem.Allocator, table: *std.AutoHashMap(*ir.Inst, void), base: *ir.Inst) error{OutOfMemory}!void { + if (table.contains(base)) { + base.deaths = 0; } else { // No tombstone for this instruction means it is never referenced, // and its birth marks its own death. Very metal 🤘 - inst.base.deaths = 1 << ir.Inst.unreferenced_bit_index; + base.deaths = 1 << ir.Inst.unreferenced_bit_index; } - switch (T) { - ir.Inst.Constant => return, - ir.Inst.Block => { - try analyzeWithTable(arena, table, inst.args.body); + switch (base.tag) { + .constant => return, + .block => { + const inst = base.castTag(.block).?; + try analyzeWithTable(arena, table, inst.body); // We let this continue so that it can possibly mark the block as // unreferenced below. 
}, - ir.Inst.CondBr => { + .condbr => { + const inst = base.castTag(.condbr).?; var true_table = std.AutoHashMap(*ir.Inst, void).init(table.allocator); defer true_table.deinit(); - try true_table.ensureCapacity(inst.args.true_body.instructions.len); - try analyzeWithTable(arena, &true_table, inst.args.true_body); + try true_table.ensureCapacity(inst.then_body.instructions.len); + try analyzeWithTable(arena, &true_table, inst.then_body); var false_table = std.AutoHashMap(*ir.Inst, void).init(table.allocator); defer false_table.deinit(); - try false_table.ensureCapacity(inst.args.false_body.instructions.len); - try analyzeWithTable(arena, &false_table, inst.args.false_body); + try false_table.ensureCapacity(inst.else_body.instructions.len); + try analyzeWithTable(arena, &false_table, inst.else_body); // Each death that occurs inside one branch, but not the other, needs // to be added as a death immediately upon entering the other branch. @@ -112,47 +97,22 @@ fn analyzeInst(arena: *std.mem.Allocator, table: *std.AutoHashMap(*ir.Inst, void // instruction, and the deaths flag for the CondBr instruction will indicate whether the // condition's lifetime ends immediately before entering any branch. }, - ir.Inst.Call => { - // Call instructions have a runtime-known number of operands so we have to handle them ourselves here. - const needed_bits = 1 + inst.args.args.len; - if (needed_bits <= ir.Inst.deaths_bits) { - var bit_i: ir.Inst.DeathsBitIndex = 0; - { - const prev = try table.fetchPut(inst.args.func, {}); - if (prev == null) inst.base.deaths |= @as(ir.Inst.DeathsInt, 1) << bit_i; - bit_i += 1; - } - for (inst.args.args) |arg| { - const prev = try table.fetchPut(arg, {}); - if (prev == null) inst.base.deaths |= @as(ir.Inst.DeathsInt, 1) << bit_i; - bit_i += 1; - } - } else { - @panic("Handle liveness analysis for function calls with many parameters"); - } - }, else => {}, } - const Args = ir.Inst.Args(T); - if (Args == void) { - return; - } - - comptime var arg_index: usize = 0; - inline for (std.meta.fields(Args)) |field| { - if (field.field_type == *ir.Inst) { - if (arg_index >= 6) { - @compileError("out of bits to mark deaths of operands"); - } - const prev = try table.fetchPut(@field(inst.args, field.name), {}); + const needed_bits = base.operandCount(); + if (needed_bits <= ir.Inst.deaths_bits) { + var bit_i: ir.Inst.DeathsBitIndex = 0; + while (base.getOperand(bit_i)) |operand| : (bit_i += 1) { + const prev = try table.fetchPut(operand, {}); if (prev == null) { // Death. 
- inst.base.deaths |= 1 << arg_index; + base.deaths |= @as(ir.Inst.DeathsInt, 1) << bit_i; } - arg_index += 1; } + } else { + @panic("Handle liveness analysis for instructions with many parameters"); } - std.log.debug(.liveness, "analyze {}: 0b{b}\n", .{ inst.base.tag, inst.base.deaths }); + std.log.debug(.liveness, "analyze {}: 0b{b}\n", .{ base.tag, base.deaths }); } diff --git a/src-self-hosted/zir.zig b/src-self-hosted/zir.zig index 5e8c966b94..514d08d6d4 100644 --- a/src-self-hosted/zir.zig +++ b/src-self-hosted/zir.zig @@ -337,7 +337,7 @@ pub const Inst = struct { base: Inst, positionals: struct { - ptr: *Inst, + operand: *Inst, }, kw_args: struct {}, }; @@ -629,8 +629,8 @@ pub const Inst = struct { positionals: struct { condition: *Inst, - true_body: Module.Body, - false_body: Module.Body, + then_body: Module.Body, + else_body: Module.Body, }, kw_args: struct {}, }; @@ -1615,7 +1615,7 @@ const EmitZIR = struct { } } - fn emitTrivial(self: *EmitZIR, src: usize, comptime T: type) Allocator.Error!*Inst { + fn emitNoOp(self: *EmitZIR, src: usize, comptime T: type) Allocator.Error!*Inst { const new_inst = try self.arena.allocator.create(T); new_inst.* = .{ .base = .{ @@ -1628,6 +1628,72 @@ const EmitZIR = struct { return &new_inst.base; } + fn emitCmp( + self: *EmitZIR, + src: usize, + new_body: ZirBody, + old_inst: *ir.Inst.BinOp, + op: std.math.CompareOperator, + ) Allocator.Error!*Inst { + const new_inst = try self.arena.allocator.create(Inst.Cmp); + new_inst.* = .{ + .base = .{ + .src = src, + .tag = Inst.Cmp.base_tag, + }, + .positionals = .{ + .lhs = try self.resolveInst(new_body, old_inst.lhs), + .rhs = try self.resolveInst(new_body, old_inst.rhs), + .op = op, + }, + .kw_args = .{}, + }; + return &new_inst.base; + } + + fn emitUnOp( + self: *EmitZIR, + src: usize, + new_body: ZirBody, + old_inst: *ir.Inst.UnOp, + comptime I: type, + ) Allocator.Error!*Inst { + const new_inst = try self.arena.allocator.create(I); + new_inst.* = .{ + .base = .{ + .src = src, + .tag = I.base_tag, + }, + .positionals = .{ + .operand = try self.resolveInst(new_body, old_inst.operand), + }, + .kw_args = .{}, + }; + return &new_inst.base; + } + + fn emitBinOp( + self: *EmitZIR, + src: usize, + new_body: ZirBody, + old_inst: *ir.Inst.BinOp, + comptime I: type, + ) Allocator.Error!*Inst { + const new_inst = try self.arena.allocator.create(I); + new_inst.* = .{ + .base = .{ + .src = src, + .tag = I.base_tag, + }, + .positionals = .{ + .lhs = try self.resolveInst(new_body, old_inst.lhs), + .rhs = try self.resolveInst(new_body, old_inst.rhs), + }, + .kw_args = .{}, + }; + return &new_inst.base; + } + fn emitBody( self: *EmitZIR, body: ir.Body, @@ -1640,69 +1706,48 @@ const EmitZIR = struct { }; for (body.instructions) |inst| { const new_inst = switch (inst.tag) { - .not => blk: { - const old_inst = inst.cast(ir.Inst.Not).?; - assert(inst.ty.zigTypeTag() == .Bool); - const new_inst = try self.arena.allocator.create(Inst.BoolNot); + .constant => unreachable, // excluded from function bodies + + .arg => try self.emitNoOp(inst.src, Inst.Arg), + .breakpoint => try self.emitNoOp(inst.src, Inst.Breakpoint), + .unreach => try self.emitNoOp(inst.src, Inst.Unreachable), + .retvoid => try self.emitNoOp(inst.src, Inst.ReturnVoid), + + .not => try self.emitUnOp(inst.src, new_body, inst.castTag(.not).?, Inst.BoolNot), + .ret => try self.emitUnOp(inst.src, new_body, inst.castTag(.ret).?, Inst.Return), + .ptrtoint => try self.emitUnOp(inst.src, new_body, inst.castTag(.ptrtoint).?, Inst.PtrToInt), + .isnull => try 
self.emitUnOp(inst.src, new_body, inst.castTag(.isnull).?, Inst.IsNull), + .isnonnull => try self.emitUnOp(inst.src, new_body, inst.castTag(.isnonnull).?, Inst.IsNonNull), + + .add => try self.emitBinOp(inst.src, new_body, inst.castTag(.add).?, Inst.Add), + .sub => try self.emitBinOp(inst.src, new_body, inst.castTag(.sub).?, Inst.Sub), + + .cmp_lt => try self.emitCmp(inst.src, new_body, inst.castTag(.cmp_lt).?, .lt), + .cmp_lte => try self.emitCmp(inst.src, new_body, inst.castTag(.cmp_lte).?, .lte), + .cmp_eq => try self.emitCmp(inst.src, new_body, inst.castTag(.cmp_eq).?, .eq), + .cmp_gte => try self.emitCmp(inst.src, new_body, inst.castTag(.cmp_gte).?, .gte), + .cmp_gt => try self.emitCmp(inst.src, new_body, inst.castTag(.cmp_gt).?, .gt), + .cmp_neq => try self.emitCmp(inst.src, new_body, inst.castTag(.cmp_neq).?, .neq), + + .bitcast => blk: { + const old_inst = inst.castTag(.bitcast).?; + const new_inst = try self.arena.allocator.create(Inst.BitCast); new_inst.* = .{ .base = .{ .src = inst.src, - .tag = Inst.BoolNot.base_tag, + .tag = Inst.BitCast.base_tag, }, .positionals = .{ - .operand = try self.resolveInst(new_body, old_inst.args.operand), + .dest_type = (try self.emitType(inst.src, inst.ty)).inst, + .operand = try self.resolveInst(new_body, old_inst.operand), }, .kw_args = .{}, }; break :blk &new_inst.base; }, - .add => blk: { - const old_inst = inst.cast(ir.Inst.Add).?; - const new_inst = try self.arena.allocator.create(Inst.Add); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.Add.base_tag, - }, - .positionals = .{ - .lhs = try self.resolveInst(new_body, old_inst.args.lhs), - .rhs = try self.resolveInst(new_body, old_inst.args.rhs), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .sub => blk: { - const old_inst = inst.cast(ir.Inst.Sub).?; - const new_inst = try self.arena.allocator.create(Inst.Sub); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.Sub.base_tag, - }, - .positionals = .{ - .lhs = try self.resolveInst(new_body, old_inst.args.lhs), - .rhs = try self.resolveInst(new_body, old_inst.args.rhs), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .arg => blk: { - const old_inst = inst.cast(ir.Inst.Arg).?; - const new_inst = try self.arena.allocator.create(Inst.Arg); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.Arg.base_tag, - }, - .positionals = .{}, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, + .block => blk: { - const old_inst = inst.cast(ir.Inst.Block).?; + const old_inst = inst.castTag(.block).?; const new_inst = try self.arena.allocator.create(Inst.Block); try self.block_table.put(old_inst, new_inst); @@ -1710,7 +1755,7 @@ const EmitZIR = struct { var block_body = std.ArrayList(*Inst).init(self.allocator); defer block_body.deinit(); - try self.emitBody(old_inst.args.body, inst_table, &block_body); + try self.emitBody(old_inst.body, inst_table, &block_body); new_inst.* = .{ .base = .{ @@ -1725,27 +1770,10 @@ const EmitZIR = struct { break :blk &new_inst.base; }, - .br => blk: { - const old_inst = inst.cast(ir.Inst.Br).?; - const new_block = self.block_table.get(old_inst.args.block).?; - const new_inst = try self.arena.allocator.create(Inst.Break); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.Break.base_tag, - }, - .positionals = .{ - .block = new_block, - .operand = try self.resolveInst(new_body, old_inst.args.operand), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .breakpoint => try self.emitTrivial(inst.src, Inst.Breakpoint), + .brvoid => 
blk: { const old_inst = inst.cast(ir.Inst.BrVoid).?; - const new_block = self.block_table.get(old_inst.args.block).?; + const new_block = self.block_table.get(old_inst.block).?; const new_inst = try self.arena.allocator.create(Inst.BreakVoid); new_inst.* = .{ .base = .{ @@ -1759,13 +1787,32 @@ const EmitZIR = struct { }; break :blk &new_inst.base; }, + + .br => blk: { + const old_inst = inst.castTag(.br).?; + const new_block = self.block_table.get(old_inst.block).?; + const new_inst = try self.arena.allocator.create(Inst.Break); + new_inst.* = .{ + .base = .{ + .src = inst.src, + .tag = Inst.Break.base_tag, + }, + .positionals = .{ + .block = new_block, + .operand = try self.resolveInst(new_body, old_inst.operand), + }, + .kw_args = .{}, + }; + break :blk &new_inst.base; + }, + .call => blk: { - const old_inst = inst.cast(ir.Inst.Call).?; + const old_inst = inst.castTag(.call).?; const new_inst = try self.arena.allocator.create(Inst.Call); - const args = try self.arena.allocator.alloc(*Inst, old_inst.args.args.len); + const args = try self.arena.allocator.alloc(*Inst, old_inst.args.len); for (args) |*elem, i| { - elem.* = try self.resolveInst(new_body, old_inst.args.args[i]); + elem.* = try self.resolveInst(new_body, old_inst.args[i]); } new_inst.* = .{ .base = .{ @@ -1773,48 +1820,31 @@ const EmitZIR = struct { .tag = Inst.Call.base_tag, }, .positionals = .{ - .func = try self.resolveInst(new_body, old_inst.args.func), + .func = try self.resolveInst(new_body, old_inst.func), .args = args, }, .kw_args = .{}, }; break :blk &new_inst.base; }, - .unreach => try self.emitTrivial(inst.src, Inst.Unreachable), - .ret => blk: { - const old_inst = inst.cast(ir.Inst.Ret).?; - const new_inst = try self.arena.allocator.create(Inst.Return); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.Return.base_tag, - }, - .positionals = .{ - .operand = try self.resolveInst(new_body, old_inst.args.operand), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .retvoid => try self.emitTrivial(inst.src, Inst.ReturnVoid), - .constant => unreachable, // excluded from function bodies + .assembly => blk: { - const old_inst = inst.cast(ir.Inst.Assembly).?; + const old_inst = inst.castTag(.assembly).?; const new_inst = try self.arena.allocator.create(Inst.Asm); - const inputs = try self.arena.allocator.alloc(*Inst, old_inst.args.inputs.len); + const inputs = try self.arena.allocator.alloc(*Inst, old_inst.inputs.len); for (inputs) |*elem, i| { - elem.* = (try self.emitStringLiteral(inst.src, old_inst.args.inputs[i])).inst; + elem.* = (try self.emitStringLiteral(inst.src, old_inst.inputs[i])).inst; } - const clobbers = try self.arena.allocator.alloc(*Inst, old_inst.args.clobbers.len); + const clobbers = try self.arena.allocator.alloc(*Inst, old_inst.clobbers.len); for (clobbers) |*elem, i| { - elem.* = (try self.emitStringLiteral(inst.src, old_inst.args.clobbers[i])).inst; + elem.* = (try self.emitStringLiteral(inst.src, old_inst.clobbers[i])).inst; } - const args = try self.arena.allocator.alloc(*Inst, old_inst.args.args.len); + const args = try self.arena.allocator.alloc(*Inst, old_inst.args.len); for (args) |*elem, i| { - elem.* = try self.resolveInst(new_body, old_inst.args.args[i]); + elem.* = try self.resolveInst(new_body, old_inst.args[i]); } new_inst.* = .{ @@ -1823,12 +1853,12 @@ const EmitZIR = struct { .tag = Inst.Asm.base_tag, }, .positionals = .{ - .asm_source = (try self.emitStringLiteral(inst.src, old_inst.args.asm_source)).inst, + .asm_source = (try self.emitStringLiteral(inst.src, 
old_inst.asm_source)).inst, .return_type = (try self.emitType(inst.src, inst.ty)).inst, }, .kw_args = .{ - .@"volatile" = old_inst.args.is_volatile, - .output = if (old_inst.args.output) |o| + .@"volatile" = old_inst.is_volatile, + .output = if (old_inst.output) |o| (try self.emitStringLiteral(inst.src, o)).inst else null, @@ -1839,65 +1869,18 @@ const EmitZIR = struct { }; break :blk &new_inst.base; }, - .ptrtoint => blk: { - const old_inst = inst.cast(ir.Inst.PtrToInt).?; - const new_inst = try self.arena.allocator.create(Inst.PtrToInt); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.PtrToInt.base_tag, - }, - .positionals = .{ - .ptr = try self.resolveInst(new_body, old_inst.args.ptr), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .bitcast => blk: { - const old_inst = inst.cast(ir.Inst.BitCast).?; - const new_inst = try self.arena.allocator.create(Inst.BitCast); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.BitCast.base_tag, - }, - .positionals = .{ - .dest_type = (try self.emitType(inst.src, inst.ty)).inst, - .operand = try self.resolveInst(new_body, old_inst.args.operand), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .cmp => blk: { - const old_inst = inst.cast(ir.Inst.Cmp).?; - const new_inst = try self.arena.allocator.create(Inst.Cmp); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.Cmp.base_tag, - }, - .positionals = .{ - .lhs = try self.resolveInst(new_body, old_inst.args.lhs), - .rhs = try self.resolveInst(new_body, old_inst.args.rhs), - .op = old_inst.args.op, - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, + .condbr => blk: { - const old_inst = inst.cast(ir.Inst.CondBr).?; + const old_inst = inst.castTag(.condbr).?; - var true_body = std.ArrayList(*Inst).init(self.allocator); - var false_body = std.ArrayList(*Inst).init(self.allocator); + var then_body = std.ArrayList(*Inst).init(self.allocator); + var else_body = std.ArrayList(*Inst).init(self.allocator); - defer true_body.deinit(); - defer false_body.deinit(); + defer then_body.deinit(); + defer else_body.deinit(); - try self.emitBody(old_inst.args.true_body, inst_table, &true_body); - try self.emitBody(old_inst.args.false_body, inst_table, &false_body); + try self.emitBody(old_inst.then_body, inst_table, &then_body); + try self.emitBody(old_inst.else_body, inst_table, &else_body); const new_inst = try self.arena.allocator.create(Inst.CondBr); new_inst.* = .{ @@ -1906,39 +1889,9 @@ const EmitZIR = struct { .tag = Inst.CondBr.base_tag, }, .positionals = .{ - .condition = try self.resolveInst(new_body, old_inst.args.condition), - .true_body = .{ .instructions = true_body.toOwnedSlice() }, - .false_body = .{ .instructions = false_body.toOwnedSlice() }, - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .isnull => blk: { - const old_inst = inst.cast(ir.Inst.IsNull).?; - const new_inst = try self.arena.allocator.create(Inst.IsNull); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.IsNull.base_tag, - }, - .positionals = .{ - .operand = try self.resolveInst(new_body, old_inst.args.operand), - }, - .kw_args = .{}, - }; - break :blk &new_inst.base; - }, - .isnonnull => blk: { - const old_inst = inst.cast(ir.Inst.IsNonNull).?; - const new_inst = try self.arena.allocator.create(Inst.IsNonNull); - new_inst.* = .{ - .base = .{ - .src = inst.src, - .tag = Inst.IsNonNull.base_tag, - }, - .positionals = .{ - .operand = try self.resolveInst(new_body, old_inst.args.operand), + .condition = try 
self.resolveInst(new_body, old_inst.condition), + .then_body = .{ .instructions = then_body.toOwnedSlice() }, + .else_body = .{ .instructions = else_body.toOwnedSlice() }, }, .kw_args = .{}, }; diff --git a/test/stage2/compare_output.zig b/test/stage2/compare_output.zig index 6a6772f935..d013573e30 100644 --- a/test/stage2/compare_output.zig +++ b/test/stage2/compare_output.zig @@ -267,5 +267,42 @@ pub fn addCases(ctx: *TestContext) !void { , "", ); + + // Requires a second move. The register allocator should figure out to re-use rax. + case.addCompareOutput( + \\export fn _start() noreturn { + \\ add(3, 4); + \\ + \\ exit(); + \\} + \\ + \\fn add(a: u32, b: u32) void { + \\ const c = a + b; // 7 + \\ const d = a + c; // 10 + \\ const e = d + b; // 14 + \\ const f = d + e; // 24 + \\ const g = e + f; // 38 + \\ const h = f + g; // 62 + \\ const i = g + h; // 100 + \\ const j = i + d; // 110 + \\ assert(j == 110); + \\} + \\ + \\pub fn assert(ok: bool) void { + \\ if (!ok) unreachable; // assertion failure + \\} + \\ + \\fn exit() noreturn { + \\ asm volatile ("syscall" + \\ : + \\ : [number] "{rax}" (231), + \\ [arg1] "{rdi}" (0) + \\ : "rcx", "r11", "memory" + \\ ); + \\ unreachable; + \\} + , + "", + ); } }
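
The core of the ir.zig change is the new instruction memory layout: per-instruction `args` structs are replaced by shared `NoOp`/`UnOp`/`BinOp` payloads, `Tag.Type()` maps each tag to its payload type, `castTag` replaces most uses of `cast`, and `operandCount`/`getOperand` give every pass one uniform way to walk operands — which is what the new death-processing loops in codegen's `genBody` and liveness's `analyzeInst` rely on. A rough, self-contained sketch of that shape (not part of the patch, written against the same mid-2020 stage2 Zig this branch targets, with made-up tags chosen only for illustration):

const std = @import("std");

// Miniature version of the layout: one shared Tag, a comptime tag -> payload-type
// mapping, and a generic operand iterator.
const Inst = struct {
    tag: Tag,

    const Tag = enum {
        retvoid,
        not,
        add,

        // Several tags can share one payload type, as the patch's doc comment notes.
        fn Type(tag: Tag) type {
            return switch (tag) {
                .retvoid => NoOp,
                .not => UnOp,
                .add => BinOp,
            };
        }
    };

    const NoOp = struct { base: Inst };
    const UnOp = struct { base: Inst, operand: *Inst };
    const BinOp = struct { base: Inst, lhs: *Inst, rhs: *Inst };

    fn castTag(base: *Inst, comptime tag: Tag) ?*tag.Type() {
        if (base.tag == tag) return @fieldParentPtr(tag.Type(), "base", base);
        return null;
    }

    // Hand-written dispatch here; the real Inst.getOperand does the same thing
    // with an `inline for` over the Tag fields.
    fn getOperand(base: *Inst, index: usize) ?*Inst {
        switch (base.tag) {
            .retvoid => return null,
            .not => return if (index == 0) base.castTag(.not).?.operand else null,
            .add => {
                const bin = base.castTag(.add).?;
                return switch (index) {
                    0 => bin.lhs,
                    1 => bin.rhs,
                    else => null,
                };
            },
        }
    }
};

pub fn main() void {
    var a = Inst.NoOp{ .base = .{ .tag = .retvoid } }; // stand-in operands
    var b = Inst.NoOp{ .base = .{ .tag = .retvoid } };
    var add = Inst.BinOp{ .base = .{ .tag = .add }, .lhs = &a.base, .rhs = &b.base };

    // The same loop shape genBody and analyzeInst use to visit operands in order.
    var i: usize = 0;
    while (add.base.getOperand(i)) |operand| : (i += 1) {
        std.debug.assert(operand.tag == .retvoid);
    }
    std.debug.assert(i == 2);
}

Consumers never switch on the concrete payload type for this: they index operands until `getOperand` returns null, and in codegen each index is paired with `operandDies(i)` so that `processDeath` can free the operand's register (stack-slot death is still marked TODO in the patch).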