x64: implement matching SSE instructions for generic cross-comp target

2025-12-17 03:33:06 +00:00 · 2022-05-20 13:00:59 +02:00 · 2022-05-20 13:00:59 +02:00 · 274654d73e
commit 274654d73e
parent 0e43d007c0
4 changed files with 384 additions and 160 deletions
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@ -881,7 +881,7 @@ fn allocRegOrMem(self: *Self, inst: Air.Inst.Index, reg_ok: bool) !MCValue {
        switch (elem_ty.zigTypeTag()) {
            .Vector => return self.fail("TODO allocRegOrMem for Vector type", .{}),
            .Float => {
-                if (self.intrinsicsAllowed(elem_ty)) {
+                if (intrinsicsAllowed(self.target.*, elem_ty)) {
                    const ptr_bytes: u64 = 32;
                    if (abi_size <= ptr_bytes) {
                        if (self.register_manager.tryAllocReg(inst, sse)) |reg| {
@ -970,7 +970,7 @@ pub fn spillRegisters(self: *Self, comptime count: comptime_int, registers: [cou
 fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register {
    const reg_class: RegisterManager.RegisterBitSet = switch (ty.zigTypeTag()) {
        .Float => blk: {
-            if (self.intrinsicsAllowed(ty)) break :blk sse;
+            if (intrinsicsAllowed(self.target.*, ty)) break :blk sse;
            return self.fail("TODO copy {} to register", .{ty.fmtDebug()});
        },
        else => gp,
@ -987,7 +987,7 @@ fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register {
 fn copyToRegisterWithInstTracking(self: *Self, reg_owner: Air.Inst.Index, ty: Type, mcv: MCValue) !MCValue {
    const reg_class: RegisterManager.RegisterBitSet = switch (ty.zigTypeTag()) {
        .Float => blk: {
-            if (self.intrinsicsAllowed(ty)) break :blk sse;
+            if (intrinsicsAllowed(self.target.*, ty)) break :blk sse;
            return self.fail("TODO copy {} to register", .{ty.fmtDebug()});
        },
        else => gp,
@ -3462,16 +3462,28 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, dst_ty: Type, dst_mcv: MCValu
                },
                .register => |src_reg| switch (dst_ty.zigTypeTag()) {
                    .Float => {
-                        if (self.intrinsicsAllowed(dst_ty)) {
+                        if (intrinsicsAllowed(self.target.*, dst_ty)) {
                            const actual_tag: Mir.Inst.Tag = switch (dst_ty.tag()) {
                                .f32 => switch (mir_tag) {
-                                    .add => Mir.Inst.Tag.add_f32_avx,
+                                    .add => if (hasAvxSupport(self.target.*))
-                                    .cmp => Mir.Inst.Tag.cmp_f32_avx,
+                                        Mir.Inst.Tag.add_f32_avx
                                    else
                                        Mir.Inst.Tag.add_f32_sse,
                                    .cmp => if (hasAvxSupport(self.target.*))
                                        Mir.Inst.Tag.cmp_f32_avx
                                    else
                                        Mir.Inst.Tag.cmp_f32_sse,
                                    else => return self.fail("TODO genBinOpMir for f32 register-register with MIR tag {}", .{mir_tag}),
                                },
                                .f64 => switch (mir_tag) {
-                                    .add => Mir.Inst.Tag.add_f64_avx,
+                                    .add => if (hasAvxSupport(self.target.*))
-                                    .cmp => Mir.Inst.Tag.cmp_f64_avx,
+                                        Mir.Inst.Tag.add_f64_avx
                                    else
                                        Mir.Inst.Tag.add_f64_sse,
                                    .cmp => if (hasAvxSupport(self.target.*))
                                        Mir.Inst.Tag.cmp_f64_avx
                                    else
                                        Mir.Inst.Tag.cmp_f64_sse,
                                    else => return self.fail("TODO genBinOpMir for f64 register-register with MIR tag {}", .{mir_tag}),
                                },
                                else => return self.fail("TODO genBinOpMir for float register-register and type {}", .{dst_ty.fmtDebug()}),
@ -5324,10 +5336,16 @@ fn genSetStackArg(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue) InnerE
        .register => |reg| {
            switch (ty.zigTypeTag()) {
                .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                        const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
-                            .f64 => .mov_f64_avx,
+                                Mir.Inst.Tag.mov_f32_avx
                            else
                                Mir.Inst.Tag.mov_f32_sse,
                            .f64 => if (hasAvxSupport(self.target.*))
                                Mir.Inst.Tag.mov_f64_avx
                            else
                                Mir.Inst.Tag.mov_f64_sse,
                            else => return self.fail("TODO genSetStackArg for register for type {}", .{ty.fmtDebug()}),
                        };
                        _ = try self.addInst(.{
@ -5508,10 +5526,16 @@ fn genSetStack(self: *Self, ty: Type, stack_offset: i32, mcv: MCValue, opts: Inl
            switch (ty.zigTypeTag()) {
                .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                        const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
-                            .f64 => .mov_f64_avx,
+                                Mir.Inst.Tag.mov_f32_avx
                            else
                                Mir.Inst.Tag.mov_f32_sse,
                            .f64 => if (hasAvxSupport(self.target.*))
                                Mir.Inst.Tag.mov_f64_avx
                            else
                                Mir.Inst.Tag.mov_f64_sse,
                            else => return self.fail("TODO genSetStack for register for type {}", .{ty.fmtDebug()}),
                        };
                        _ = try self.addInst(.{
@ -6032,10 +6056,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                    },
                },
                .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                        const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
-                            .f64 => .mov_f64_avx,
+                                Mir.Inst.Tag.mov_f32_avx
                            else
                                Mir.Inst.Tag.mov_f32_sse,
                            .f64 => if (hasAvxSupport(self.target.*))
                                Mir.Inst.Tag.mov_f64_avx
                            else
                                Mir.Inst.Tag.mov_f64_sse,
                            else => return self.fail("TODO genSetReg from register for {}", .{ty.fmtDebug()}),
                        };
                        _ = try self.addInst(.{
@ -6072,10 +6102,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                    const base_reg = try self.register_manager.allocReg(null, gp);
                    try self.loadMemPtrIntoRegister(base_reg, Type.usize, mcv);
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                        const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
-                            .f64 => .mov_f64_avx,
+                                Mir.Inst.Tag.mov_f32_avx
                            else
                                Mir.Inst.Tag.mov_f32_sse,
                            .f64 => if (hasAvxSupport(self.target.*))
                                Mir.Inst.Tag.mov_f64_avx
                            else
                                Mir.Inst.Tag.mov_f64_sse,
                            else => return self.fail("TODO genSetReg from memory for {}", .{ty.fmtDebug()}),
                        };
@ -6115,10 +6151,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                const base_reg = try self.register_manager.allocReg(null, gp);
                try self.loadMemPtrIntoRegister(base_reg, Type.usize, mcv);
-                if (self.intrinsicsAllowed(ty)) {
+                if (intrinsicsAllowed(self.target.*, ty)) {
                    const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                        .f32 => .mov_f32_avx,
+                        .f32 => if (hasAvxSupport(self.target.*))
-                        .f64 => .mov_f64_avx,
+                            Mir.Inst.Tag.mov_f32_avx
                        else
                            Mir.Inst.Tag.mov_f32_sse,
                        .f64 => if (hasAvxSupport(self.target.*))
                            Mir.Inst.Tag.mov_f64_avx
                        else
                            Mir.Inst.Tag.mov_f64_sse,
                        else => return self.fail("TODO genSetReg from memory for {}", .{ty.fmtDebug()}),
                    };
@ -6230,10 +6272,16 @@ fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void
                    },
                },
                .Float => {
-                    if (self.intrinsicsAllowed(ty)) {
+                    if (intrinsicsAllowed(self.target.*, ty)) {
                        const tag: Mir.Inst.Tag = switch (ty.tag()) {
-                            .f32 => .mov_f32_avx,
+                            .f32 => if (hasAvxSupport(self.target.*))
-                            .f64 => .mov_f64_avx,
+                                Mir.Inst.Tag.mov_f32_avx
                            else
                                Mir.Inst.Tag.mov_f32_sse,
                            .f64 => if (hasAvxSupport(self.target.*))
                                Mir.Inst.Tag.mov_f64_avx
                            else
                                Mir.Inst.Tag.mov_f64_sse,
                            else => return self.fail("TODO genSetReg from stack offset for {}", .{ty.fmtDebug()}),
                        };
                        _ = try self.addInst(.{
@ -7046,11 +7094,15 @@ fn truncateRegister(self: *Self, ty: Type, reg: Register) !void {
    }
 }
-fn intrinsicsAllowed(self: *Self, ty: Type) bool {
+fn intrinsicsAllowed(target: Target, ty: Type) bool {
    // For f32 and f64 the answer is whether the target CPU feature set contains
    // any of the features listed in the arm below.
    return switch (ty.tag()) {
        .f32,
        .f64,
-        => Target.x86.featureSetHasAny(self.target.cpu.features, .{ .avx, .avx2 }),
+        => Target.x86.featureSetHasAny(target.cpu.features, .{ .sse2, .avx, .avx2 }),
        // Every other type tag hits `unreachable`, so callers must only pass
        // f32 or f64 until the TODO below is resolved.
        else => unreachable, // TODO finish this off
    };
 }
 /// Reports whether the CPU feature set of `target` contains `avx` or `avx2`.
 fn hasAvxSupport(target: Target) bool {
    const cpu_features = target.cpu.features;
    return Target.x86.featureSetHasAny(cpu_features, .{ .avx, .avx2 });
 }
--- a/src/arch/x86_64/Emit.zig
+++ b/src/arch/x86_64/Emit.zig
@ -182,6 +182,16 @@ pub fn lowerMir(emit: *Emit) InnerError!void {
            .interrupt => try emit.mirInterrupt(inst),
            .nop => try emit.mirNop(),
            // SSE instructions
            .mov_f64_sse => try emit.mirMovFloatSse(.movsd, inst),
            .mov_f32_sse => try emit.mirMovFloatSse(.movss, inst),
            .add_f64_sse => try emit.mirAddFloatSse(.addsd, inst),
            .add_f32_sse => try emit.mirAddFloatSse(.addss, inst),
            .cmp_f64_sse => try emit.mirCmpFloatSse(.ucomisd, inst),
            .cmp_f32_sse => try emit.mirCmpFloatSse(.ucomiss, inst),
            // AVX instructions
            .mov_f64_avx => try emit.mirMovFloatAvx(.vmovsd, inst),
            .mov_f32_avx => try emit.mirMovFloatAvx(.vmovss, inst),
@ -536,6 +546,7 @@ fn mirArithMemImm(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
 }
 inline fn setRexWRegister(reg: Register) bool {
    if (reg.size() > 64) return false;
    if (reg.size() == 64) return true;
    return switch (reg) {
        .ah, .ch, .dh, .bh => true,
@ -963,11 +974,55 @@ fn mirLeaPie(emit: *Emit, inst: Mir.Inst.Index) InnerError!void {
    }
 }
 // SSE instructions
 /// Lowers an SSE scalar float move, picking the operand form from `ops.flags`:
 ///   0b00: register `reg1` and a memory operand based at `reg2` plus the `imm` displacement (RM encoding)
 ///   0b01: a memory operand based at `reg1` plus the `imm` displacement and register `reg2` (MR encoding)
 ///   0b10: register `reg1` and register `reg2` (RM encoding)
 /// Any other flag value is reported through `emit.fail`.
 fn mirMovFloatSse(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
    const ops = emit.mir.instructions.items(.ops)[inst].decode();
    switch (ops.flags) {
        0b00 => {
            const disp = emit.mir.instructions.items(.data)[inst].imm;
            // The pointer size is derived from the size of the base register.
            const ptr_size = Memory.PtrSize.new(ops.reg2.size());
            const src_mem = RegisterOrMemory.mem(ptr_size, .{ .disp = disp, .base = ops.reg2 });
            return lowerToRmEnc(tag, ops.reg1, src_mem, emit.code);
        },
        0b01 => {
            const disp = emit.mir.instructions.items(.data)[inst].imm;
            // The pointer size is derived from the size of the base register.
            const ptr_size = Memory.PtrSize.new(ops.reg1.size());
            const dst_mem = RegisterOrMemory.mem(ptr_size, .{ .disp = disp, .base = ops.reg1 });
            return lowerToMrEnc(tag, dst_mem, ops.reg2, emit.code);
        },
        0b10 => {
            const src_reg = RegisterOrMemory.reg(ops.reg2);
            return lowerToRmEnc(tag, ops.reg1, src_reg, emit.code);
        },
        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }),
    }
 }
 /// Lowers an SSE scalar float add. Only the register-register form
 /// (`ops.flags == 0b00`) is handled; it is emitted with the RM encoding using
 /// `reg1` and `reg2`. Any other flag value is reported through `emit.fail`.
 fn mirAddFloatSse(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
    const ops = emit.mir.instructions.items(.ops)[inst].decode();
    if (ops.flags != 0b00) {
        return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag });
    }
    const rhs = RegisterOrMemory.reg(ops.reg2);
    return lowerToRmEnc(tag, ops.reg1, rhs, emit.code);
 }
 /// Lowers an SSE scalar float compare. Only the register-register form
 /// (`ops.flags == 0b00`) is handled; it is emitted with the RM encoding using
 /// `reg1` and `reg2`. Any other flag value is reported through `emit.fail`.
 fn mirCmpFloatSse(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
    const ops = emit.mir.instructions.items(.ops)[inst].decode();
    if (ops.flags != 0b00) {
        return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag });
    }
    const rhs = RegisterOrMemory.reg(ops.reg2);
    return lowerToRmEnc(tag, ops.reg1, rhs, emit.code);
 }
 // AVX instructions
 fn mirMovFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
    const ops = emit.mir.instructions.items(.ops)[inst].decode();
    switch (ops.flags) {
        0b00 => {
            const imm = emit.mir.instructions.items(.data)[inst].imm;
@ -986,24 +1041,22 @@ fn mirMovFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
        0b10 => {
            return lowerToRvmEnc(tag, ops.reg1, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
        },
-        else => return emit.fail("TODO unused variant 0b{b} for mov_f64", .{ops.flags}),
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }),
    }
 }
 /// Lowers an AVX scalar float add. Only the register-register form
 /// (`ops.flags == 0b00`) is handled: `reg1` is passed to `lowerToRvmEnc` as
 /// both the first and the second register operand, with `reg2` as the third.
 /// Any other flag value is reported through `emit.fail`.
 fn mirAddFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
    const ops = emit.mir.instructions.items(.ops)[inst].decode();
    switch (ops.flags) {
        0b00 => {
            return lowerToRvmEnc(tag, ops.reg1, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
        },
-        else => return emit.fail("TODO unused variant 0b{b} for mov_f64", .{ops.flags}),
+        else => return emit.fail("TODO unused variant 0b{b} for {}", .{ ops.flags, tag }),
    }
 }
 fn mirCmpFloatAvx(emit: *Emit, tag: Tag, inst: Mir.Inst.Index) InnerError!void {
    const ops = emit.mir.instructions.items(.ops)[inst].decode();
    switch (ops.flags) {
        0b00 => {
            return lowerToVmEnc(tag, ops.reg1, RegisterOrMemory.reg(ops.reg2), emit.code);
@ -1247,6 +1300,14 @@ const Tag = enum {
    cmovng,
    cmovb,
    cmovnae,
    movsd,
    movss,
    addsd,
    addss,
    cmpsd,
    cmpss,
    ucomisd,
    ucomiss,
    vmovsd,
    vmovss,
    vaddsd,
@ -1256,6 +1317,22 @@ const Tag = enum {
    vucomisd,
    vucomiss,
    /// Returns true when `tag` is one of the SSE mnemonics listed below
    /// (scalar move, add, compare and unordered compare in their single and
    /// double precision forms), and false for every other tag.
    fn isSse(tag: Tag) bool {
        switch (tag) {
            .movsd, .movss, .addsd, .addss, .cmpsd, .cmpss, .ucomisd, .ucomiss => return true,
            else => return false,
        }
    }
    fn isAvx(tag: Tag) bool {
        return switch (tag) {
            .vmovsd,
@ -1369,190 +1446,256 @@ const Encoding = enum {
    rvmi,
 };
-const OpCode = union(enum) {
+const OpCode = struct {
-    one_byte: u8,
+    bytes: [3]u8,
-    two_byte: struct { _1: u8, _2: u8 },
+    count: usize,
-    fn oneByte(opc: u8) OpCode {
+    fn init(comptime in_bytes: []const u8) OpCode {
-        return .{ .one_byte = opc };
+        comptime assert(in_bytes.len <= 3);
        // Copy the given bytes into the fixed-size array; slots at index
        // `in_bytes.len` and above are left undefined.
        comptime var bytes: [3]u8 = undefined;
        inline for (in_bytes) |x, i| {
            bytes[i] = x;
        }
-
+        return .{ .bytes = bytes, .count = in_bytes.len };
    fn twoByte(opc1: u8, opc2: u8) OpCode {
        return .{ .two_byte = .{ ._1 = opc1, ._2 = opc2 } };
    }
    fn encode(opc: OpCode, encoder: Encoder) void {
        // In the struct version, dispatch on the number of stored opcode bytes;
        // a count other than 1, 2 or 3 is treated as unreachable.
-        switch (opc) {
+        switch (opc.count) {
-            .one_byte => |v| encoder.opcode_1byte(v),
+            1 => encoder.opcode_1byte(opc.bytes[0]),
-            .two_byte => |v| encoder.opcode_2byte(v._1, v._2),
+            2 => encoder.opcode_2byte(opc.bytes[0], opc.bytes[1]),
            3 => encoder.opcode_3byte(opc.bytes[0], opc.bytes[1], opc.bytes[2]),
            else => unreachable,
        }
    }
    fn encodeWithReg(opc: OpCode, encoder: Encoder, reg: Register) void {
        // Only valid for single-byte opcodes (asserted below); the opcode byte
        // is handed to the encoder together with `reg.lowEnc()`.
-        assert(opc == .one_byte);
+        assert(opc.count == 1);
-        encoder.opcode_withReg(opc.one_byte, reg.lowEnc());
+        encoder.opcode_withReg(opc.bytes[0], reg.lowEnc());
    }
 };
 inline fn getOpCode(tag: Tag, enc: Encoding, is_one_byte: bool) OpCode {
    // zig fmt: off
    switch (enc) {
        .zo => return switch (tag) {
-            .ret_near => OpCode.oneByte(0xc3),
+            .ret_near => OpCode.init(&.{0xc3}),
-            .ret_far => OpCode.oneByte(0xcb),
+            .ret_far  => OpCode.init(&.{0xcb}),
-            .int3 => OpCode.oneByte(0xcc),
+            .int3     => OpCode.init(&.{0xcc}),
-            .nop => OpCode.oneByte(0x90),
+            .nop      => OpCode.init(&.{0x90}),
-            .syscall => OpCode.twoByte(0x0f, 0x05),
+            .syscall  => OpCode.init(&.{ 0x0f, 0x05 }),
-            .cbw => OpCode.oneByte(0x98),
+            .cbw      => OpCode.init(&.{0x98}),
-            .cwd, .cdq, .cqo => OpCode.oneByte(0x99),
+            .cwd,
            .cdq,
            .cqo      => OpCode.init(&.{0x99}),
            else      => unreachable,
        },
        .d => return switch (tag) {
-            .jmp_near => OpCode.oneByte(0xe9),
+            .jmp_near  =>                  OpCode.init(&.{0xe9}),
-            .call_near => OpCode.oneByte(0xe8),
+            .call_near =>                  OpCode.init(&.{0xe8}),
-            .jo => if (is_one_byte) OpCode.oneByte(0x70) else OpCode.twoByte(0x0f, 0x80),
+            .jo        => if (is_one_byte) OpCode.init(&.{0x70}) else OpCode.init(&.{0x0f,0x80}),
-            .jno => if (is_one_byte) OpCode.oneByte(0x71) else OpCode.twoByte(0x0f, 0x81),
+            .jno       => if (is_one_byte) OpCode.init(&.{0x71}) else OpCode.init(&.{0x0f,0x81}),
-            .jb, .jc, .jnae => if (is_one_byte) OpCode.oneByte(0x72) else OpCode.twoByte(0x0f, 0x82),
+            .jb,
-            .jnb, .jnc, .jae => if (is_one_byte) OpCode.oneByte(0x73) else OpCode.twoByte(0x0f, 0x83),
+            .jc,
-            .je, .jz => if (is_one_byte) OpCode.oneByte(0x74) else OpCode.twoByte(0x0f, 0x84),
+            .jnae      => if (is_one_byte) OpCode.init(&.{0x72}) else OpCode.init(&.{0x0f,0x82}),
-            .jne, .jnz => if (is_one_byte) OpCode.oneByte(0x75) else OpCode.twoByte(0x0f, 0x85),
+            .jnb,
-            .jna, .jbe => if (is_one_byte) OpCode.oneByte(0x76) else OpCode.twoByte(0x0f, 0x86),
+            .jnc, 
-            .jnbe, .ja => if (is_one_byte) OpCode.oneByte(0x77) else OpCode.twoByte(0x0f, 0x87),
+            .jae       => if (is_one_byte) OpCode.init(&.{0x73}) else OpCode.init(&.{0x0f,0x83}),
-            .js => if (is_one_byte) OpCode.oneByte(0x78) else OpCode.twoByte(0x0f, 0x88),
+            .je, 
-            .jns => if (is_one_byte) OpCode.oneByte(0x79) else OpCode.twoByte(0x0f, 0x89),
+            .jz        => if (is_one_byte) OpCode.init(&.{0x74}) else OpCode.init(&.{0x0f,0x84}),
-            .jpe, .jp => if (is_one_byte) OpCode.oneByte(0x7a) else OpCode.twoByte(0x0f, 0x8a),
+            .jne, 
-            .jpo, .jnp => if (is_one_byte) OpCode.oneByte(0x7b) else OpCode.twoByte(0x0f, 0x8b),
+            .jnz       => if (is_one_byte) OpCode.init(&.{0x75}) else OpCode.init(&.{0x0f,0x85}),
-            .jnge, .jl => if (is_one_byte) OpCode.oneByte(0x7c) else OpCode.twoByte(0x0f, 0x8c),
+            .jna, 
-            .jge, .jnl => if (is_one_byte) OpCode.oneByte(0x7d) else OpCode.twoByte(0x0f, 0x8d),
+            .jbe       => if (is_one_byte) OpCode.init(&.{0x76}) else OpCode.init(&.{0x0f,0x86}),
-            .jle, .jng => if (is_one_byte) OpCode.oneByte(0x7e) else OpCode.twoByte(0x0f, 0x8e),
+            .jnbe, 
-            .jg, .jnle => if (is_one_byte) OpCode.oneByte(0x7f) else OpCode.twoByte(0x0f, 0x8f),
+            .ja        => if (is_one_byte) OpCode.init(&.{0x77}) else OpCode.init(&.{0x0f,0x87}),
            .js        => if (is_one_byte) OpCode.init(&.{0x78}) else OpCode.init(&.{0x0f,0x88}),
            .jns       => if (is_one_byte) OpCode.init(&.{0x79}) else OpCode.init(&.{0x0f,0x89}),
            .jpe, 
            .jp        => if (is_one_byte) OpCode.init(&.{0x7a}) else OpCode.init(&.{0x0f,0x8a}),
            .jpo, 
            .jnp       => if (is_one_byte) OpCode.init(&.{0x7b}) else OpCode.init(&.{0x0f,0x8b}),
            .jnge, 
            .jl        => if (is_one_byte) OpCode.init(&.{0x7c}) else OpCode.init(&.{0x0f,0x8c}),
            .jge, 
            .jnl       => if (is_one_byte) OpCode.init(&.{0x7d}) else OpCode.init(&.{0x0f,0x8d}),
            .jle, 
            .jng       => if (is_one_byte) OpCode.init(&.{0x7e}) else OpCode.init(&.{0x0f,0x8e}),
            .jg, 
            .jnle      => if (is_one_byte) OpCode.init(&.{0x7f}) else OpCode.init(&.{0x0f,0x8f}),
            else       => unreachable,
        },
        .m => return switch (tag) {
-            .jmp_near, .call_near, .push => OpCode.oneByte(0xff),
+            .jmp_near,
-            .pop => OpCode.oneByte(0x8f),
+            .call_near,
-            .seto => OpCode.twoByte(0x0f, 0x90),
+            .push       =>                  OpCode.init(&.{0xff}),
-            .setno => OpCode.twoByte(0x0f, 0x91),
+            .pop        =>                  OpCode.init(&.{0x8f}),
-            .setb, .setc, .setnae => OpCode.twoByte(0x0f, 0x92),
+            .seto       =>                  OpCode.init(&.{0x0f,0x90}),
-            .setnb, .setnc, .setae => OpCode.twoByte(0x0f, 0x93),
+            .setno      =>                  OpCode.init(&.{0x0f,0x91}),
-            .sete, .setz => OpCode.twoByte(0x0f, 0x94),
+            .setb,
-            .setne, .setnz => OpCode.twoByte(0x0f, 0x95),
+            .setc,
-            .setbe, .setna => OpCode.twoByte(0x0f, 0x96),
+            .setnae     =>                  OpCode.init(&.{0x0f,0x92}),
-            .seta, .setnbe => OpCode.twoByte(0x0f, 0x97),
+            .setnb,
-            .sets => OpCode.twoByte(0x0f, 0x98),
+            .setnc,
-            .setns => OpCode.twoByte(0x0f, 0x99),
+            .setae      =>                  OpCode.init(&.{0x0f,0x93}),
-            .setp, .setpe => OpCode.twoByte(0x0f, 0x9a),
+            .sete,
-            .setnp, .setop => OpCode.twoByte(0x0f, 0x9b),
+            .setz       =>                  OpCode.init(&.{0x0f,0x94}),
-            .setl, .setnge => OpCode.twoByte(0x0f, 0x9c),
+            .setne,
-            .setnl, .setge => OpCode.twoByte(0x0f, 0x9d),
+            .setnz      =>                  OpCode.init(&.{0x0f,0x95}),
-            .setle, .setng => OpCode.twoByte(0x0f, 0x9e),
+            .setbe,
-            .setnle, .setg => OpCode.twoByte(0x0f, 0x9f),
+            .setna      =>                  OpCode.init(&.{0x0f,0x96}),
-            .idiv, .div, .imul, .mul => OpCode.oneByte(if (is_one_byte) 0xf6 else 0xf7),
+            .seta,
-            .fisttp16 => OpCode.oneByte(0xdf),
+            .setnbe     =>                  OpCode.init(&.{0x0f,0x97}),
-            .fisttp32 => OpCode.oneByte(0xdb),
+            .sets       =>                  OpCode.init(&.{0x0f,0x98}),
-            .fisttp64 => OpCode.oneByte(0xdd),
+            .setns      =>                  OpCode.init(&.{0x0f,0x99}),
-            .fld32 => OpCode.oneByte(0xd9),
+            .setp,
-            .fld64 => OpCode.oneByte(0xdd),
+            .setpe      =>                  OpCode.init(&.{0x0f,0x9a}),
            .setnp, 
            .setop      =>                  OpCode.init(&.{0x0f,0x9b}),
            .setl, 
            .setnge     =>                  OpCode.init(&.{0x0f,0x9c}),
            .setnl,
            .setge      =>                  OpCode.init(&.{0x0f,0x9d}),
            .setle,
            .setng      =>                  OpCode.init(&.{0x0f,0x9e}),
            .setnle,
            .setg       =>                  OpCode.init(&.{0x0f,0x9f}),
            .idiv,
            .div,
            .imul,
            .mul        => if (is_one_byte) OpCode.init(&.{0xf6}) else OpCode.init(&.{0xf7}),
            .fisttp16   =>                  OpCode.init(&.{0xdf}),
            .fisttp32   =>                  OpCode.init(&.{0xdb}),
            .fisttp64   =>                  OpCode.init(&.{0xdd}),
            .fld32      =>                  OpCode.init(&.{0xd9}),
            .fld64      =>                  OpCode.init(&.{0xdd}),
            else        => unreachable,
        },
        .o => return switch (tag) {
-            .push => OpCode.oneByte(0x50),
+            .push => OpCode.init(&.{0x50}),
-            .pop => OpCode.oneByte(0x58),
+            .pop  => OpCode.init(&.{0x58}),
            else  => unreachable,
        },
        .i => return switch (tag) {
-            .push => OpCode.oneByte(if (is_one_byte) 0x6a else 0x68),
+            .push     => if (is_one_byte) OpCode.init(&.{0x6a}) else OpCode.init(&.{0x68}),
-            .@"test" => OpCode.oneByte(if (is_one_byte) 0xa8 else 0xa9),
+            .@"test"  => if (is_one_byte) OpCode.init(&.{0xa8}) else OpCode.init(&.{0xa9}),
-            .ret_near => OpCode.oneByte(0xc2),
+            .ret_near => OpCode.init(&.{0xc2}),
-            .ret_far => OpCode.oneByte(0xca),
+            .ret_far  => OpCode.init(&.{0xca}),
            else      => unreachable,
        },
        .m1 => return switch (tag) {
-            .shl, .sal, .shr, .sar => OpCode.oneByte(if (is_one_byte) 0xd0 else 0xd1),
+            .shl, .sal,
            .shr, .sar  => if (is_one_byte) OpCode.init(&.{0xd0}) else OpCode.init(&.{0xd1}),
            else        => unreachable,
        },
        .mc => return switch (tag) {
-            .shl, .sal, .shr, .sar => OpCode.oneByte(if (is_one_byte) 0xd2 else 0xd3),
+            .shl, .sal,
            .shr, .sar  => if (is_one_byte) OpCode.init(&.{0xd2}) else OpCode.init(&.{0xd3}),
            else        => unreachable,
        },
        .mi => return switch (tag) {
-            .adc, .add, .sub, .xor, .@"and", .@"or", .sbb, .cmp => OpCode.oneByte(if (is_one_byte) 0x80 else 0x81),
+            .adc, .add,
-            .mov => OpCode.oneByte(if (is_one_byte) 0xc6 else 0xc7),
+            .sub, .xor,
-            .@"test" => OpCode.oneByte(if (is_one_byte) 0xf6 else 0xf7),
+            .@"and", .@"or",
            .sbb, .cmp       => if (is_one_byte) OpCode.init(&.{0x80}) else OpCode.init(&.{0x81}),
            .mov             => if (is_one_byte) OpCode.init(&.{0xc6}) else OpCode.init(&.{0xc7}),
            .@"test"         => if (is_one_byte) OpCode.init(&.{0xf6}) else OpCode.init(&.{0xf7}),
            else             => unreachable,
        },
        .mi8 => return switch (tag) {
-            .adc, .add, .sub, .xor, .@"and", .@"or", .sbb, .cmp => OpCode.oneByte(0x83),
+            .adc, .add,
-            .shl, .sal, .shr, .sar => OpCode.oneByte(if (is_one_byte) 0xc0 else 0xc1),
+            .sub, .xor,
            .@"and", .@"or",
            .sbb, .cmp        =>                  OpCode.init(&.{0x83}),
            .shl, .sal,
            .shr, .sar        => if (is_one_byte) OpCode.init(&.{0xc0}) else OpCode.init(&.{0xc1}),
            else              => unreachable,
        },
        .mr => return switch (tag) {
-            .adc => OpCode.oneByte(if (is_one_byte) 0x10 else 0x11),
+            .adc     => if (is_one_byte) OpCode.init(&.{0x10}) else OpCode.init(&.{0x11}),
-            .add => OpCode.oneByte(if (is_one_byte) 0x00 else 0x01),
+            .add     => if (is_one_byte) OpCode.init(&.{0x00}) else OpCode.init(&.{0x01}),
-            .sub => OpCode.oneByte(if (is_one_byte) 0x28 else 0x29),
+            .sub     => if (is_one_byte) OpCode.init(&.{0x28}) else OpCode.init(&.{0x29}),
-            .xor => OpCode.oneByte(if (is_one_byte) 0x30 else 0x31),
+            .xor     => if (is_one_byte) OpCode.init(&.{0x30}) else OpCode.init(&.{0x31}),
-            .@"and" => OpCode.oneByte(if (is_one_byte) 0x20 else 0x21),
+            .@"and"  => if (is_one_byte) OpCode.init(&.{0x20}) else OpCode.init(&.{0x21}),
-            .@"or" => OpCode.oneByte(if (is_one_byte) 0x08 else 0x09),
+            .@"or"   => if (is_one_byte) OpCode.init(&.{0x08}) else OpCode.init(&.{0x09}),
-            .sbb => OpCode.oneByte(if (is_one_byte) 0x18 else 0x19),
+            .sbb     => if (is_one_byte) OpCode.init(&.{0x18}) else OpCode.init(&.{0x19}),
-            .cmp => OpCode.oneByte(if (is_one_byte) 0x38 else 0x39),
+            .cmp     => if (is_one_byte) OpCode.init(&.{0x38}) else OpCode.init(&.{0x39}),
-            .mov => OpCode.oneByte(if (is_one_byte) 0x88 else 0x89),
+            .mov     => if (is_one_byte) OpCode.init(&.{0x88}) else OpCode.init(&.{0x89}),
-            .@"test" => OpCode.oneByte(if (is_one_byte) 0x84 else 0x85),
+            .@"test" => if (is_one_byte) OpCode.init(&.{0x84}) else OpCode.init(&.{0x85}),
            .movsd   =>                  OpCode.init(&.{0xf2,0x0f,0x11}),
            .movss   =>                  OpCode.init(&.{0xf3,0x0f,0x11}),
            else     => unreachable,
        },
        .rm => return switch (tag) {
-            .adc => OpCode.oneByte(if (is_one_byte) 0x12 else 0x13),
+            .adc      => if (is_one_byte) OpCode.init(&.{0x12})      else OpCode.init(&.{0x13}),
-            .add => OpCode.oneByte(if (is_one_byte) 0x02 else 0x03),
+            .add      => if (is_one_byte) OpCode.init(&.{0x02})      else OpCode.init(&.{0x03}),
-            .sub => OpCode.oneByte(if (is_one_byte) 0x2a else 0x2b),
+            .sub      => if (is_one_byte) OpCode.init(&.{0x2a})      else OpCode.init(&.{0x2b}),
-            .xor => OpCode.oneByte(if (is_one_byte) 0x32 else 0x33),
+            .xor      => if (is_one_byte) OpCode.init(&.{0x32})      else OpCode.init(&.{0x33}),
-            .@"and" => OpCode.oneByte(if (is_one_byte) 0x22 else 0x23),
+            .@"and"   => if (is_one_byte) OpCode.init(&.{0x22})      else OpCode.init(&.{0x23}),
-            .@"or" => OpCode.oneByte(if (is_one_byte) 0x0a else 0x0b),
+            .@"or"    => if (is_one_byte) OpCode.init(&.{0x0a})      else OpCode.init(&.{0x0b}),
-            .sbb => OpCode.oneByte(if (is_one_byte) 0x1a else 0x1b),
+            .sbb      => if (is_one_byte) OpCode.init(&.{0x1a})      else OpCode.init(&.{0x1b}),
-            .cmp => OpCode.oneByte(if (is_one_byte) 0x3a else 0x3b),
+            .cmp      => if (is_one_byte) OpCode.init(&.{0x3a})      else OpCode.init(&.{0x3b}),
-            .mov => OpCode.oneByte(if (is_one_byte) 0x8a else 0x8b),
+            .mov      => if (is_one_byte) OpCode.init(&.{0x8a})      else OpCode.init(&.{0x8b}),
-            .movsx => OpCode.twoByte(0x0f, if (is_one_byte) 0xbe else 0xbf),
+            .movsx    => if (is_one_byte) OpCode.init(&.{0x0f,0xbe}) else OpCode.init(&.{0x0f,0xbf}),
-            .movsxd => OpCode.oneByte(0x63),
+            .movsxd   =>                  OpCode.init(&.{0x63}),
-            .movzx => OpCode.twoByte(0x0f, if (is_one_byte) 0xb6 else 0xb7),
+            .movzx    => if (is_one_byte) OpCode.init(&.{0x0f,0xb6}) else OpCode.init(&.{0x0f,0xb7}),
-            .lea => OpCode.oneByte(if (is_one_byte) 0x8c else 0x8d),
+            .lea      => if (is_one_byte) OpCode.init(&.{0x8c})      else OpCode.init(&.{0x8d}),
-            .imul => OpCode.twoByte(0x0f, 0xaf),
+            .imul     =>                  OpCode.init(&.{0x0f,0xaf}),
-            .cmove, .cmovz => OpCode.twoByte(0x0f, 0x44),
+            .cmove, 
-            .cmovb, .cmovnae => OpCode.twoByte(0x0f, 0x42),
+            .cmovz    =>                  OpCode.init(&.{0x0f,0x44}),
-            .cmovl, .cmovng => OpCode.twoByte(0x0f, 0x4c),
+            .cmovb,
            .cmovnae  =>                  OpCode.init(&.{0x0f,0x42}),
            .cmovl,
            .cmovng   =>                  OpCode.init(&.{0x0f,0x4c}),
            .movsd    =>                  OpCode.init(&.{0xf2,0x0f,0x10}),
            .movss    =>                  OpCode.init(&.{0xf3,0x0f,0x10}),
            .addsd    =>                  OpCode.init(&.{0xf2,0x0f,0x58}),
            .addss    =>                  OpCode.init(&.{0xf3,0x0f,0x58}),
            .ucomisd  =>                  OpCode.init(&.{0x66,0x0f,0x2e}),
            .ucomiss  =>                  OpCode.init(&.{0x0f,0x2e}),
            else => unreachable,
        },
        .oi => return switch (tag) {
-            .mov => OpCode.oneByte(if (is_one_byte) 0xb0 else 0xb8),
+            .mov => if (is_one_byte) OpCode.init(&.{0xb0}) else OpCode.init(&.{0xb8}),
            else => unreachable,
        },
        .fd => return switch (tag) {
-            .mov => OpCode.oneByte(if (is_one_byte) 0xa0 else 0xa1),
+            .mov => if (is_one_byte) OpCode.init(&.{0xa0}) else OpCode.init(&.{0xa1}),
            else => unreachable,
        },
        .td => return switch (tag) {
-            .mov => OpCode.oneByte(if (is_one_byte) 0xa2 else 0xa3),
+            .mov => if (is_one_byte) OpCode.init(&.{0xa2}) else OpCode.init(&.{0xa3}),
            else => unreachable,
        },
        .rmi => return switch (tag) {
-            .imul => OpCode.oneByte(if (is_one_byte) 0x6b else 0x69),
+            .imul => if (is_one_byte) OpCode.init(&.{0x6b}) else OpCode.init(&.{0x69}),
            else  => unreachable,
        },
        .mv => return switch (tag) {
-            .vmovsd, .vmovss => OpCode.oneByte(0x11),
+            .vmovsd,
            .vmovss => OpCode.init(&.{0x11}),
            else => unreachable,
        },
        .vm => return switch (tag) {
-            .vmovsd, .vmovss => OpCode.oneByte(0x10),
+            .vmovsd, 
-            .vucomisd, .vucomiss => OpCode.oneByte(0x2e),
+            .vmovss   => OpCode.init(&.{0x10}),
            .vucomisd,
            .vucomiss => OpCode.init(&.{0x2e}),
            else => unreachable,
        },
        .rvm => return switch (tag) {
-            .vaddsd, .vaddss => OpCode.oneByte(0x58),
+            .vaddsd,
-            .vmovsd, .vmovss => OpCode.oneByte(0x10),
+            .vaddss  => OpCode.init(&.{0x58}),
            .vmovsd,
            .vmovss  => OpCode.init(&.{0x10}),
            else => unreachable,
        },
        .rvmi => return switch (tag) {
-            .vcmpsd, .vcmpss => OpCode.oneByte(0xc2),
+            .vcmpsd,
            .vcmpss  => OpCode.init(&.{0xc2}),
            else     => unreachable,
        },
    }
    // zig fmt: on
 }
 inline fn getModRmExt(tag: Tag) u3 {
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@ -345,11 +345,29 @@ pub const Inst = struct {
        /// Nop
        nop,
-        /// AVX instructions
+        /// SSE instructions
        /// ops flags:  form:
        ///       0b00  reg1, qword ptr [reg2 + imm32]
        ///       0b01  qword ptr [reg1 + imm32], reg2
        ///       0b10  reg1, reg2
        mov_f64_sse,
        mov_f32_sse,
        /// ops flags:  form:
        ///       0b00  reg1, reg2
        add_f64_sse,
        add_f32_sse,
        /// ops flags:  form:
        ///       0b00  reg1, reg2
        cmp_f64_sse,
        cmp_f32_sse,
        /// AVX instructions
        /// ops flags:  form:
        ///       0b00  reg1, qword ptr [reg2 + imm32]
        ///       0b01  qword ptr [reg1 + imm32], reg2
        ///       0b10  reg1, reg1, reg2
        mov_f64_avx,
        mov_f32_avx,
@ -359,7 +377,7 @@ pub const Inst = struct {
        add_f32_avx,
        /// ops flags:  form:
-        ///
+        ///       0b00  reg1, reg1, reg2
        cmp_f64_avx,
        cmp_f32_avx,
--- a/src/arch/x86_64/bits.zig
+++ b/src/arch/x86_64/bits.zig
@ -441,6 +441,17 @@ pub const Encoder = struct {
        self.code.appendAssumeCapacity(opcode);
    }
    /// Emits a three-byte opcode sequence into the code buffer.
    ///
    /// The bytes are appended in argument order: `prefix_1`, then `prefix_2`,
    /// then `opcode`. For example, MOVSD is encoded as 0xf2 0x0f 0x10:
    ///
    /// encoder.opcode_3byte(0xf2, 0x0f, 0x10);
    ///
    /// Every byte goes through `appendAssumeCapacity`, so the caller is
    /// presumably expected to have reserved room for three bytes beforehand.
    pub fn opcode_3byte(self: Self, prefix_1: u8, prefix_2: u8, opcode: u8) void {
        const bytes = [_]u8{ prefix_1, prefix_2, opcode };
        for (bytes) |byte| self.code.appendAssumeCapacity(byte);
    }
    /// Encodes a 1 byte opcode with a reg field
    ///
    /// Remember to add a REX prefix byte if reg is extended!