diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 6337ad23f5..8c6f14ec3a 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -1176,6 +1176,21 @@ fn asmRegisterRegisterRegister( }); } +fn asmRegisterRegisterRegisterImmediate( + self: *Self, + tag: Mir.Inst.Tag, + reg1: Register, + reg2: Register, + reg3: Register, + imm: Immediate, +) !void { + _ = try self.addInst(.{ + .tag = tag, + .ops = .rrri, + .data = .{ .rrri = .{ .r1 = reg1, .r2 = reg2, .r3 = reg3, .i = @intCast(u8, imm.unsigned) } }, + }); +} + fn asmRegisterRegisterImmediate( self: *Self, tag: Mir.Inst.Tag, @@ -2310,20 +2325,31 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { }), } } else if (src_bits == 64 and dst_bits == 32) { - if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( - .vcvtsd2ss, - dst_reg, - dst_reg, - src_mcv.getReg().?.to128(), - ) else try self.asmRegisterRegisterMemory( + if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( .vcvtsd2ss, dst_reg, dst_reg, src_mcv.mem(.qword), - ) else if (src_mcv.isRegister()) - try self.asmRegisterRegister(.cvtsd2ss, dst_reg, src_mcv.getReg().?.to128()) - else - try self.asmRegisterMemory(.cvtsd2ss, dst_reg, src_mcv.mem(.qword)); + ) else try self.asmRegisterRegisterRegister( + .vcvtsd2ss, + dst_reg, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ) else if (src_mcv.isMemory()) try self.asmRegisterMemory( + .cvtsd2ss, + dst_reg, + src_mcv.mem(.qword), + ) else try self.asmRegisterRegister( + .cvtsd2ss, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ); } else return self.fail("TODO implement airFptrunc from {} to {}", .{ src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); @@ -2360,20 +2386,31 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { }), } } else if (src_bits == 32 and dst_bits == 64) { - if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( - .vcvtss2sd, - dst_reg, - dst_reg, - src_mcv.getReg().?.to128(), - ) else try self.asmRegisterRegisterMemory( + if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( .vcvtss2sd, dst_reg, dst_reg, src_mcv.mem(.dword), - ) else if (src_mcv.isRegister()) - try self.asmRegisterRegister(.cvtss2sd, dst_reg, src_mcv.getReg().?.to128()) - else - try self.asmRegisterMemory(.cvtss2sd, dst_reg, src_mcv.mem(.dword)); + ) else try self.asmRegisterRegisterRegister( + .vcvtss2sd, + dst_reg, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ) else if (src_mcv.isMemory()) try self.asmRegisterMemory( + .cvtss2sd, + dst_reg, + src_mcv.mem(.dword), + ) else try self.asmRegisterRegister( + .cvtss2sd, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ); } else return self.fail("TODO implement airFpext from {} to {}", .{ src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); @@ -4532,7 +4569,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); const result: MCValue = result: { - const tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) { + const mir_tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) { .Float => switch (ty.floatBits(self.target.*)) { 16 => if (self.hasFeature(.f16c)) { const mat_src_reg = if (src_mcv.isRegister()) @@ -4558,11 +4595,14 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { .Float => switch (ty.childType().floatBits(self.target.*)) { 16 => if (self.hasFeature(.f16c)) switch (ty.vectorLen()) { 1 => { - const mat_src_reg = if (src_mcv.isRegister()) - src_mcv.getReg().? - else - try self.copyToTmpRegister(ty, src_mcv); - try self.asmRegisterRegister(.vcvtph2ps, dst_reg, mat_src_reg.to128()); + try self.asmRegisterRegister( + .vcvtph2ps, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv)).to128(), + ); try self.asmRegisterRegisterRegister(.vsqrtss, dst_reg, dst_reg, dst_reg); try self.asmRegisterRegisterImmediate( .vcvtps2ph, @@ -4574,16 +4614,19 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { }, 2...8 => { const wide_reg = registerAlias(dst_reg, abi_size * 2); - if (src_mcv.isRegister()) try self.asmRegisterRegister( - .vcvtph2ps, - wide_reg, - src_mcv.getReg().?.to128(), - ) else try self.asmRegisterMemory( + if (src_mcv.isMemory()) try self.asmRegisterMemory( .vcvtph2ps, wide_reg, src_mcv.mem(Memory.PtrSize.fromSize( @intCast(u32, @divExact(wide_reg.bitSize(), 16)), )), + ) else try self.asmRegisterRegister( + .vcvtph2ps, + wide_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv)).to128(), ); try self.asmRegisterRegister(.vsqrtps, wide_reg, wide_reg); try self.asmRegisterRegisterImmediate( @@ -4617,26 +4660,32 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { })) |tag| tag else return self.fail("TODO implement airSqrt for {}", .{ ty.fmt(self.bin_file.options.module.?), }); - switch (tag) { - .vsqrtss, .vsqrtsd => if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( - tag, - dst_reg, - dst_reg, - registerAlias(src_mcv.getReg().?, abi_size), - ) else try self.asmRegisterRegisterMemory( - tag, + switch (mir_tag) { + .vsqrtss, .vsqrtsd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( + mir_tag, dst_reg, dst_reg, src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegisterRegister( + mir_tag, + dst_reg, + dst_reg, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv), abi_size), ), - else => if (src_mcv.isRegister()) try self.asmRegisterRegister( - tag, - dst_reg, - registerAlias(src_mcv.getReg().?, abi_size), - ) else try self.asmRegisterMemory( - tag, + else => if (src_mcv.isMemory()) try self.asmRegisterMemory( + mir_tag, dst_reg, src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegister( + mir_tag, + dst_reg, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv), abi_size), ), } break :result dst_mcv; @@ -5800,25 +5849,22 @@ fn genMulDivBinOp( } } -/// Result is always a register. fn genBinOp( self: *Self, maybe_inst: ?Air.Inst.Index, - tag: Air.Inst.Tag, + air_tag: Air.Inst.Tag, lhs_air: Air.Inst.Ref, rhs_air: Air.Inst.Ref, ) !MCValue { - const lhs = try self.resolveInst(lhs_air); - const rhs = try self.resolveInst(rhs_air); + const lhs_mcv = try self.resolveInst(lhs_air); + const rhs_mcv = try self.resolveInst(rhs_air); const lhs_ty = self.air.typeOf(lhs_air); const rhs_ty = self.air.typeOf(rhs_air); - if (lhs_ty.zigTypeTag() == .Vector) { - return self.fail("TODO implement genBinOp for {}", .{lhs_ty.fmt(self.bin_file.options.module.?)}); - } + const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*)); - switch (lhs) { + switch (lhs_mcv) { .immediate => |imm| switch (imm) { - 0 => switch (tag) { + 0 => switch (air_tag) { .sub, .subwrap => return self.genUnOp(maybe_inst, .neg, rhs_air), else => {}, }, @@ -5827,9 +5873,10 @@ fn genBinOp( else => {}, } - const is_commutative = switch (tag) { + const is_commutative = switch (air_tag) { .add, .addwrap, + .mul, .bool_or, .bit_or, .bool_and, @@ -5841,48 +5888,42 @@ fn genBinOp( else => false, }; - const dst_mem_ok = switch (tag) { - .add, - .addwrap, - .sub, - .subwrap, - .mul, - .div_float, - .div_exact, - .div_trunc, - .div_floor, - => !lhs_ty.isRuntimeFloat(), - - else => true, + const vec_op = switch (lhs_ty.zigTypeTag()) { + else => false, + .Float, .Vector => true, }; - const lhs_lock: ?RegisterLock = switch (lhs) { + const lhs_lock: ?RegisterLock = switch (lhs_mcv) { .register => |reg| self.register_manager.lockRegAssumeUnused(reg), else => null, }; defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); - const rhs_lock: ?RegisterLock = switch (rhs) { + const rhs_lock: ?RegisterLock = switch (rhs_mcv) { .register => |reg| self.register_manager.lockReg(reg), else => null, }; defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); - var flipped: bool = false; + var flipped = false; + var copied_to_dst = true; const dst_mcv: MCValue = dst: { if (maybe_inst) |inst| { - if ((dst_mem_ok or lhs.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs)) { - break :dst lhs; + if ((!vec_op or lhs_mcv.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs_mcv)) { + break :dst lhs_mcv; } - if (is_commutative and (dst_mem_ok or rhs.isRegister()) and - self.reuseOperand(inst, rhs_air, 1, rhs)) + if (is_commutative and (!vec_op or rhs_mcv.isRegister()) and + self.reuseOperand(inst, rhs_air, 1, rhs_mcv)) { flipped = true; - break :dst rhs; + break :dst rhs_mcv; } } const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, maybe_inst, true); - try self.genCopy(lhs_ty, dst_mcv, lhs); + if (vec_op and lhs_mcv.isRegister() and self.hasFeature(.avx)) + copied_to_dst = false + else + try self.genCopy(lhs_ty, dst_mcv, lhs_mcv); break :dst dst_mcv; }; const dst_lock: ?RegisterLock = switch (dst_mcv) { @@ -5891,160 +5932,47 @@ fn genBinOp( }; defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - const src_mcv = if (flipped) lhs else rhs; - switch (tag) { - .add, - .addwrap, - => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => .add, - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .addss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .addsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), + const src_mcv = if (flipped) lhs_mcv else rhs_mcv; + if (!vec_op) { + switch (air_tag) { + .add, + .addwrap, + => try self.genBinOpMir(.add, lhs_ty, dst_mcv, src_mcv), + + .sub, + .subwrap, + => try self.genBinOpMir(.sub, lhs_ty, dst_mcv, src_mcv), + + .ptr_add, + .ptr_sub, + => { + const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv); + const tmp_mcv = MCValue{ .register = tmp_reg }; + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + const elem_size = lhs_ty.elemType2().abiSize(self.target.*); + try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size }); + try self.genBinOpMir(switch (air_tag) { + .ptr_add => .add, + .ptr_sub => .sub, + else => unreachable, + }, lhs_ty, dst_mcv, tmp_mcv); }, - }, lhs_ty, dst_mcv, src_mcv), - .sub, - .subwrap, - => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => .sub, - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .subss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .subsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv), + .bool_or, + .bit_or, + => try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv), - .mul => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .mulss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .mulsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv), + .bool_and, + .bit_and, + => try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv), - .div_float, - .div_exact, - .div_trunc, - .div_floor, - => { - try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .divss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .divsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv); - switch (tag) { - .div_float, - .div_exact, - => {}, - .div_trunc, - .div_floor, - => if (self.hasFeature(.sse4_1)) { - const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*)); - const dst_alias = registerAlias(dst_mcv.register, abi_size); - try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) { - 32 => .roundss, - 64 => .roundsd, - else => unreachable, - }, dst_alias, dst_alias, Immediate.u(switch (tag) { - .div_trunc => 0b1_0_11, - .div_floor => 0b1_0_01, - else => unreachable, - })); - } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => unreachable, - } - }, + .xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv), - .ptr_add, - .ptr_sub, - => { - const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv); - const tmp_mcv = MCValue{ .register = tmp_reg }; - const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); - defer self.register_manager.unlockReg(tmp_lock); - - const elem_size = lhs_ty.elemType2().abiSize(self.target.*); - try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size }); - try self.genBinOpMir(switch (tag) { - .ptr_add => .add, - .ptr_sub => .sub, - else => unreachable, - }, lhs_ty, dst_mcv, tmp_mcv); - }, - - .bool_or, - .bit_or, - => try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv), - - .bool_and, - .bit_and, - => try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv), - - .xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv), - - .min, - .max, - => switch (lhs_ty.zigTypeTag()) { - .Int => { + .min, + .max, + => { const mat_src_mcv: MCValue = if (switch (src_mcv) { .immediate, .eflags, @@ -6070,12 +5998,12 @@ fn genBinOp( const int_info = lhs_ty.intInfo(self.target.*); const cc: Condition = switch (int_info.signedness) { - .unsigned => switch (tag) { + .unsigned => switch (air_tag) { .min => .a, .max => .b, else => unreachable, }, - .signed => switch (tag) { + .signed => switch (air_tag) { .min => .g, .max => .l, else => unreachable, @@ -6134,26 +6062,222 @@ fn genBinOp( } try self.genCopy(lhs_ty, dst_mcv, .{ .register = tmp_reg }); }, - .Float => try self.genBinOpMir(switch (lhs_ty.floatBits(self.target.*)) { - 32 => switch (tag) { - .min => .minss, - .max => .maxss, - else => unreachable, - }, - 64 => switch (tag) { - .min => .minsd, - .max => .maxsd, - else => unreachable, - }, - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, lhs_ty, dst_mcv, src_mcv), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, + else => return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + } + return dst_mcv; + } + + const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) { + else => unreachable, + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddss else .addss, + .sub => if (self.hasFeature(.avx)) .vsubss else .subss, + .mul => if (self.hasFeature(.avx)) .vmulss else .mulss, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivss else .divss, + .max => if (self.hasFeature(.avx)) .vmaxss else .maxss, + .min => if (self.hasFeature(.avx)) .vminss else .minss, + else => unreachable, + }, + 64 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddsd else .addsd, + .sub => if (self.hasFeature(.avx)) .vsubsd else .subsd, + .mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivsd else .divsd, + .max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd, + .min => if (self.hasFeature(.avx)) .vminsd else .minsd, + else => unreachable, + }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + else => null, + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddss else .addss, + .sub => if (self.hasFeature(.avx)) .vsubss else .subss, + .mul => if (self.hasFeature(.avx)) .vmulss else .mulss, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivss else .divss, + .max => if (self.hasFeature(.avx)) .vmaxss else .maxss, + .min => if (self.hasFeature(.avx)) .vminss else .minss, + else => unreachable, + }, + 2...4 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddps else .addps, + .sub => if (self.hasFeature(.avx)) .vsubps else .subps, + .mul => if (self.hasFeature(.avx)) .vmulps else .mulps, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivps else .divps, + .max => if (self.hasFeature(.avx)) .vmaxps else .maxps, + .min => if (self.hasFeature(.avx)) .vminps else .minps, + else => unreachable, + }, + 5...8 => if (self.hasFeature(.avx)) switch (air_tag) { + .add => .vaddps, + .sub => .vsubps, + .mul => .vmulps, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivps, + .max => .vmaxps, + .min => .vminps, + else => unreachable, + } else null, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddsd else .addsd, + .sub => if (self.hasFeature(.avx)) .vsubsd else .subsd, + .mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivsd else .divsd, + .max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd, + .min => if (self.hasFeature(.avx)) .vminsd else .minsd, + else => unreachable, + }, + 2 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddpd else .addpd, + .sub => if (self.hasFeature(.avx)) .vsubpd else .subpd, + .mul => if (self.hasFeature(.avx)) .vmulpd else .mulpd, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivpd else .divpd, + .max => if (self.hasFeature(.avx)) .vmaxpd else .maxpd, + .min => if (self.hasFeature(.avx)) .vminpd else .minpd, + else => unreachable, + }, + 3...4 => if (self.hasFeature(.avx)) switch (air_tag) { + .add => .vaddpd, + .sub => .vsubpd, + .mul => .vmulpd, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivpd, + .max => .vmaxpd, + .min => .vminpd, + else => unreachable, + } else null, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + }, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }); + const dst_alias = registerAlias(dst_mcv.getReg().?, abi_size); + if (self.hasFeature(.avx)) { + const src1_alias = + if (copied_to_dst) dst_alias else registerAlias(lhs_mcv.getReg().?, abi_size); + if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( + mir_tag, + dst_alias, + src1_alias, + src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegisterRegister( + mir_tag, + dst_alias, + src1_alias, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size), + ); + } else { + assert(copied_to_dst); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + mir_tag, + dst_alias, + src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegister( + mir_tag, + dst_alias, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size), + ); + } + switch (air_tag) { + .add, .sub, .mul, .div_float, .div_exact => {}, + .div_trunc, .div_floor => if (self.hasFeature(.sse4_1)) { + const round_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => if (self.hasFeature(.avx)) .vroundss else .roundss, + 64 => if (self.hasFeature(.avx)) .vroundsd else .roundsd, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1 => if (self.hasFeature(.avx)) .vroundss else .roundss, + 2...4 => if (self.hasFeature(.avx)) .vroundps else .roundps, + 5...8 => if (self.hasFeature(.avx)) .vroundps else null, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1 => if (self.hasFeature(.avx)) .vroundsd else .roundsd, + 2 => if (self.hasFeature(.avx)) .vroundpd else .roundpd, + 3...4 => if (self.hasFeature(.avx)) .vroundpd else null, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => null, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }); + const round_mode = Immediate.u(switch (air_tag) { + .div_trunc => 0b1_0_11, + .div_floor => 0b1_0_01, + else => unreachable, + }); + switch (round_tag) { + .vroundss, .vroundsd => try self.asmRegisterRegisterRegisterImmediate( + round_tag, + dst_alias, + dst_alias, + dst_alias, + round_mode, + ), + else => try self.asmRegisterRegisterImmediate( + round_tag, + dst_alias, + dst_alias, + round_mode, + ), + } + } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + .max, .min => {}, // TODO: unordered select else => unreachable, } return dst_mcv; @@ -6186,20 +6310,11 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, ty: Type, dst_mcv: MCValue, s .register_overflow, .reserved_frame, => unreachable, - .register => |src_reg| switch (ty.zigTypeTag()) { - .Float => { - if (!Target.x86.featureSetHas(self.target.cpu.features, .sse)) - return self.fail("TODO genBinOpMir for {s} {} without sse", .{ - @tagName(mir_tag), ty.fmt(self.bin_file.options.module.?), - }); - return self.asmRegisterRegister(mir_tag, dst_reg.to128(), src_reg.to128()); - }, - else => try self.asmRegisterRegister( - mir_tag, - dst_alias, - registerAlias(src_reg, abi_size), - ), - }, + .register => |src_reg| try self.asmRegisterRegister( + mir_tag, + dst_alias, + registerAlias(src_reg, abi_size), + ), .immediate => |imm| switch (self.regBitSize(ty)) { 8 => try self.asmRegisterImmediate( mir_tag, @@ -9646,7 +9761,7 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void { lock.* = self.register_manager.lockRegAssumeUnused(reg); } - const tag = if (@as( + const mir_tag = if (@as( ?Mir.Inst.Tag, if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 })) switch (ty.zigTypeTag()) { @@ -9741,20 +9856,17 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void { const abi_size = @intCast(u32, ty.abiSize(self.target.*)); const mop1_reg = registerAlias(mops[0].getReg().?, abi_size); const mop2_reg = registerAlias(mops[1].getReg().?, abi_size); - if (mops[2].isRegister()) - try self.asmRegisterRegisterRegister( - tag, - mop1_reg, - mop2_reg, - registerAlias(mops[2].getReg().?, abi_size), - ) - else - try self.asmRegisterRegisterMemory( - tag, - mop1_reg, - mop2_reg, - mops[2].mem(Memory.PtrSize.fromSize(abi_size)), - ); + if (mops[2].isRegister()) try self.asmRegisterRegisterRegister( + mir_tag, + mop1_reg, + mop2_reg, + registerAlias(mops[2].getReg().?, abi_size), + ) else try self.asmRegisterRegisterMemory( + mir_tag, + mop1_reg, + mop2_reg, + mops[2].mem(Memory.PtrSize.fromSize(abi_size)), + ); return self.finishAir(inst, mops[0], ops); } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index b242c98bdc..b8ccc9efba 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -262,61 +262,69 @@ pub const Mnemonic = enum { // MMX movd, // SSE - addss, + addps, addss, andps, andnps, cmpss, cvtsi2ss, - divss, - maxss, minss, + divps, divss, + maxps, maxss, + minps, minss, movaps, movss, movups, - mulss, + mulps, mulss, orps, pextrw, pinsrw, - sqrtps, - sqrtss, - subss, + sqrtps, sqrtss, + subps, subss, ucomiss, xorps, // SSE2 - addsd, + addpd, addsd, andpd, andnpd, //cmpsd, cvtsd2ss, cvtsi2sd, cvtss2sd, - divsd, - maxsd, minsd, + divpd, divsd, + maxpd, maxsd, + minpd, minsd, movapd, movq, //movd, movsd, movupd, - mulsd, + mulpd, mulsd, orpd, pshufhw, pshuflw, psrld, psrlq, psrlw, punpckhbw, punpckhdq, punpckhqdq, punpckhwd, punpcklbw, punpckldq, punpcklqdq, punpcklwd, sqrtpd, sqrtsd, - subsd, + subpd, subsd, ucomisd, xorpd, // SSE3 movddup, movshdup, movsldup, // SSE4.1 - roundsd, roundss, + roundpd, roundps, roundsd, roundss, // AVX + vaddpd, vaddps, vaddsd, vaddss, vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd, + vdivpd, vdivps, vdivsd, vdivss, + vmaxpd, vmaxps, vmaxsd, vmaxss, + vminpd, vminps, vminsd, vminss, vmovapd, vmovaps, vmovddup, vmovsd, vmovshdup, vmovsldup, vmovss, vmovupd, vmovups, + vmulpd, vmulps, vmulsd, vmulss, vpextrw, vpinsrw, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, + vroundpd, vroundps, vroundsd, vroundss, vsqrtpd, vsqrtps, vsqrtsd, vsqrtss, + vsubpd, vsubps, vsubsd, vsubss, // F16C vcvtph2ps, vcvtps2ph, // FMA diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 39ad2313e7..2cfa25ac84 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -124,27 +124,34 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .xchg, .xor, + .addps, .addss, .andnps, .andps, .cmpss, .cvtsi2ss, + .divps, .divss, + .maxps, .maxss, + .minps, .minss, .movaps, .movss, .movups, + .mulps, .mulss, .orps, .pextrw, .pinsrw, .sqrtps, .sqrtss, + .subps, .subss, .ucomiss, .xorps, + .addpd, .addsd, .andnpd, .andpd, @@ -152,10 +159,14 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .cvtsd2ss, .cvtsi2sd, .cvtss2sd, + .divpd, .divsd, + .maxpd, .maxsd, + .minpd, .minsd, .movsd, + .mulpd, .mulsd, .orpd, .pshufhw, @@ -173,6 +184,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .punpcklwd, .sqrtpd, .sqrtsd, + .subpd, .subsd, .ucomisd, .xorpd, @@ -181,13 +193,31 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .movshdup, .movsldup, + .roundpd, + .roundps, .roundsd, .roundss, + .vaddpd, + .vaddps, + .vaddsd, + .vaddss, .vcvtsd2ss, .vcvtsi2sd, .vcvtsi2ss, .vcvtss2sd, + .vdivpd, + .vdivps, + .vdivsd, + .vdivss, + .vmaxpd, + .vmaxps, + .vmaxsd, + .vmaxss, + .vminpd, + .vminps, + .vminsd, + .vminss, .vmovapd, .vmovaps, .vmovddup, @@ -197,6 +227,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vmovss, .vmovupd, .vmovups, + .vmulpd, + .vmulps, + .vmulsd, + .vmulss, .vpextrw, .vpinsrw, .vpshufhw, @@ -212,10 +246,18 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vpunpckldq, .vpunpcklqdq, .vpunpcklwd, + .vroundpd, + .vroundps, + .vroundsd, + .vroundss, .vsqrtpd, .vsqrtps, .vsqrtsd, .vsqrtss, + .vsubpd, + .vsubps, + .vsubsd, + .vsubss, .vcvtph2ps, .vcvtps2ph, @@ -304,6 +346,7 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { .lock_mi_rip_s, => Immediate.s(@bitCast(i32, i)), + .rrri, .rri_u, .ri_u, .i_u, @@ -429,6 +472,12 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rrr.r2 }, .{ .reg = inst.data.rrr.r3 }, }, + .rrri => &.{ + .{ .reg = inst.data.rrri.r1 }, + .{ .reg = inst.data.rrri.r2 }, + .{ .reg = inst.data.rrri.r3 }, + .{ .imm = lower.imm(inst.ops, inst.data.rrri.i) }, + }, .ri_s, .ri_u => &.{ .{ .reg = inst.data.ri.r }, .{ .imm = lower.imm(inst.ops, inst.data.ri.i) }, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index b6df0fff09..c0450406cf 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -166,7 +166,9 @@ pub const Inst = struct { /// Logical exclusive-or xor, - /// Add single precision floating point values + /// Add packed single-precision floating-point values + addps, + /// Add scalar single-precision floating-point values addss, /// Bitwise logical and of packed single precision floating-point values andps, @@ -176,11 +178,17 @@ pub const Inst = struct { cmpss, /// Convert doubleword integer to scalar single-precision floating-point value cvtsi2ss, + /// Divide packed single-precision floating-point values + divps, /// Divide scalar single-precision floating-point values divss, - /// Return maximum single-precision floating-point value + /// Maximum of packed single-precision floating-point values + maxps, + /// Maximum of scalar single-precision floating-point values maxss, - /// Return minimum single-precision floating-point value + /// Minimum of packed single-precision floating-point values + minps, + /// Minimum of scalar single-precision floating-point values minss, /// Move aligned packed single-precision floating-point values movaps, @@ -188,6 +196,8 @@ pub const Inst = struct { movss, /// Move unaligned packed single-precision floating-point values movups, + /// Multiply packed single-precision floating-point values + mulps, /// Multiply scalar single-precision floating-point values mulss, /// Bitwise logical or of packed single precision floating-point values @@ -196,18 +206,22 @@ pub const Inst = struct { pextrw, /// Insert word pinsrw, - /// Square root of scalar single precision floating-point value + /// Square root of packed single-precision floating-point values sqrtps, - /// Subtract scalar single-precision floating-point values + /// Square root of scalar single-precision floating-point value sqrtss, - /// Square root of single precision floating-point values + /// Subtract packed single-precision floating-point values + subps, + /// Subtract scalar single-precision floating-point values subss, /// Unordered compare scalar single-precision floating-point values ucomiss, /// Bitwise logical xor of packed single precision floating-point values xorps, - /// Add double precision floating point values + /// Add packed double-precision floating-point values + addpd, + /// Add scalar double-precision floating-point values addsd, /// Bitwise logical and not of packed double precision floating-point values andnpd, @@ -221,14 +235,22 @@ pub const Inst = struct { cvtsi2sd, /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value cvtss2sd, + /// Divide packed double-precision floating-point values + divpd, /// Divide scalar double-precision floating-point values divsd, - /// Return maximum double-precision floating-point value + /// Maximum of packed double-precision floating-point values + maxpd, + /// Maximum of scalar double-precision floating-point values maxsd, - /// Return minimum double-precision floating-point value + /// Minimum of packed double-precision floating-point values + minpd, + /// Minimum of scalar double-precision floating-point values minsd, /// Move scalar double-precision floating-point value movsd, + /// Multiply packed double-precision floating-point values + mulpd, /// Multiply scalar double-precision floating-point values mulsd, /// Bitwise logical or of packed double precision floating-point values @@ -263,6 +285,8 @@ pub const Inst = struct { sqrtpd, /// Square root of scalar double precision floating-point value sqrtsd, + /// Subtract packed double-precision floating-point values + subpd, /// Subtract scalar double-precision floating-point values subsd, /// Unordered compare scalar double-precision floating-point values @@ -277,11 +301,23 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, - /// Round scalar double-precision floating-point values + /// Round packed double-precision floating-point values + roundpd, + /// Round packed single-precision floating-point values + roundps, + /// Round scalar double-precision floating-point value roundsd, - /// Round scalar single-precision floating-point values + /// Round scalar single-precision floating-point value roundss, + /// Add packed double-precision floating-point values + vaddpd, + /// Add packed single-precision floating-point values + vaddps, + /// Add scalar double-precision floating-point values + vaddsd, + /// Add scalar single-precision floating-point values + vaddss, /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value vcvtsd2ss, /// Convert doubleword integer to scalar double-precision floating-point value @@ -290,6 +326,30 @@ pub const Inst = struct { vcvtsi2ss, /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value vcvtss2sd, + /// Divide packed double-precision floating-point values + vdivpd, + /// Divide packed single-precision floating-point values + vdivps, + /// Divide scalar double-precision floating-point values + vdivsd, + /// Divide scalar single-precision floating-point values + vdivss, + /// Maximum of packed double-precision floating-point values + vmaxpd, + /// Maximum of packed single-precision floating-point values + vmaxps, + /// Maximum of scalar double-precision floating-point values + vmaxsd, + /// Maximum of scalar single-precision floating-point values + vmaxss, + /// Minimum of packed double-precision floating-point values + vminpd, + /// Minimum of packed single-precision floating-point values + vminps, + /// Minimum of scalar double-precision floating-point values + vminsd, + /// Minimum of scalar single-precision floating-point values + vminss, /// Move aligned packed double-precision floating-point values vmovapd, /// Move aligned packed single-precision floating-point values @@ -308,6 +368,14 @@ pub const Inst = struct { vmovupd, /// Move unaligned packed single-precision floating-point values vmovups, + /// Multiply packed double-precision floating-point values + vmulpd, + /// Multiply packed single-precision floating-point values + vmulps, + /// Multiply scalar double-precision floating-point values + vmulsd, + /// Multiply scalar single-precision floating-point values + vmulss, /// Extract word vpextrw, /// Insert word @@ -338,6 +406,14 @@ pub const Inst = struct { vpunpcklqdq, /// Unpack low data vpunpcklwd, + /// Round packed double-precision floating-point values + vroundpd, + /// Round packed single-precision floating-point values + vroundps, + /// Round scalar double-precision floating-point value + vroundsd, + /// Round scalar single-precision floating-point value + vroundss, /// Square root of packed double-precision floating-point value vsqrtpd, /// Square root of packed single-precision floating-point value @@ -346,6 +422,14 @@ pub const Inst = struct { vsqrtsd, /// Square root of scalar single-precision floating-point value vsqrtss, + /// Subtract packed double-precision floating-point values + vsubpd, + /// Subtract packed single-precision floating-point values + vsubps, + /// Subtract scalar double-precision floating-point values + vsubsd, + /// Subtract scalar single-precision floating-point values + vsubss, /// Convert 16-bit floating-point values to single-precision floating-point values vcvtph2ps, @@ -442,6 +526,9 @@ pub const Inst = struct { /// Register, register, register operands. /// Uses `rrr` payload. rrr, + /// Register, register, register, immediate (byte) operands. + /// Uses `rrri` payload. + rrri, /// Register, register, immediate (sign-extended) operands. /// Uses `rri` payload. rri_s, @@ -625,6 +712,12 @@ pub const Inst = struct { r2: Register, r3: Register, }, + rrri: struct { + r1: Register, + r2: Register, + r3: Register, + i: u8, + }, rri: struct { r1: Register, r2: Register, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 78bda4fc76..c41f0ea4e7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -837,6 +837,8 @@ pub const table = [_]Entry{ .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none }, // SSE + .{ .addps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .none, .sse }, + .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .none, .sse }, .{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .none, .sse }, @@ -848,10 +850,16 @@ pub const table = [_]Entry{ .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .none, .sse }, .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .long, .sse }, + .{ .divps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .none, .sse }, + .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .none, .sse }, + .{ .maxps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .none, .sse }, + .{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .none, .sse }, + .{ .minps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .none, .sse }, + .{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .none, .sse }, .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse }, @@ -863,10 +871,14 @@ pub const table = [_]Entry{ .{ .movups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .none, .sse }, .{ .movups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .none, .sse }, + .{ .mulps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .none, .sse }, + .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .none, .sse }, .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse }, + .{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse }, + .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse }, .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse }, @@ -878,6 +890,8 @@ pub const table = [_]Entry{ .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse }, // SSE2 + .{ .addpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .none, .sse2 }, + .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .none, .sse2 }, .{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .none, .sse2 }, @@ -893,10 +907,16 @@ pub const table = [_]Entry{ .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .none, .sse2 }, + .{ .divpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .none, .sse2 }, + .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .none, .sse2 }, + .{ .maxpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .none, .sse2 }, + .{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .none, .sse2 }, + .{ .minpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .none, .sse2 }, + .{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .none, .sse2 }, .{ .movapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .none, .sse2 }, @@ -914,6 +934,8 @@ pub const table = [_]Entry{ .{ .movupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .none, .sse2 }, .{ .movupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .none, .sse2 }, + .{ .mulpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .none, .sse2 }, + .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .none, .sse2 }, .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, @@ -947,6 +969,8 @@ pub const table = [_]Entry{ .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, + .{ .subpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .none, .sse2 }, + .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .none, .sse2 }, .{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .none, .sse2 }, @@ -966,10 +990,25 @@ pub const table = [_]Entry{ // SSE4.1 .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, - .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, + .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, + + .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 }, + .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 }, + .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, + // AVX + .{ .vaddpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_128_wig, .avx }, + .{ .vaddpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_256_wig, .avx }, + + .{ .vaddps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .vex_128_wig, .avx }, + .{ .vaddps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x58 }, 0, .vex_256_wig, .avx }, + + .{ .vaddsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx }, + + .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx }, + .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx }, @@ -980,6 +1019,36 @@ pub const table = [_]Entry{ .{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, + .{ .vdivpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_128_wig, .avx }, + .{ .vdivpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_256_wig, .avx }, + + .{ .vdivps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .vex_128_wig, .avx }, + .{ .vdivps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5e }, 0, .vex_256_wig, .avx }, + + .{ .vdivsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx }, + + .{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx }, + + .{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx }, + .{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx }, + + .{ .vmaxps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .vex_128_wig, .avx }, + .{ .vmaxps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5f }, 0, .vex_256_wig, .avx }, + + .{ .vmaxsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx }, + + .{ .vmaxss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx }, + + .{ .vminpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_128_wig, .avx }, + .{ .vminpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_256_wig, .avx }, + + .{ .vminps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .vex_128_wig, .avx }, + .{ .vminps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5d }, 0, .vex_256_wig, .avx }, + + .{ .vminsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx }, + + .{ .vminss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx }, + .{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128_wig, .avx }, .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128_wig, .avx }, .{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256_wig, .avx }, @@ -1019,6 +1088,16 @@ pub const table = [_]Entry{ .{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256_wig, .avx }, .{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256_wig, .avx }, + .{ .vmulpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_128_wig, .avx }, + .{ .vmulpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_256_wig, .avx }, + + .{ .vmulps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .vex_128_wig, .avx }, + .{ .vmulps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x59 }, 0, .vex_256_wig, .avx }, + + .{ .vmulsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx }, + + .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx }, + .{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_wig, .avx }, .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx }, @@ -1041,6 +1120,16 @@ pub const table = [_]Entry{ .{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx }, .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx }, + .{ .vroundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_128_wig, .avx }, + .{ .vroundpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_256_wig, .avx }, + + .{ .vroundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_128_wig, .avx }, + .{ .vroundps, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_256_wig, .avx }, + + .{ .vroundsd, .rvmi, &.{ .xmm, .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .vex_lig_wig, .avx }, + + .{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx }, + .{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx }, .{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx }, @@ -1051,6 +1140,16 @@ pub const table = [_]Entry{ .{ .vsqrtss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .vex_lig_wig, .avx }, + .{ .vsubpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_128_wig, .avx }, + .{ .vsubpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_256_wig, .avx }, + + .{ .vsubps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .vex_128_wig, .avx }, + .{ .vsubps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5c }, 0, .vex_256_wig, .avx }, + + .{ .vsubsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx }, + + .{ .vsubss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx }, + // F16C .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c }, .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c },