x86_64: implement binary operations for float vectors

This commit is contained in:
Jacob Young 2023-05-07 09:06:12 -04:00
parent ea957c4cff
commit 057139fda5
5 changed files with 662 additions and 301 deletions

View File

@ -1176,6 +1176,21 @@ fn asmRegisterRegisterRegister(
});
}
/// Emits a single MIR instruction using the register-register-register-immediate
/// (.rrri) operand encoding: `tag reg1, reg2, reg3, imm`.
/// Used elsewhere in this change for 4-operand AVX forms such as
/// `vroundss`/`vroundsd` (rounding-control byte) and `vcvtps2ph`.
/// NOTE(review): only the low 8 bits of the immediate are stored in the MIR
/// payload; `@intCast(u8, ...)` safety-checks that `imm.unsigned` fits in a byte.
fn asmRegisterRegisterRegisterImmediate(
self: *Self,
tag: Mir.Inst.Tag,
reg1: Register,
reg2: Register,
reg3: Register,
imm: Immediate,
) !void {
// The returned instruction index is not needed by callers of this helper.
_ = try self.addInst(.{
.tag = tag,
.ops = .rrri,
.data = .{ .rrri = .{ .r1 = reg1, .r2 = reg2, .r3 = reg3, .i = @intCast(u8, imm.unsigned) } },
});
}
fn asmRegisterRegisterImmediate(
self: *Self,
tag: Mir.Inst.Tag,
@ -2310,20 +2325,31 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
}),
}
} else if (src_bits == 64 and dst_bits == 32) {
if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
.vcvtsd2ss,
dst_reg,
dst_reg,
src_mcv.getReg().?.to128(),
) else try self.asmRegisterRegisterMemory(
if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
.vcvtsd2ss,
dst_reg,
dst_reg,
src_mcv.mem(.qword),
) else if (src_mcv.isRegister())
try self.asmRegisterRegister(.cvtsd2ss, dst_reg, src_mcv.getReg().?.to128())
else
try self.asmRegisterMemory(.cvtsd2ss, dst_reg, src_mcv.mem(.qword));
) else try self.asmRegisterRegisterRegister(
.vcvtsd2ss,
dst_reg,
dst_reg,
(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
.cvtsd2ss,
dst_reg,
src_mcv.mem(.qword),
) else try self.asmRegisterRegister(
.cvtsd2ss,
dst_reg,
(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
);
} else return self.fail("TODO implement airFptrunc from {} to {}", .{
src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
});
@ -2360,20 +2386,31 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
}),
}
} else if (src_bits == 32 and dst_bits == 64) {
if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
.vcvtss2sd,
dst_reg,
dst_reg,
src_mcv.getReg().?.to128(),
) else try self.asmRegisterRegisterMemory(
if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
.vcvtss2sd,
dst_reg,
dst_reg,
src_mcv.mem(.dword),
) else if (src_mcv.isRegister())
try self.asmRegisterRegister(.cvtss2sd, dst_reg, src_mcv.getReg().?.to128())
else
try self.asmRegisterMemory(.cvtss2sd, dst_reg, src_mcv.mem(.dword));
) else try self.asmRegisterRegisterRegister(
.vcvtss2sd,
dst_reg,
dst_reg,
(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
.cvtss2sd,
dst_reg,
src_mcv.mem(.dword),
) else try self.asmRegisterRegister(
.cvtss2sd,
dst_reg,
(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
);
} else return self.fail("TODO implement airFpext from {} to {}", .{
src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
});
@ -4532,7 +4569,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
const result: MCValue = result: {
const tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) {
const mir_tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) {
.Float => switch (ty.floatBits(self.target.*)) {
16 => if (self.hasFeature(.f16c)) {
const mat_src_reg = if (src_mcv.isRegister())
@ -4558,11 +4595,14 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
.Float => switch (ty.childType().floatBits(self.target.*)) {
16 => if (self.hasFeature(.f16c)) switch (ty.vectorLen()) {
1 => {
const mat_src_reg = if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(ty, src_mcv);
try self.asmRegisterRegister(.vcvtph2ps, dst_reg, mat_src_reg.to128());
try self.asmRegisterRegister(
.vcvtph2ps,
dst_reg,
(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(ty, src_mcv)).to128(),
);
try self.asmRegisterRegisterRegister(.vsqrtss, dst_reg, dst_reg, dst_reg);
try self.asmRegisterRegisterImmediate(
.vcvtps2ph,
@ -4574,16 +4614,19 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
},
2...8 => {
const wide_reg = registerAlias(dst_reg, abi_size * 2);
if (src_mcv.isRegister()) try self.asmRegisterRegister(
.vcvtph2ps,
wide_reg,
src_mcv.getReg().?.to128(),
) else try self.asmRegisterMemory(
if (src_mcv.isMemory()) try self.asmRegisterMemory(
.vcvtph2ps,
wide_reg,
src_mcv.mem(Memory.PtrSize.fromSize(
@intCast(u32, @divExact(wide_reg.bitSize(), 16)),
)),
) else try self.asmRegisterRegister(
.vcvtph2ps,
wide_reg,
(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(ty, src_mcv)).to128(),
);
try self.asmRegisterRegister(.vsqrtps, wide_reg, wide_reg);
try self.asmRegisterRegisterImmediate(
@ -4617,26 +4660,32 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
})) |tag| tag else return self.fail("TODO implement airSqrt for {}", .{
ty.fmt(self.bin_file.options.module.?),
});
switch (tag) {
.vsqrtss, .vsqrtsd => if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
tag,
dst_reg,
dst_reg,
registerAlias(src_mcv.getReg().?, abi_size),
) else try self.asmRegisterRegisterMemory(
tag,
switch (mir_tag) {
.vsqrtss, .vsqrtsd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
mir_tag,
dst_reg,
dst_reg,
src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
) else try self.asmRegisterRegisterRegister(
mir_tag,
dst_reg,
dst_reg,
registerAlias(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(ty, src_mcv), abi_size),
),
else => if (src_mcv.isRegister()) try self.asmRegisterRegister(
tag,
dst_reg,
registerAlias(src_mcv.getReg().?, abi_size),
) else try self.asmRegisterMemory(
tag,
else => if (src_mcv.isMemory()) try self.asmRegisterMemory(
mir_tag,
dst_reg,
src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
) else try self.asmRegisterRegister(
mir_tag,
dst_reg,
registerAlias(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(ty, src_mcv), abi_size),
),
}
break :result dst_mcv;
@ -5800,25 +5849,22 @@ fn genMulDivBinOp(
}
}
/// Result is always a register.
fn genBinOp(
self: *Self,
maybe_inst: ?Air.Inst.Index,
tag: Air.Inst.Tag,
air_tag: Air.Inst.Tag,
lhs_air: Air.Inst.Ref,
rhs_air: Air.Inst.Ref,
) !MCValue {
const lhs = try self.resolveInst(lhs_air);
const rhs = try self.resolveInst(rhs_air);
const lhs_mcv = try self.resolveInst(lhs_air);
const rhs_mcv = try self.resolveInst(rhs_air);
const lhs_ty = self.air.typeOf(lhs_air);
const rhs_ty = self.air.typeOf(rhs_air);
if (lhs_ty.zigTypeTag() == .Vector) {
return self.fail("TODO implement genBinOp for {}", .{lhs_ty.fmt(self.bin_file.options.module.?)});
}
const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
switch (lhs) {
switch (lhs_mcv) {
.immediate => |imm| switch (imm) {
0 => switch (tag) {
0 => switch (air_tag) {
.sub, .subwrap => return self.genUnOp(maybe_inst, .neg, rhs_air),
else => {},
},
@ -5827,9 +5873,10 @@ fn genBinOp(
else => {},
}
const is_commutative = switch (tag) {
const is_commutative = switch (air_tag) {
.add,
.addwrap,
.mul,
.bool_or,
.bit_or,
.bool_and,
@ -5841,48 +5888,42 @@ fn genBinOp(
else => false,
};
const dst_mem_ok = switch (tag) {
.add,
.addwrap,
.sub,
.subwrap,
.mul,
.div_float,
.div_exact,
.div_trunc,
.div_floor,
=> !lhs_ty.isRuntimeFloat(),
else => true,
const vec_op = switch (lhs_ty.zigTypeTag()) {
else => false,
.Float, .Vector => true,
};
const lhs_lock: ?RegisterLock = switch (lhs) {
const lhs_lock: ?RegisterLock = switch (lhs_mcv) {
.register => |reg| self.register_manager.lockRegAssumeUnused(reg),
else => null,
};
defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
const rhs_lock: ?RegisterLock = switch (rhs) {
const rhs_lock: ?RegisterLock = switch (rhs_mcv) {
.register => |reg| self.register_manager.lockReg(reg),
else => null,
};
defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
var flipped: bool = false;
var flipped = false;
var copied_to_dst = true;
const dst_mcv: MCValue = dst: {
if (maybe_inst) |inst| {
if ((dst_mem_ok or lhs.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs)) {
break :dst lhs;
if ((!vec_op or lhs_mcv.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs_mcv)) {
break :dst lhs_mcv;
}
if (is_commutative and (dst_mem_ok or rhs.isRegister()) and
self.reuseOperand(inst, rhs_air, 1, rhs))
if (is_commutative and (!vec_op or rhs_mcv.isRegister()) and
self.reuseOperand(inst, rhs_air, 1, rhs_mcv))
{
flipped = true;
break :dst rhs;
break :dst rhs_mcv;
}
}
const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, maybe_inst, true);
try self.genCopy(lhs_ty, dst_mcv, lhs);
if (vec_op and lhs_mcv.isRegister() and self.hasFeature(.avx))
copied_to_dst = false
else
try self.genCopy(lhs_ty, dst_mcv, lhs_mcv);
break :dst dst_mcv;
};
const dst_lock: ?RegisterLock = switch (dst_mcv) {
@ -5891,160 +5932,47 @@ fn genBinOp(
};
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
const src_mcv = if (flipped) lhs else rhs;
switch (tag) {
.add,
.addwrap,
=> try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
else => .add,
.Float => switch (lhs_ty.floatBits(self.target.*)) {
32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
.addss
else
return self.fail("TODO implement genBinOp for {s} {} without sse", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
.addsd
else
return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
const src_mcv = if (flipped) lhs_mcv else rhs_mcv;
if (!vec_op) {
switch (air_tag) {
.add,
.addwrap,
=> try self.genBinOpMir(.add, lhs_ty, dst_mcv, src_mcv),
.sub,
.subwrap,
=> try self.genBinOpMir(.sub, lhs_ty, dst_mcv, src_mcv),
.ptr_add,
.ptr_sub,
=> {
const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv);
const tmp_mcv = MCValue{ .register = tmp_reg };
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
defer self.register_manager.unlockReg(tmp_lock);
const elem_size = lhs_ty.elemType2().abiSize(self.target.*);
try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size });
try self.genBinOpMir(switch (air_tag) {
.ptr_add => .add,
.ptr_sub => .sub,
else => unreachable,
}, lhs_ty, dst_mcv, tmp_mcv);
},
}, lhs_ty, dst_mcv, src_mcv),
.sub,
.subwrap,
=> try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
else => .sub,
.Float => switch (lhs_ty.floatBits(self.target.*)) {
32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
.subss
else
return self.fail("TODO implement genBinOp for {s} {} without sse", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
.subsd
else
return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
},
}, lhs_ty, dst_mcv, src_mcv),
.bool_or,
.bit_or,
=> try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv),
.mul => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
.Float => switch (lhs_ty.floatBits(self.target.*)) {
32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
.mulss
else
return self.fail("TODO implement genBinOp for {s} {} without sse", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
.mulsd
else
return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
},
}, lhs_ty, dst_mcv, src_mcv),
.bool_and,
.bit_and,
=> try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv),
.div_float,
.div_exact,
.div_trunc,
.div_floor,
=> {
try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) {
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
.Float => switch (lhs_ty.floatBits(self.target.*)) {
32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse))
.divss
else
return self.fail("TODO implement genBinOp for {s} {} without sse", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2))
.divsd
else
return self.fail("TODO implement genBinOp for {s} {} without sse2", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
},
}, lhs_ty, dst_mcv, src_mcv);
switch (tag) {
.div_float,
.div_exact,
=> {},
.div_trunc,
.div_floor,
=> if (self.hasFeature(.sse4_1)) {
const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
const dst_alias = registerAlias(dst_mcv.register, abi_size);
try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) {
32 => .roundss,
64 => .roundsd,
else => unreachable,
}, dst_alias, dst_alias, Immediate.u(switch (tag) {
.div_trunc => 0b1_0_11,
.div_floor => 0b1_0_01,
else => unreachable,
}));
} else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
else => unreachable,
}
},
.xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv),
.ptr_add,
.ptr_sub,
=> {
const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv);
const tmp_mcv = MCValue{ .register = tmp_reg };
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
defer self.register_manager.unlockReg(tmp_lock);
const elem_size = lhs_ty.elemType2().abiSize(self.target.*);
try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size });
try self.genBinOpMir(switch (tag) {
.ptr_add => .add,
.ptr_sub => .sub,
else => unreachable,
}, lhs_ty, dst_mcv, tmp_mcv);
},
.bool_or,
.bit_or,
=> try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv),
.bool_and,
.bit_and,
=> try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv),
.xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv),
.min,
.max,
=> switch (lhs_ty.zigTypeTag()) {
.Int => {
.min,
.max,
=> {
const mat_src_mcv: MCValue = if (switch (src_mcv) {
.immediate,
.eflags,
@ -6070,12 +5998,12 @@ fn genBinOp(
const int_info = lhs_ty.intInfo(self.target.*);
const cc: Condition = switch (int_info.signedness) {
.unsigned => switch (tag) {
.unsigned => switch (air_tag) {
.min => .a,
.max => .b,
else => unreachable,
},
.signed => switch (tag) {
.signed => switch (air_tag) {
.min => .g,
.max => .l,
else => unreachable,
@ -6134,26 +6062,222 @@ fn genBinOp(
}
try self.genCopy(lhs_ty, dst_mcv, .{ .register = tmp_reg });
},
.Float => try self.genBinOpMir(switch (lhs_ty.floatBits(self.target.*)) {
32 => switch (tag) {
.min => .minss,
.max => .maxss,
else => unreachable,
},
64 => switch (tag) {
.min => .minsd,
.max => .maxsd,
else => unreachable,
},
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
}, lhs_ty, dst_mcv, src_mcv),
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
},
else => return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
}
return dst_mcv;
}
const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
else => unreachable,
.Float => switch (lhs_ty.floatBits(self.target.*)) {
32 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddss else .addss,
.sub => if (self.hasFeature(.avx)) .vsubss else .subss,
.mul => if (self.hasFeature(.avx)) .vmulss else .mulss,
.div_float,
.div_trunc,
.div_floor,
.div_exact,
=> if (self.hasFeature(.avx)) .vdivss else .divss,
.max => if (self.hasFeature(.avx)) .vmaxss else .maxss,
.min => if (self.hasFeature(.avx)) .vminss else .minss,
else => unreachable,
},
64 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddsd else .addsd,
.sub => if (self.hasFeature(.avx)) .vsubsd else .subsd,
.mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd,
.div_float,
.div_trunc,
.div_floor,
.div_exact,
=> if (self.hasFeature(.avx)) .vdivsd else .divsd,
.max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd,
.min => if (self.hasFeature(.avx)) .vminsd else .minsd,
else => unreachable,
},
16, 80, 128 => null,
else => unreachable,
},
.Vector => switch (lhs_ty.childType().zigTypeTag()) {
else => null,
.Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
32 => switch (lhs_ty.vectorLen()) {
1 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddss else .addss,
.sub => if (self.hasFeature(.avx)) .vsubss else .subss,
.mul => if (self.hasFeature(.avx)) .vmulss else .mulss,
.div_float,
.div_trunc,
.div_floor,
.div_exact,
=> if (self.hasFeature(.avx)) .vdivss else .divss,
.max => if (self.hasFeature(.avx)) .vmaxss else .maxss,
.min => if (self.hasFeature(.avx)) .vminss else .minss,
else => unreachable,
},
2...4 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddps else .addps,
.sub => if (self.hasFeature(.avx)) .vsubps else .subps,
.mul => if (self.hasFeature(.avx)) .vmulps else .mulps,
.div_float,
.div_trunc,
.div_floor,
.div_exact,
=> if (self.hasFeature(.avx)) .vdivps else .divps,
.max => if (self.hasFeature(.avx)) .vmaxps else .maxps,
.min => if (self.hasFeature(.avx)) .vminps else .minps,
else => unreachable,
},
5...8 => if (self.hasFeature(.avx)) switch (air_tag) {
.add => .vaddps,
.sub => .vsubps,
.mul => .vmulps,
.div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
.max => .vmaxps,
.min => .vminps,
else => unreachable,
} else null,
else => null,
},
64 => switch (lhs_ty.vectorLen()) {
1 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddsd else .addsd,
.sub => if (self.hasFeature(.avx)) .vsubsd else .subsd,
.mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd,
.div_float,
.div_trunc,
.div_floor,
.div_exact,
=> if (self.hasFeature(.avx)) .vdivsd else .divsd,
.max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd,
.min => if (self.hasFeature(.avx)) .vminsd else .minsd,
else => unreachable,
},
2 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddpd else .addpd,
.sub => if (self.hasFeature(.avx)) .vsubpd else .subpd,
.mul => if (self.hasFeature(.avx)) .vmulpd else .mulpd,
.div_float,
.div_trunc,
.div_floor,
.div_exact,
=> if (self.hasFeature(.avx)) .vdivpd else .divpd,
.max => if (self.hasFeature(.avx)) .vmaxpd else .maxpd,
.min => if (self.hasFeature(.avx)) .vminpd else .minpd,
else => unreachable,
},
3...4 => if (self.hasFeature(.avx)) switch (air_tag) {
.add => .vaddpd,
.sub => .vsubpd,
.mul => .vmulpd,
.div_float, .div_trunc, .div_floor, .div_exact => .vdivpd,
.max => .vmaxpd,
.min => .vminpd,
else => unreachable,
} else null,
else => null,
},
16, 80, 128 => null,
else => unreachable,
},
},
})) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
});
const dst_alias = registerAlias(dst_mcv.getReg().?, abi_size);
if (self.hasFeature(.avx)) {
const src1_alias =
if (copied_to_dst) dst_alias else registerAlias(lhs_mcv.getReg().?, abi_size);
if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
mir_tag,
dst_alias,
src1_alias,
src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
) else try self.asmRegisterRegisterRegister(
mir_tag,
dst_alias,
src1_alias,
registerAlias(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
);
} else {
assert(copied_to_dst);
if (src_mcv.isMemory()) try self.asmRegisterMemory(
mir_tag,
dst_alias,
src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
) else try self.asmRegisterRegister(
mir_tag,
dst_alias,
registerAlias(if (src_mcv.isRegister())
src_mcv.getReg().?
else
try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size),
);
}
switch (air_tag) {
.add, .sub, .mul, .div_float, .div_exact => {},
.div_trunc, .div_floor => if (self.hasFeature(.sse4_1)) {
const round_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
.Float => switch (lhs_ty.floatBits(self.target.*)) {
32 => if (self.hasFeature(.avx)) .vroundss else .roundss,
64 => if (self.hasFeature(.avx)) .vroundsd else .roundsd,
16, 80, 128 => null,
else => unreachable,
},
.Vector => switch (lhs_ty.childType().zigTypeTag()) {
.Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
32 => switch (lhs_ty.vectorLen()) {
1 => if (self.hasFeature(.avx)) .vroundss else .roundss,
2...4 => if (self.hasFeature(.avx)) .vroundps else .roundps,
5...8 => if (self.hasFeature(.avx)) .vroundps else null,
else => null,
},
64 => switch (lhs_ty.vectorLen()) {
1 => if (self.hasFeature(.avx)) .vroundsd else .roundsd,
2 => if (self.hasFeature(.avx)) .vroundpd else .roundpd,
3...4 => if (self.hasFeature(.avx)) .vroundpd else null,
else => null,
},
16, 80, 128 => null,
else => unreachable,
},
else => null,
},
else => unreachable,
})) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
});
const round_mode = Immediate.u(switch (air_tag) {
.div_trunc => 0b1_0_11,
.div_floor => 0b1_0_01,
else => unreachable,
});
switch (round_tag) {
.vroundss, .vroundsd => try self.asmRegisterRegisterRegisterImmediate(
round_tag,
dst_alias,
dst_alias,
dst_alias,
round_mode,
),
else => try self.asmRegisterRegisterImmediate(
round_tag,
dst_alias,
dst_alias,
round_mode,
),
}
} else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{
@tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
}),
.max, .min => {}, // TODO: unordered select
else => unreachable,
}
return dst_mcv;
@ -6186,20 +6310,11 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, ty: Type, dst_mcv: MCValue, s
.register_overflow,
.reserved_frame,
=> unreachable,
.register => |src_reg| switch (ty.zigTypeTag()) {
.Float => {
if (!Target.x86.featureSetHas(self.target.cpu.features, .sse))
return self.fail("TODO genBinOpMir for {s} {} without sse", .{
@tagName(mir_tag), ty.fmt(self.bin_file.options.module.?),
});
return self.asmRegisterRegister(mir_tag, dst_reg.to128(), src_reg.to128());
},
else => try self.asmRegisterRegister(
mir_tag,
dst_alias,
registerAlias(src_reg, abi_size),
),
},
.register => |src_reg| try self.asmRegisterRegister(
mir_tag,
dst_alias,
registerAlias(src_reg, abi_size),
),
.immediate => |imm| switch (self.regBitSize(ty)) {
8 => try self.asmRegisterImmediate(
mir_tag,
@ -9646,7 +9761,7 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
lock.* = self.register_manager.lockRegAssumeUnused(reg);
}
const tag = if (@as(
const mir_tag = if (@as(
?Mir.Inst.Tag,
if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 }))
switch (ty.zigTypeTag()) {
@ -9741,20 +9856,17 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
const abi_size = @intCast(u32, ty.abiSize(self.target.*));
const mop1_reg = registerAlias(mops[0].getReg().?, abi_size);
const mop2_reg = registerAlias(mops[1].getReg().?, abi_size);
if (mops[2].isRegister())
try self.asmRegisterRegisterRegister(
tag,
mop1_reg,
mop2_reg,
registerAlias(mops[2].getReg().?, abi_size),
)
else
try self.asmRegisterRegisterMemory(
tag,
mop1_reg,
mop2_reg,
mops[2].mem(Memory.PtrSize.fromSize(abi_size)),
);
if (mops[2].isRegister()) try self.asmRegisterRegisterRegister(
mir_tag,
mop1_reg,
mop2_reg,
registerAlias(mops[2].getReg().?, abi_size),
) else try self.asmRegisterRegisterMemory(
mir_tag,
mop1_reg,
mop2_reg,
mops[2].mem(Memory.PtrSize.fromSize(abi_size)),
);
return self.finishAir(inst, mops[0], ops);
}

View File

@ -262,61 +262,69 @@ pub const Mnemonic = enum {
// MMX
movd,
// SSE
addss,
addps, addss,
andps,
andnps,
cmpss,
cvtsi2ss,
divss,
maxss, minss,
divps, divss,
maxps, maxss,
minps, minss,
movaps, movss, movups,
mulss,
mulps, mulss,
orps,
pextrw, pinsrw,
sqrtps,
sqrtss,
subss,
sqrtps, sqrtss,
subps, subss,
ucomiss,
xorps,
// SSE2
addsd,
addpd, addsd,
andpd,
andnpd,
//cmpsd,
cvtsd2ss, cvtsi2sd, cvtss2sd,
divsd,
maxsd, minsd,
divpd, divsd,
maxpd, maxsd,
minpd, minsd,
movapd,
movq, //movd, movsd,
movupd,
mulsd,
mulpd, mulsd,
orpd,
pshufhw, pshuflw,
psrld, psrlq, psrlw,
punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
punpcklbw, punpckldq, punpcklqdq, punpcklwd,
sqrtpd, sqrtsd,
subsd,
subpd, subsd,
ucomisd,
xorpd,
// SSE3
movddup, movshdup, movsldup,
// SSE4.1
roundsd, roundss,
roundpd, roundps, roundsd, roundss,
// AVX
vaddpd, vaddps, vaddsd, vaddss,
vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd,
vdivpd, vdivps, vdivsd, vdivss,
vmaxpd, vmaxps, vmaxsd, vmaxss,
vminpd, vminps, vminsd, vminss,
vmovapd, vmovaps,
vmovddup,
vmovsd,
vmovshdup, vmovsldup,
vmovss,
vmovupd, vmovups,
vmulpd, vmulps, vmulsd, vmulss,
vpextrw, vpinsrw,
vpshufhw, vpshuflw,
vpsrld, vpsrlq, vpsrlw,
vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd,
vroundpd, vroundps, vroundsd, vroundss,
vsqrtpd, vsqrtps, vsqrtsd, vsqrtss,
vsubpd, vsubps, vsubsd, vsubss,
// F16C
vcvtph2ps, vcvtps2ph,
// FMA

View File

@ -124,27 +124,34 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.xchg,
.xor,
.addps,
.addss,
.andnps,
.andps,
.cmpss,
.cvtsi2ss,
.divps,
.divss,
.maxps,
.maxss,
.minps,
.minss,
.movaps,
.movss,
.movups,
.mulps,
.mulss,
.orps,
.pextrw,
.pinsrw,
.sqrtps,
.sqrtss,
.subps,
.subss,
.ucomiss,
.xorps,
.addpd,
.addsd,
.andnpd,
.andpd,
@ -152,10 +159,14 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.cvtsd2ss,
.cvtsi2sd,
.cvtss2sd,
.divpd,
.divsd,
.maxpd,
.maxsd,
.minpd,
.minsd,
.movsd,
.mulpd,
.mulsd,
.orpd,
.pshufhw,
@ -173,6 +184,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.punpcklwd,
.sqrtpd,
.sqrtsd,
.subpd,
.subsd,
.ucomisd,
.xorpd,
@ -181,13 +193,31 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.movshdup,
.movsldup,
.roundpd,
.roundps,
.roundsd,
.roundss,
.vaddpd,
.vaddps,
.vaddsd,
.vaddss,
.vcvtsd2ss,
.vcvtsi2sd,
.vcvtsi2ss,
.vcvtss2sd,
.vdivpd,
.vdivps,
.vdivsd,
.vdivss,
.vmaxpd,
.vmaxps,
.vmaxsd,
.vmaxss,
.vminpd,
.vminps,
.vminsd,
.vminss,
.vmovapd,
.vmovaps,
.vmovddup,
@ -197,6 +227,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.vmovss,
.vmovupd,
.vmovups,
.vmulpd,
.vmulps,
.vmulsd,
.vmulss,
.vpextrw,
.vpinsrw,
.vpshufhw,
@ -212,10 +246,18 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.vpunpckldq,
.vpunpcklqdq,
.vpunpcklwd,
.vroundpd,
.vroundps,
.vroundsd,
.vroundss,
.vsqrtpd,
.vsqrtps,
.vsqrtsd,
.vsqrtss,
.vsubpd,
.vsubps,
.vsubsd,
.vsubss,
.vcvtph2ps,
.vcvtps2ph,
@ -304,6 +346,7 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate {
.lock_mi_rip_s,
=> Immediate.s(@bitCast(i32, i)),
.rrri,
.rri_u,
.ri_u,
.i_u,
@ -429,6 +472,12 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void {
.{ .reg = inst.data.rrr.r2 },
.{ .reg = inst.data.rrr.r3 },
},
.rrri => &.{
.{ .reg = inst.data.rrri.r1 },
.{ .reg = inst.data.rrri.r2 },
.{ .reg = inst.data.rrri.r3 },
.{ .imm = lower.imm(inst.ops, inst.data.rrri.i) },
},
.ri_s, .ri_u => &.{
.{ .reg = inst.data.ri.r },
.{ .imm = lower.imm(inst.ops, inst.data.ri.i) },

View File

@ -166,7 +166,9 @@ pub const Inst = struct {
/// Logical exclusive-or
xor,
/// Add single precision floating point values
/// Add packed single-precision floating-point values
addps,
/// Add scalar single-precision floating-point values
addss,
/// Bitwise logical and of packed single precision floating-point values
andps,
@ -176,11 +178,17 @@ pub const Inst = struct {
cmpss,
/// Convert doubleword integer to scalar single-precision floating-point value
cvtsi2ss,
/// Divide packed single-precision floating-point values
divps,
/// Divide scalar single-precision floating-point values
divss,
/// Return maximum single-precision floating-point value
/// Maximum of packed single-precision floating-point values
maxps,
/// Maximum of scalar single-precision floating-point values
maxss,
/// Return minimum single-precision floating-point value
/// Minimum of packed single-precision floating-point values
minps,
/// Minimum of scalar single-precision floating-point values
minss,
/// Move aligned packed single-precision floating-point values
movaps,
@ -188,6 +196,8 @@ pub const Inst = struct {
movss,
/// Move unaligned packed single-precision floating-point values
movups,
/// Multiply packed single-precision floating-point values
mulps,
/// Multiply scalar single-precision floating-point values
mulss,
/// Bitwise logical or of packed single precision floating-point values
@ -196,18 +206,22 @@ pub const Inst = struct {
pextrw,
/// Insert word
pinsrw,
/// Square root of scalar single precision floating-point value
/// Square root of packed single-precision floating-point values
sqrtps,
/// Subtract scalar single-precision floating-point values
/// Square root of scalar single-precision floating-point value
sqrtss,
/// Square root of single precision floating-point values
/// Subtract packed single-precision floating-point values
subps,
/// Subtract scalar single-precision floating-point values
subss,
/// Unordered compare scalar single-precision floating-point values
ucomiss,
/// Bitwise logical xor of packed single precision floating-point values
xorps,
/// Add double precision floating point values
/// Add packed double-precision floating-point values
addpd,
/// Add scalar double-precision floating-point values
addsd,
/// Bitwise logical and not of packed double precision floating-point values
andnpd,
@ -221,14 +235,22 @@ pub const Inst = struct {
cvtsi2sd,
/// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
cvtss2sd,
/// Divide packed double-precision floating-point values
divpd,
/// Divide scalar double-precision floating-point values
divsd,
/// Return maximum double-precision floating-point value
/// Maximum of packed double-precision floating-point values
maxpd,
/// Maximum of scalar double-precision floating-point values
maxsd,
/// Return minimum double-precision floating-point value
/// Minimum of packed double-precision floating-point values
minpd,
/// Minimum of scalar double-precision floating-point values
minsd,
/// Move scalar double-precision floating-point value
movsd,
/// Multiply packed double-precision floating-point values
mulpd,
/// Multiply scalar double-precision floating-point values
mulsd,
/// Bitwise logical or of packed double precision floating-point values
@ -263,6 +285,8 @@ pub const Inst = struct {
sqrtpd,
/// Square root of scalar double precision floating-point value
sqrtsd,
/// Subtract packed double-precision floating-point values
subpd,
/// Subtract scalar double-precision floating-point values
subsd,
/// Unordered compare scalar double-precision floating-point values
@ -277,11 +301,23 @@ pub const Inst = struct {
/// Replicate single floating-point values
movsldup,
/// Round scalar double-precision floating-point values
/// Round packed double-precision floating-point values
roundpd,
/// Round packed single-precision floating-point values
roundps,
/// Round scalar double-precision floating-point value
roundsd,
/// Round scalar single-precision floating-point values
/// Round scalar single-precision floating-point value
roundss,
/// Add packed double-precision floating-point values
vaddpd,
/// Add packed single-precision floating-point values
vaddps,
/// Add scalar double-precision floating-point values
vaddsd,
/// Add scalar single-precision floating-point values
vaddss,
/// Convert scalar double-precision floating-point value to scalar single-precision floating-point value
vcvtsd2ss,
/// Convert doubleword integer to scalar double-precision floating-point value
@ -290,6 +326,30 @@ pub const Inst = struct {
vcvtsi2ss,
/// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
vcvtss2sd,
/// Divide packed double-precision floating-point values
vdivpd,
/// Divide packed single-precision floating-point values
vdivps,
/// Divide scalar double-precision floating-point values
vdivsd,
/// Divide scalar single-precision floating-point values
vdivss,
/// Maximum of packed double-precision floating-point values
vmaxpd,
/// Maximum of packed single-precision floating-point values
vmaxps,
/// Maximum of scalar double-precision floating-point values
vmaxsd,
/// Maximum of scalar single-precision floating-point values
vmaxss,
/// Minimum of packed double-precision floating-point values
vminpd,
/// Minimum of packed single-precision floating-point values
vminps,
/// Minimum of scalar double-precision floating-point values
vminsd,
/// Minimum of scalar single-precision floating-point values
vminss,
/// Move aligned packed double-precision floating-point values
vmovapd,
/// Move aligned packed single-precision floating-point values
@ -308,6 +368,14 @@ pub const Inst = struct {
vmovupd,
/// Move unaligned packed single-precision floating-point values
vmovups,
/// Multiply packed double-precision floating-point values
vmulpd,
/// Multiply packed single-precision floating-point values
vmulps,
/// Multiply scalar double-precision floating-point values
vmulsd,
/// Multiply scalar single-precision floating-point values
vmulss,
/// Extract word
vpextrw,
/// Insert word
@ -338,6 +406,14 @@ pub const Inst = struct {
vpunpcklqdq,
/// Unpack low data
vpunpcklwd,
/// Round packed double-precision floating-point values
vroundpd,
/// Round packed single-precision floating-point values
vroundps,
/// Round scalar double-precision floating-point value
vroundsd,
/// Round scalar single-precision floating-point value
vroundss,
/// Square root of packed double-precision floating-point value
vsqrtpd,
/// Square root of packed single-precision floating-point value
@ -346,6 +422,14 @@ pub const Inst = struct {
vsqrtsd,
/// Square root of scalar single-precision floating-point value
vsqrtss,
/// Subtract packed double-precision floating-point values
vsubpd,
/// Subtract packed single-precision floating-point values
vsubps,
/// Subtract scalar double-precision floating-point values
vsubsd,
/// Subtract scalar single-precision floating-point values
vsubss,
/// Convert 16-bit floating-point values to single-precision floating-point values
vcvtph2ps,
@ -442,6 +526,9 @@ pub const Inst = struct {
/// Register, register, register operands.
/// Uses `rrr` payload.
rrr,
/// Register, register, register, immediate (byte) operands.
/// Uses `rrri` payload.
rrri,
/// Register, register, immediate (sign-extended) operands.
/// Uses `rri` payload.
rri_s,
@ -625,6 +712,12 @@ pub const Inst = struct {
r2: Register,
r3: Register,
},
rrri: struct {
r1: Register,
r2: Register,
r3: Register,
i: u8,
},
rri: struct {
r1: Register,
r2: Register,

View File

@ -837,6 +837,8 @@ pub const table = [_]Entry{
.{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none },
// SSE
.{ .addps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .none, .sse },
.{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .none, .sse },
.{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .none, .sse },
@ -848,10 +850,16 @@ pub const table = [_]Entry{
.{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .none, .sse },
.{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .long, .sse },
.{ .divps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .none, .sse },
.{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .none, .sse },
.{ .maxps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .none, .sse },
.{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .none, .sse },
.{ .minps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .none, .sse },
.{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .none, .sse },
.{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse },
@ -863,10 +871,14 @@ pub const table = [_]Entry{
.{ .movups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .none, .sse },
.{ .movups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .none, .sse },
.{ .mulps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .none, .sse },
.{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .none, .sse },
.{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse },
.{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse },
.{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse },
.{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },
@ -878,6 +890,8 @@ pub const table = [_]Entry{
.{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse },
// SSE2
.{ .addpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .none, .sse2 },
.{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .none, .sse2 },
.{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .none, .sse2 },
@ -893,10 +907,16 @@ pub const table = [_]Entry{
.{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .none, .sse2 },
.{ .divpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .none, .sse2 },
.{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .none, .sse2 },
.{ .maxpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .none, .sse2 },
.{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .none, .sse2 },
.{ .minpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .none, .sse2 },
.{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .none, .sse2 },
.{ .movapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .none, .sse2 },
@ -914,6 +934,8 @@ pub const table = [_]Entry{
.{ .movupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .none, .sse2 },
.{ .movupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .none, .sse2 },
.{ .mulpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .none, .sse2 },
.{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .none, .sse2 },
.{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 },
@ -947,6 +969,8 @@ pub const table = [_]Entry{
.{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 },
.{ .subpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .none, .sse2 },
.{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .none, .sse2 },
.{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .none, .sse2 },
@ -966,10 +990,25 @@ pub const table = [_]Entry{
// SSE4.1
.{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
.{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
.{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
.{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 },
.{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 },
// AVX
.{ .vaddpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_128_wig, .avx },
.{ .vaddpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_256_wig, .avx },
.{ .vaddps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .vex_128_wig, .avx },
.{ .vaddps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x58 }, 0, .vex_256_wig, .avx },
.{ .vaddsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
.{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
.{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
.{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx },
@ -980,6 +1019,36 @@ pub const table = [_]Entry{
.{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
.{ .vdivpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_128_wig, .avx },
.{ .vdivpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_256_wig, .avx },
.{ .vdivps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .vex_128_wig, .avx },
.{ .vdivps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5e }, 0, .vex_256_wig, .avx },
.{ .vdivsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
.{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
.{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
.{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
.{ .vmaxps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
.{ .vmaxps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
.{ .vmaxsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx },
.{ .vmaxss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx },
.{ .vminpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_128_wig, .avx },
.{ .vminpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_256_wig, .avx },
.{ .vminps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .vex_128_wig, .avx },
.{ .vminps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5d }, 0, .vex_256_wig, .avx },
.{ .vminsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx },
.{ .vminss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx },
.{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128_wig, .avx },
.{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128_wig, .avx },
.{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256_wig, .avx },
@ -1019,6 +1088,16 @@ pub const table = [_]Entry{
.{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256_wig, .avx },
.{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256_wig, .avx },
.{ .vmulpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_128_wig, .avx },
.{ .vmulpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_256_wig, .avx },
.{ .vmulps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .vex_128_wig, .avx },
.{ .vmulps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x59 }, 0, .vex_256_wig, .avx },
.{ .vmulsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
.{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
.{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .vex_128_wig, .avx },
.{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
@ -1041,6 +1120,16 @@ pub const table = [_]Entry{
.{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx },
.{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx },
.{ .vroundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_128_wig, .avx },
.{ .vroundpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_256_wig, .avx },
.{ .vroundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_128_wig, .avx },
.{ .vroundps, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_256_wig, .avx },
.{ .vroundsd, .rvmi, &.{ .xmm, .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .vex_lig_wig, .avx },
.{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx },
.{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx },
.{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx },
@ -1051,6 +1140,16 @@ pub const table = [_]Entry{
.{ .vsqrtss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .vex_lig_wig, .avx },
.{ .vsubpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_128_wig, .avx },
.{ .vsubpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_256_wig, .avx },
.{ .vsubps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .vex_128_wig, .avx },
.{ .vsubps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5c }, 0, .vex_256_wig, .avx },
.{ .vsubsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx },
.{ .vsubss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx },
// F16C
.{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c },
.{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c },