mirror of
https://github.com/ziglang/zig.git
synced 2026-02-13 04:48:20 +00:00
x86_64: implement binary operations for f16 and f16 vectors
This commit is contained in:
parent
f8708e2c4d
commit
6778da4516
@ -4497,14 +4497,15 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void {
|
||||
const tag = self.air.instructions.items(.tag)[inst];
|
||||
try self.genBinOpMir(switch (ty_bits) {
|
||||
// No point using an extra prefix byte for *pd which performs the same operation.
|
||||
32, 64 => switch (tag) {
|
||||
16, 32, 64, 128 => switch (tag) {
|
||||
.neg => .xorps,
|
||||
.fabs => .andnps,
|
||||
else => unreachable,
|
||||
},
|
||||
else => return self.fail("TODO implement airFloatSign for {}", .{
|
||||
80 => return self.fail("TODO implement airFloatSign for {}", .{
|
||||
ty.fmt(self.bin_file.options.module.?),
|
||||
}),
|
||||
else => unreachable,
|
||||
}, vec_ty, dst_mcv, sign_mcv);
|
||||
return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none });
|
||||
}
|
||||
@ -6112,9 +6113,53 @@ fn genBinOp(
|
||||
return dst_mcv;
|
||||
}
|
||||
|
||||
const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
|
||||
const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
|
||||
else => unreachable,
|
||||
.Float => switch (lhs_ty.floatBits(self.target.*)) {
|
||||
16 => if (self.hasFeature(.f16c)) {
|
||||
const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
|
||||
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
|
||||
defer self.register_manager.unlockReg(tmp_lock);
|
||||
|
||||
if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
|
||||
.vpinsrw,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
src_mcv.mem(.word),
|
||||
Immediate.u(1),
|
||||
) else try self.asmRegisterRegisterRegister(
|
||||
.vpunpcklwd,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
(if (src_mcv.isRegister())
|
||||
src_mcv.getReg().?
|
||||
else
|
||||
try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
|
||||
);
|
||||
try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
|
||||
try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
|
||||
try self.asmRegisterRegisterRegister(
|
||||
switch (air_tag) {
|
||||
.add => .vaddss,
|
||||
.sub => .vsubss,
|
||||
.div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
|
||||
.max => .vmaxss,
|
||||
.min => .vmaxss,
|
||||
else => unreachable,
|
||||
},
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
tmp_reg,
|
||||
);
|
||||
try self.asmRegisterRegisterImmediate(
|
||||
.vcvtps2ph,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
Immediate.u(0b1_00),
|
||||
);
|
||||
return dst_mcv;
|
||||
} else null,
|
||||
32 => switch (air_tag) {
|
||||
.add => if (self.hasFeature(.avx)) .vaddss else .addss,
|
||||
.sub => if (self.hasFeature(.avx)) .vsubss else .subss,
|
||||
@ -6141,12 +6186,178 @@ fn genBinOp(
|
||||
.min => if (self.hasFeature(.avx)) .vminsd else .minsd,
|
||||
else => unreachable,
|
||||
},
|
||||
16, 80, 128 => null,
|
||||
80, 128 => null,
|
||||
else => unreachable,
|
||||
},
|
||||
.Vector => switch (lhs_ty.childType().zigTypeTag()) {
|
||||
else => null,
|
||||
.Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
|
||||
16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) {
|
||||
1 => {
|
||||
const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
|
||||
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
|
||||
defer self.register_manager.unlockReg(tmp_lock);
|
||||
|
||||
if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
|
||||
.vpinsrw,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
src_mcv.mem(.word),
|
||||
Immediate.u(1),
|
||||
) else try self.asmRegisterRegisterRegister(
|
||||
.vpunpcklwd,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
(if (src_mcv.isRegister())
|
||||
src_mcv.getReg().?
|
||||
else
|
||||
try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
|
||||
);
|
||||
try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
|
||||
try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
|
||||
try self.asmRegisterRegisterRegister(
|
||||
switch (air_tag) {
|
||||
.add => .vaddss,
|
||||
.sub => .vsubss,
|
||||
.div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
|
||||
.max => .vmaxss,
|
||||
.min => .vmaxss,
|
||||
else => unreachable,
|
||||
},
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
tmp_reg,
|
||||
);
|
||||
try self.asmRegisterRegisterImmediate(
|
||||
.vcvtps2ph,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
Immediate.u(0b1_00),
|
||||
);
|
||||
return dst_mcv;
|
||||
},
|
||||
2 => {
|
||||
const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
|
||||
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
|
||||
defer self.register_manager.unlockReg(tmp_lock);
|
||||
|
||||
if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
|
||||
.vpinsrd,
|
||||
dst_reg,
|
||||
src_mcv.mem(.dword),
|
||||
Immediate.u(1),
|
||||
) else try self.asmRegisterRegisterRegister(
|
||||
.vunpcklps,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
(if (src_mcv.isRegister())
|
||||
src_mcv.getReg().?
|
||||
else
|
||||
try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
|
||||
);
|
||||
try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
|
||||
try self.asmRegisterRegisterRegister(.vmovhlps, tmp_reg, dst_reg, dst_reg);
|
||||
try self.asmRegisterRegisterRegister(
|
||||
switch (air_tag) {
|
||||
.add => .vaddps,
|
||||
.sub => .vsubps,
|
||||
.div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
|
||||
.max => .vmaxps,
|
||||
.min => .vmaxps,
|
||||
else => unreachable,
|
||||
},
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
tmp_reg,
|
||||
);
|
||||
try self.asmRegisterRegisterImmediate(
|
||||
.vcvtps2ph,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
Immediate.u(0b1_00),
|
||||
);
|
||||
return dst_mcv;
|
||||
},
|
||||
3...4 => {
|
||||
const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
|
||||
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
|
||||
defer self.register_manager.unlockReg(tmp_lock);
|
||||
|
||||
try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
|
||||
if (src_mcv.isMemory()) try self.asmRegisterMemory(
|
||||
.vcvtph2ps,
|
||||
tmp_reg,
|
||||
src_mcv.mem(.qword),
|
||||
) else try self.asmRegisterRegister(
|
||||
.vcvtph2ps,
|
||||
tmp_reg,
|
||||
(if (src_mcv.isRegister())
|
||||
src_mcv.getReg().?
|
||||
else
|
||||
try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
|
||||
);
|
||||
try self.asmRegisterRegisterRegister(
|
||||
switch (air_tag) {
|
||||
.add => .vaddps,
|
||||
.sub => .vsubps,
|
||||
.div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
|
||||
.max => .vmaxps,
|
||||
.min => .vmaxps,
|
||||
else => unreachable,
|
||||
},
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
tmp_reg,
|
||||
);
|
||||
try self.asmRegisterRegisterImmediate(
|
||||
.vcvtps2ph,
|
||||
dst_reg,
|
||||
dst_reg,
|
||||
Immediate.u(0b1_00),
|
||||
);
|
||||
return dst_mcv;
|
||||
},
|
||||
5...8 => {
|
||||
const tmp_reg = (try self.register_manager.allocReg(null, sse)).to256();
|
||||
const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
|
||||
defer self.register_manager.unlockReg(tmp_lock);
|
||||
|
||||
try self.asmRegisterRegister(.vcvtph2ps, dst_reg.to256(), dst_reg);
|
||||
if (src_mcv.isMemory()) try self.asmRegisterMemory(
|
||||
.vcvtph2ps,
|
||||
tmp_reg,
|
||||
src_mcv.mem(.xword),
|
||||
) else try self.asmRegisterRegister(
|
||||
.vcvtph2ps,
|
||||
tmp_reg,
|
||||
(if (src_mcv.isRegister())
|
||||
src_mcv.getReg().?
|
||||
else
|
||||
try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
|
||||
);
|
||||
try self.asmRegisterRegisterRegister(
|
||||
switch (air_tag) {
|
||||
.add => .vaddps,
|
||||
.sub => .vsubps,
|
||||
.div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
|
||||
.max => .vmaxps,
|
||||
.min => .vmaxps,
|
||||
else => unreachable,
|
||||
},
|
||||
dst_reg.to256(),
|
||||
dst_reg.to256(),
|
||||
tmp_reg,
|
||||
);
|
||||
try self.asmRegisterRegisterImmediate(
|
||||
.vcvtps2ph,
|
||||
dst_reg,
|
||||
dst_reg.to256(),
|
||||
Immediate.u(0b1_00),
|
||||
);
|
||||
return dst_mcv;
|
||||
},
|
||||
else => null,
|
||||
} else null,
|
||||
32 => switch (lhs_ty.vectorLen()) {
|
||||
1 => switch (air_tag) {
|
||||
.add => if (self.hasFeature(.avx)) .vaddss else .addss,
|
||||
@ -6223,14 +6434,13 @@ fn genBinOp(
|
||||
} else null,
|
||||
else => null,
|
||||
},
|
||||
16, 80, 128 => null,
|
||||
80, 128 => null,
|
||||
else => unreachable,
|
||||
},
|
||||
},
|
||||
})) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
|
||||
@tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
|
||||
});
|
||||
const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
|
||||
if (self.hasFeature(.avx)) {
|
||||
const src1_alias =
|
||||
if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
|
||||
@ -7139,21 +7349,21 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
|
||||
const tmp2_lock = self.register_manager.lockRegAssumeUnused(tmp2_reg);
|
||||
defer self.register_manager.unlockReg(tmp2_lock);
|
||||
|
||||
if (src_mcv.isRegister())
|
||||
try self.asmRegisterRegisterRegister(
|
||||
.vpunpcklwd,
|
||||
tmp1_reg,
|
||||
dst_reg.to128(),
|
||||
src_mcv.getReg().?.to128(),
|
||||
)
|
||||
else
|
||||
try self.asmRegisterRegisterMemoryImmediate(
|
||||
.vpinsrw,
|
||||
tmp1_reg,
|
||||
dst_reg.to128(),
|
||||
src_mcv.mem(.word),
|
||||
Immediate.u(1),
|
||||
);
|
||||
if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
|
||||
.vpinsrw,
|
||||
tmp1_reg,
|
||||
dst_reg.to128(),
|
||||
src_mcv.mem(.word),
|
||||
Immediate.u(1),
|
||||
) else try self.asmRegisterRegisterRegister(
|
||||
.vpunpcklwd,
|
||||
tmp1_reg,
|
||||
dst_reg.to128(),
|
||||
(if (src_mcv.isRegister())
|
||||
src_mcv.getReg().?
|
||||
else
|
||||
try self.copyToTmpRegister(ty, src_mcv)).to128(),
|
||||
);
|
||||
try self.asmRegisterRegister(.vcvtph2ps, tmp1_reg, tmp1_reg);
|
||||
try self.asmRegisterRegister(.vmovshdup, tmp2_reg, tmp1_reg);
|
||||
try self.genBinOpMir(.ucomiss, ty, tmp1_mcv, tmp2_mcv);
|
||||
@ -8139,7 +8349,16 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag {
|
||||
},
|
||||
.Vector => switch (ty.childType().zigTypeTag()) {
|
||||
.Float => switch (ty.childType().floatBits(self.target.*)) {
|
||||
16 => unreachable, // needs special handling
|
||||
16 => switch (ty.vectorLen()) {
|
||||
1 => unreachable, // needs special handling
|
||||
2 => return if (self.hasFeature(.avx)) .vmovss else .movss,
|
||||
3...4 => return if (self.hasFeature(.avx)) .vmovsd else .movsd,
|
||||
5...8 => return if (self.hasFeature(.avx))
|
||||
if (aligned) .vmovaps else .vmovups
|
||||
else if (aligned) .movaps else .movups,
|
||||
9...16 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups,
|
||||
else => {},
|
||||
},
|
||||
32 => switch (ty.vectorLen()) {
|
||||
1 => return if (self.hasFeature(.avx)) .vmovss else .movss,
|
||||
2...4 => return if (self.hasFeature(.avx))
|
||||
|
||||
@ -270,7 +270,7 @@ pub const Mnemonic = enum {
|
||||
divps, divss,
|
||||
maxps, maxss,
|
||||
minps, minss,
|
||||
movaps, movss, movups,
|
||||
movaps, movhlps, movss, movups,
|
||||
mulps, mulss,
|
||||
orps,
|
||||
pextrw, pinsrw,
|
||||
@ -303,6 +303,8 @@ pub const Mnemonic = enum {
|
||||
// SSE3
|
||||
movddup, movshdup, movsldup,
|
||||
// SSE4.1
|
||||
pextrb, pextrd, pextrq,
|
||||
pinsrb, pinsrd, pinsrq,
|
||||
roundpd, roundps, roundsd, roundss,
|
||||
// AVX
|
||||
vaddpd, vaddps, vaddsd, vaddss,
|
||||
@ -311,13 +313,14 @@ pub const Mnemonic = enum {
|
||||
vmaxpd, vmaxps, vmaxsd, vmaxss,
|
||||
vminpd, vminps, vminsd, vminss,
|
||||
vmovapd, vmovaps,
|
||||
vmovddup,
|
||||
vmovddup, vmovhlps,
|
||||
vmovsd,
|
||||
vmovshdup, vmovsldup,
|
||||
vmovss,
|
||||
vmovupd, vmovups,
|
||||
vmulpd, vmulps, vmulsd, vmulss,
|
||||
vpextrw, vpinsrw,
|
||||
vpextrb, vpextrd, vpextrq, vpextrw,
|
||||
vpinsrb, vpinsrd, vpinsrq, vpinsrw,
|
||||
vpshufhw, vpshuflw,
|
||||
vpsrld, vpsrlq, vpsrlw,
|
||||
vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
|
||||
@ -359,7 +362,7 @@ pub const Op = enum {
|
||||
cl,
|
||||
r8, r16, r32, r64,
|
||||
rm8, rm16, rm32, rm64,
|
||||
r32_m16, r64_m16,
|
||||
r32_m8, r32_m16, r64_m16,
|
||||
m8, m16, m32, m64, m80, m128, m256,
|
||||
rel8, rel16, rel32,
|
||||
m,
|
||||
@ -444,7 +447,7 @@ pub const Op = enum {
|
||||
pub fn immBitSize(op: Op) u64 {
|
||||
return switch (op) {
|
||||
.none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable,
|
||||
.al, .cl, .r8, .rm8 => unreachable,
|
||||
.al, .cl, .r8, .rm8, .r32_m8 => unreachable,
|
||||
.ax, .r16, .rm16 => unreachable,
|
||||
.eax, .r32, .rm32, .r32_m16 => unreachable,
|
||||
.rax, .r64, .rm64, .r64_m16 => unreachable,
|
||||
@ -467,7 +470,7 @@ pub const Op = enum {
|
||||
.m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
|
||||
.al, .cl, .r8, .rm8 => 8,
|
||||
.ax, .r16, .rm16 => 16,
|
||||
.eax, .r32, .rm32, .r32_m16 => 32,
|
||||
.eax, .r32, .rm32, .r32_m8, .r32_m16 => 32,
|
||||
.rax, .r64, .rm64, .r64_m16 => 64,
|
||||
.xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
|
||||
.ymm, .ymm_m256 => 256,
|
||||
@ -480,7 +483,7 @@ pub const Op = enum {
|
||||
.unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
|
||||
.rel8, .rel16, .rel32 => unreachable,
|
||||
.al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable,
|
||||
.m8, .rm8 => 8,
|
||||
.m8, .rm8, .r32_m8 => 8,
|
||||
.m16, .rm16, .r32_m16, .r64_m16 => 16,
|
||||
.m32, .rm32, .xmm_m32 => 32,
|
||||
.m64, .rm64, .xmm_m64 => 64,
|
||||
@ -509,7 +512,7 @@ pub const Op = enum {
|
||||
.al, .ax, .eax, .rax,
|
||||
.r8, .r16, .r32, .r64,
|
||||
.rm8, .rm16, .rm32, .rm64,
|
||||
.r32_m16, .r64_m16,
|
||||
.r32_m8, .r32_m16, .r64_m16,
|
||||
.xmm, .xmm_m32, .xmm_m64, .xmm_m128,
|
||||
.ymm, .ymm_m256,
|
||||
=> true,
|
||||
@ -535,7 +538,7 @@ pub const Op = enum {
|
||||
// zig fmt: off
|
||||
return switch (op) {
|
||||
.rm8, .rm16, .rm32, .rm64,
|
||||
.r32_m16, .r64_m16,
|
||||
.r32_m8, .r32_m16, .r64_m16,
|
||||
.m8, .m16, .m32, .m64, .m80, .m128, .m256,
|
||||
.m,
|
||||
.xmm_m32, .xmm_m64, .xmm_m128,
|
||||
@ -559,7 +562,7 @@ pub const Op = enum {
|
||||
.al, .ax, .eax, .rax, .cl => .general_purpose,
|
||||
.r8, .r16, .r32, .r64 => .general_purpose,
|
||||
.rm8, .rm16, .rm32, .rm64 => .general_purpose,
|
||||
.r32_m16, .r64_m16 => .general_purpose,
|
||||
.r32_m8, .r32_m16, .r64_m16 => .general_purpose,
|
||||
.sreg => .segment,
|
||||
.xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point,
|
||||
.ymm, .ymm_m256 => .floating_point,
|
||||
|
||||
@ -137,6 +137,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
|
||||
.minps,
|
||||
.minss,
|
||||
.movaps,
|
||||
.movhlps,
|
||||
.movss,
|
||||
.movups,
|
||||
.mulps,
|
||||
@ -149,6 +150,8 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
|
||||
.subps,
|
||||
.subss,
|
||||
.ucomiss,
|
||||
.unpckhps,
|
||||
.unpcklps,
|
||||
.xorps,
|
||||
|
||||
.addpd,
|
||||
@ -187,12 +190,20 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
|
||||
.subpd,
|
||||
.subsd,
|
||||
.ucomisd,
|
||||
.unpckhpd,
|
||||
.unpcklpd,
|
||||
.xorpd,
|
||||
|
||||
.movddup,
|
||||
.movshdup,
|
||||
.movsldup,
|
||||
|
||||
.pextrb,
|
||||
.pextrd,
|
||||
.pextrq,
|
||||
.pinsrb,
|
||||
.pinsrd,
|
||||
.pinsrq,
|
||||
.roundpd,
|
||||
.roundps,
|
||||
.roundsd,
|
||||
@ -221,6 +232,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
|
||||
.vmovapd,
|
||||
.vmovaps,
|
||||
.vmovddup,
|
||||
.vmovhlps,
|
||||
.vmovsd,
|
||||
.vmovshdup,
|
||||
.vmovsldup,
|
||||
@ -231,7 +243,13 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
|
||||
.vmulps,
|
||||
.vmulsd,
|
||||
.vmulss,
|
||||
.vpextrb,
|
||||
.vpextrd,
|
||||
.vpextrq,
|
||||
.vpextrw,
|
||||
.vpinsrb,
|
||||
.vpinsrd,
|
||||
.vpinsrq,
|
||||
.vpinsrw,
|
||||
.vpshufhw,
|
||||
.vpshuflw,
|
||||
@ -258,6 +276,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
|
||||
.vsubps,
|
||||
.vsubsd,
|
||||
.vsubss,
|
||||
.vunpckhpd,
|
||||
.vunpckhps,
|
||||
.vunpcklpd,
|
||||
.vunpcklps,
|
||||
|
||||
.vcvtph2ps,
|
||||
.vcvtps2ph,
|
||||
|
||||
@ -192,6 +192,8 @@ pub const Inst = struct {
|
||||
minss,
|
||||
/// Move aligned packed single-precision floating-point values
|
||||
movaps,
|
||||
/// Move packed single-precision floating-point values high to low
|
||||
movhlps,
|
||||
/// Move scalar single-precision floating-point value
|
||||
movss,
|
||||
/// Move unaligned packed single-precision floating-point values
|
||||
@ -216,6 +218,10 @@ pub const Inst = struct {
|
||||
subss,
|
||||
/// Unordered compare scalar single-precision floating-point values
|
||||
ucomiss,
|
||||
/// Unpack and interleave high packed single-precision floating-point values
|
||||
unpckhps,
|
||||
/// Unpack and interleave low packed single-precision floating-point values
|
||||
unpcklps,
|
||||
/// Bitwise logical xor of packed single-precision floating-point values
|
||||
xorps,
|
||||
|
||||
@ -291,6 +297,10 @@ pub const Inst = struct {
|
||||
subsd,
|
||||
/// Unordered compare scalar double-precision floating-point values
|
||||
ucomisd,
|
||||
/// Unpack and interleave high packed double-precision floating-point values
|
||||
unpckhpd,
|
||||
/// Unpack and interleave low packed double-precision floating-point values
|
||||
unpcklpd,
|
||||
/// Bitwise logical xor of packed double-precision floating-point values
|
||||
xorpd,
|
||||
|
||||
@ -301,6 +311,18 @@ pub const Inst = struct {
|
||||
/// Replicate single floating-point values
|
||||
movsldup,
|
||||
|
||||
/// Extract Byte
|
||||
pextrb,
|
||||
/// Extract Doubleword
|
||||
pextrd,
|
||||
/// Extract Quadword
|
||||
pextrq,
|
||||
/// Insert Byte
|
||||
pinsrb,
|
||||
/// Insert Doubleword
|
||||
pinsrd,
|
||||
/// Insert Quadword
|
||||
pinsrq,
|
||||
/// Round packed double-precision floating-point values
|
||||
roundpd,
|
||||
/// Round packed single-precision floating-point values
|
||||
@ -354,6 +376,8 @@ pub const Inst = struct {
|
||||
vmovapd,
|
||||
/// Move aligned packed single-precision floating-point values
|
||||
vmovaps,
|
||||
/// Move packed single-precision floating-point values high to low
|
||||
vmovhlps,
|
||||
/// Replicate double floating-point values
|
||||
vmovddup,
|
||||
/// Move or merge scalar double-precision floating-point value
|
||||
@ -376,8 +400,20 @@ pub const Inst = struct {
|
||||
vmulsd,
|
||||
/// Multiply scalar single-precision floating-point values
|
||||
vmulss,
|
||||
/// Extract Byte
|
||||
vpextrb,
|
||||
/// Extract Doubleword
|
||||
vpextrd,
|
||||
/// Extract Quadword
|
||||
vpextrq,
|
||||
/// Extract Word
|
||||
vpextrw,
|
||||
/// Insert Byte
|
||||
vpinsrb,
|
||||
/// Insert Doubleword
|
||||
vpinsrd,
|
||||
/// Insert Quadword
|
||||
vpinsrq,
|
||||
/// Insert Word
|
||||
vpinsrw,
|
||||
/// Shuffle packed high words
|
||||
@ -430,6 +466,14 @@ pub const Inst = struct {
|
||||
vsubsd,
|
||||
/// Subtract scalar single-precision floating-point values
|
||||
vsubss,
|
||||
/// Unpack and interleave high packed double-precision floating-point values
|
||||
vunpckhpd,
|
||||
/// Unpack and interleave high packed single-precision floating-point values
|
||||
vunpckhps,
|
||||
/// Unpack and interleave low packed double-precision floating-point values
|
||||
vunpcklpd,
|
||||
/// Unpack and interleave low packed single-precision floating-point values
|
||||
vunpcklps,
|
||||
|
||||
/// Convert 16-bit floating-point values to single-precision floating-point values
|
||||
vcvtph2ps,
|
||||
|
||||
@ -865,6 +865,8 @@ pub const table = [_]Entry{
|
||||
.{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse },
|
||||
.{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .none, .sse },
|
||||
|
||||
.{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
|
||||
|
||||
.{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse },
|
||||
.{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse },
|
||||
|
||||
@ -988,8 +990,16 @@ pub const table = [_]Entry{
|
||||
.{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 },
|
||||
|
||||
// SSE4.1
|
||||
.{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
|
||||
.{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 },
|
||||
.{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 },
|
||||
|
||||
.{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
|
||||
|
||||
.{ .pinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 },
|
||||
.{ .pinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 },
|
||||
.{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 },
|
||||
|
||||
.{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
|
||||
|
||||
.{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
|
||||
@ -1062,6 +1072,8 @@ pub const table = [_]Entry{
|
||||
.{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
|
||||
.{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx },
|
||||
|
||||
.{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
|
||||
|
||||
.{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
|
||||
.{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
|
||||
.{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
|
||||
@ -1098,9 +1110,17 @@ pub const table = [_]Entry{
|
||||
|
||||
.{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
|
||||
|
||||
.{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
|
||||
.{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
|
||||
.{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
|
||||
|
||||
.{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_wig, .avx },
|
||||
.{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
|
||||
|
||||
.{ .vpinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
|
||||
.{ .vpinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
|
||||
.{ .vpinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
|
||||
|
||||
.{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx },
|
||||
|
||||
.{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx },
|
||||
|
||||
@ -8,6 +8,8 @@ const has_f80_rt = switch (builtin.cpu.arch) {
|
||||
.x86_64, .x86 => true,
|
||||
else => false,
|
||||
};
|
||||
const no_x86_64_hardware_f16_support = builtin.zig_backend == .stage2_x86_64 and
|
||||
!std.Target.x86.featureSetHas(builtin.cpu.features, .f16c);
|
||||
|
||||
const epsilon_16 = 0.001;
|
||||
const epsilon = 0.000001;
|
||||
@ -52,8 +54,7 @@ fn testFloatComparisons() !void {
|
||||
}
|
||||
|
||||
test "different sized float comparisons" {
|
||||
if (builtin.zig_backend == .stage2_x86_64 and
|
||||
!comptime std.Target.x86.featureSetHas(builtin.cpu.features, .f16c)) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -152,7 +153,7 @@ fn testSqrtWithVectors() !void {
|
||||
}
|
||||
|
||||
test "more @sqrt f16 tests" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -202,7 +203,7 @@ fn testSqrtLegacy(comptime T: type, x: T) !void {
|
||||
}
|
||||
|
||||
test "@sin" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -241,7 +242,7 @@ fn testSinWithVectors() !void {
|
||||
}
|
||||
|
||||
test "@cos" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -280,7 +281,7 @@ fn testCosWithVectors() !void {
|
||||
}
|
||||
|
||||
test "@exp" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -318,7 +319,7 @@ fn testExpWithVectors() !void {
|
||||
}
|
||||
|
||||
test "@exp2" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -403,7 +404,7 @@ test "@log with @vectors" {
|
||||
}
|
||||
|
||||
test "@log2" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -445,7 +446,7 @@ fn testLog2WithVectors() !void {
|
||||
}
|
||||
|
||||
test "@log10" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -881,7 +882,7 @@ fn testTruncLegacy(comptime T: type, x: T) !void {
|
||||
}
|
||||
|
||||
test "negation f16" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -1040,7 +1041,6 @@ test "comptime_float zero divided by zero produces zero" {
|
||||
}
|
||||
|
||||
test "nan negation f16" {
|
||||
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
|
||||
@ -2,11 +2,11 @@ const std = @import("std");
|
||||
const builtin = @import("builtin");
|
||||
const expect = std.testing.expect;
|
||||
|
||||
const stage2_x86_64_without_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and
|
||||
const no_x86_64_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and
|
||||
!std.Target.x86.featureSetHas(builtin.cpu.features, .fma);
|
||||
|
||||
test "@mulAdd" {
|
||||
if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -120,7 +120,7 @@ fn vector32() !void {
|
||||
|
||||
test "vector f32" {
|
||||
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
|
||||
if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
@ -143,7 +143,7 @@ fn vector64() !void {
|
||||
|
||||
test "vector f64" {
|
||||
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
|
||||
if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
|
||||
if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
|
||||
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user