x86_64: implement @splat

This commit is contained in:
Jacob Young 2023-05-09 03:15:27 -04:00
parent 1336619979
commit c23e80e671
5 changed files with 270 additions and 9 deletions

View File

@ -8561,7 +8561,8 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag {
},
32 => switch (ty.vectorLen()) {
1 => return if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov },
2...4 => return if (self.hasFeature(.avx))
2 => return if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov },
3...4 => return if (self.hasFeature(.avx))
if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
5...8 => if (self.hasFeature(.avx))
@ -8577,6 +8578,14 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag {
return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
else => {},
},
128 => switch (ty.vectorLen()) {
1 => return if (self.hasFeature(.avx))
if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }
else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu },
2 => if (self.hasFeature(.avx))
return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu },
else => {},
},
else => {},
},
else => {},
@ -9939,9 +9948,200 @@ fn airErrorName(self: *Self, inst: Air.Inst.Index) !void {
/// Lowers an AIR `splat` instruction, which fills every lane of the result
/// vector with the single scalar operand.
///
/// Only floating-point scalars of 32, 64 and 128 bits are handled, and only for
/// the vector lengths listed in the switches below. Every other combination
/// (including 16- and 80-bit floats and all non-float scalars) falls out of the
/// switch and is reported through `self.fail` as a TODO.
///
/// Each handled case leaves the result in a register and breaks out of the
/// `result` block with it; `finishAir` is then called with the operand so its
/// liveness is processed.
fn airSplat(self: *Self, inst: Air.Inst.Index) !void {
    const ty_op = self.air.instructions.items(.data)[inst].ty_op;
    const vector_ty = self.air.typeOfIndex(inst);
    const dst_rc = regClassForType(vector_ty);
    const scalar_ty = vector_ty.scalarType();
    const src_mcv = try self.resolveInst(ty_op.operand);
    const result: MCValue = result: {
        switch (scalar_ty.zigTypeTag()) {
            else => {},
            .Float => switch (scalar_ty.floatBits(self.target.*)) {
                32 => switch (vector_ty.vectorLen()) {
                    // A one-element vector is just the scalar itself.
                    1 => {
                        if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                        try self.genSetReg(dst_reg, scalar_ty, src_mcv);
                        break :result .{ .register = dst_reg };
                    },
                    2...4 => {
                        if (self.hasFeature(.avx)) {
                            const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                            if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                .{ .v_ss, .broadcast },
                                dst_reg.to128(),
                                src_mcv.mem(.dword),
                            ) else {
                                const src_reg = if (src_mcv.isRegister())
                                    src_mcv.getReg().?
                                else
                                    try self.copyToTmpRegister(scalar_ty, src_mcv);
                                // Immediate 0 selects element 0 of the source for every lane.
                                try self.asmRegisterRegisterRegisterImmediate(
                                    .{ .v_ps, .shuf },
                                    dst_reg.to128(),
                                    src_reg.to128(),
                                    src_reg.to128(),
                                    Immediate.u(0),
                                );
                            }
                            break :result .{ .register = dst_reg };
                        } else {
                            // The two-operand SSE shuffle works in place, so the
                            // scalar has to be in the destination register first.
                            const dst_mcv = if (src_mcv.isRegister() and
                                self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
                                src_mcv
                            else
                                try self.copyToRegisterWithInstTracking(inst, scalar_ty, src_mcv);
                            const dst_reg = dst_mcv.getReg().?;
                            try self.asmRegisterRegisterImmediate(
                                .{ ._ps, .shuf },
                                dst_reg.to128(),
                                dst_reg.to128(),
                                Immediate.u(0),
                            );
                            break :result dst_mcv;
                        }
                    },
                    // Needs a 256-bit register; without AVX this falls through to the TODO.
                    5...8 => if (self.hasFeature(.avx)) {
                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
                            .{ .v_ss, .broadcast },
                            dst_reg.to256(),
                            src_mcv.mem(.dword),
                        ) else {
                            const src_reg = if (src_mcv.isRegister())
                                src_mcv.getReg().?
                            else
                                try self.copyToTmpRegister(scalar_ty, src_mcv);
                            // The register-source form of the broadcast is only
                            // used when AVX2 is available.
                            if (self.hasFeature(.avx2)) try self.asmRegisterRegister(
                                .{ .v_ss, .broadcast },
                                dst_reg.to256(),
                                src_reg.to128(),
                            ) else {
                                // Splat within the low 128 bits, then copy that
                                // half into the upper 128 bits.
                                try self.asmRegisterRegisterRegisterImmediate(
                                    .{ .v_ps, .shuf },
                                    dst_reg.to128(),
                                    src_reg.to128(),
                                    src_reg.to128(),
                                    Immediate.u(0),
                                );
                                try self.asmRegisterRegisterRegisterImmediate(
                                    .{ .v_f128, .insert },
                                    dst_reg.to256(),
                                    dst_reg.to256(),
                                    dst_reg.to128(),
                                    Immediate.u(1),
                                );
                            }
                        }
                        break :result .{ .register = dst_reg };
                    },
                    else => {},
                },
                64 => switch (vector_ty.vectorLen()) {
                    // A one-element vector is just the scalar itself.
                    1 => {
                        if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                        try self.genSetReg(dst_reg, scalar_ty, src_mcv);
                        break :result .{ .register = dst_reg };
                    },
                    2 => {
                        if (self.hasFeature(.sse3)) {
                            const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                            if (src_mcv.isMemory()) try self.asmRegisterMemory(
                                if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup },
                                dst_reg.to128(),
                                src_mcv.mem(.qword),
                            ) else try self.asmRegisterRegister(
                                if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup },
                                dst_reg.to128(),
                                (if (src_mcv.isRegister())
                                    src_mcv.getReg().?
                                else
                                    try self.copyToTmpRegister(scalar_ty, src_mcv)).to128(),
                            );
                            break :result .{ .register = dst_reg };
                        } else {
                            // movlhps only writes the high half of its
                            // destination, so the scalar must already occupy
                            // the low half: place it in the destination
                            // register and duplicate it in place.
                            const dst_mcv = if (src_mcv.isRegister() and
                                self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
                                src_mcv
                            else
                                try self.copyToRegisterWithInstTracking(inst, scalar_ty, src_mcv);
                            const dst_reg = dst_mcv.getReg().?;
                            try self.asmRegisterRegister(
                                .{ ._ps, .movlh },
                                dst_reg.to128(),
                                dst_reg.to128(),
                            );
                            break :result dst_mcv;
                        }
                    },
                    // Needs a 256-bit register; without AVX this falls through to the TODO.
                    3...4 => if (self.hasFeature(.avx)) {
                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
                            .{ .v_sd, .broadcast },
                            dst_reg.to256(),
                            src_mcv.mem(.qword),
                        ) else {
                            const src_reg = if (src_mcv.isRegister())
                                src_mcv.getReg().?
                            else
                                try self.copyToTmpRegister(scalar_ty, src_mcv);
                            if (self.hasFeature(.avx2)) try self.asmRegisterRegister(
                                .{ .v_sd, .broadcast },
                                dst_reg.to256(),
                                src_reg.to128(),
                            ) else {
                                // Duplicate within the low 128 bits, then copy
                                // that half into the upper 128 bits.
                                try self.asmRegisterRegister(
                                    .{ .v_, .movddup },
                                    dst_reg.to128(),
                                    src_reg.to128(),
                                );
                                try self.asmRegisterRegisterRegisterImmediate(
                                    .{ .v_f128, .insert },
                                    dst_reg.to256(),
                                    dst_reg.to256(),
                                    dst_reg.to128(),
                                    Immediate.u(1),
                                );
                            }
                        }
                        break :result .{ .register = dst_reg };
                    },
                    else => {},
                },
                128 => switch (vector_ty.vectorLen()) {
                    // A one-element vector is just the scalar itself.
                    1 => {
                        if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv;
                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                        try self.genSetReg(dst_reg, scalar_ty, src_mcv);
                        break :result .{ .register = dst_reg };
                    },
                    2 => if (self.hasFeature(.avx)) {
                        const dst_reg = try self.register_manager.allocReg(inst, dst_rc);
                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
                            .{ .v_f128, .broadcast },
                            dst_reg.to256(),
                            src_mcv.mem(.xword),
                        ) else {
                            const src_reg = if (src_mcv.isRegister())
                                src_mcv.getReg().?
                            else
                                try self.copyToTmpRegister(scalar_ty, src_mcv);
                            // The low half comes from the first source operand
                            // and the upper half is replaced by the same
                            // 128-bit value.
                            try self.asmRegisterRegisterRegisterImmediate(
                                .{ .v_f128, .insert },
                                dst_reg.to256(),
                                src_reg.to256(),
                                src_reg.to128(),
                                Immediate.u(1),
                            );
                        }
                        break :result .{ .register = dst_reg };
                    },
                    else => {},
                },
                // Not implemented yet; reported as a TODO below.
                16, 80 => {},
                else => unreachable,
            },
        }
        return self.fail("TODO implement airSplat for {}", .{
            vector_ty.fmt(self.bin_file.options.module.?),
        });
    };
    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
}
fn airSelect(self: *Self, inst: Air.Inst.Index) !void {

View File

@ -270,10 +270,12 @@ pub const Mnemonic = enum {
divps, divss,
maxps, maxss,
minps, minss,
movaps, movhlps, movss, movups,
movaps, movhlps, movlhps,
movss, movups,
mulps, mulss,
orps,
pextrw, pinsrw,
shufps,
sqrtps, sqrtss,
subps, subss,
ucomiss,
@ -296,6 +298,7 @@ pub const Mnemonic = enum {
psrld, psrlq, psrlw,
punpckhbw, punpckhdq, punpckhqdq, punpckhwd,
punpcklbw, punpckldq, punpcklqdq, punpcklwd,
shufpd,
sqrtpd, sqrtsd,
subpd, subsd,
ucomisd,
@ -303,17 +306,22 @@ pub const Mnemonic = enum {
// SSE3
movddup, movshdup, movsldup,
// SSE4.1
extractps,
insertps,
pextrb, pextrd, pextrq,
pinsrb, pinsrd, pinsrq,
roundpd, roundps, roundsd, roundss,
// AVX
vaddpd, vaddps, vaddsd, vaddss,
vbroadcastf128, vbroadcastsd, vbroadcastss,
vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd,
vdivpd, vdivps, vdivsd, vdivss,
vextractf128, vextractps,
vinsertf128, vinsertps,
vmaxpd, vmaxps, vmaxsd, vmaxss,
vminpd, vminps, vminsd, vminss,
vmovapd, vmovaps,
vmovddup, vmovhlps,
vmovddup, vmovhlps, vmovlhps,
vmovsd,
vmovshdup, vmovsldup,
vmovss,
@ -326,6 +334,7 @@ pub const Mnemonic = enum {
vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd,
vroundpd, vroundps, vroundsd, vroundss,
vshufpd, vshufps,
vsqrtpd, vsqrtps, vsqrtsd, vsqrtss,
vsubpd, vsubps, vsubsd, vsubss,
// F16C

View File

@ -300,6 +300,8 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
else
.none,
}, mnemonic: {
@setEvalBranchQuota(2_000);
comptime var max_len = 0;
inline for (@typeInfo(Mnemonic).Enum.fields) |field| max_len = @max(field.name.len, max_len);
var buf: [max_len]u8 = undefined;

View File

@ -256,6 +256,8 @@ pub const Inst = struct {
v_sd,
/// VEX-Encoded ___ Packed Double-Precision Values
v_pd,
/// VEX-Encoded ___ 128-Bits Of Floating-Point Data
v_f128,
/// Mask ___ Byte
k_b,
@ -454,6 +456,8 @@ pub const Inst = struct {
mova,
/// Move packed single-precision floating-point values high to low
movhl,
/// Move packed single-precision floating-point values low to high
movlh,
/// Move unaligned packed single-precision floating-point values
/// Move unaligned packed double-precision floating-point values
movu,
@ -488,6 +492,9 @@ pub const Inst = struct {
cvtsi2sd,
/// Convert scalar single-precision floating-point value to scalar double-precision floating-point value
cvtss2sd,
/// Packed interleave shuffle of quadruplets of single-precision floating-point values
/// Packed interleave shuffle of pairs of double-precision floating-point values
shuf,
/// Shuffle packed high words
shufh,
/// Shuffle packed low words
@ -520,12 +527,20 @@ pub const Inst = struct {
/// Replicate single floating-point values
movsldup,
/// Extract packed floating-point values
extract,
/// Insert scalar single-precision floating-point value
/// Insert packed floating-point values
insert,
/// Round packed single-precision floating-point values
/// Round scalar single-precision floating-point value
/// Round packed double-precision floating-point values
/// Round scalar double-precision floating-point value
round,
/// Load with broadcast floating-point data
broadcast,
/// Convert 16-bit floating-point values to single-precision floating-point values
cvtph2ps,
/// Convert single-precision floating-point values to 16-bit floating-point values

View File

@ -867,6 +867,8 @@ pub const table = [_]Entry{
.{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
.{ .movlhps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .none, .sse },
.{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse },
.{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse },
@ -879,14 +881,16 @@ pub const table = [_]Entry{
.{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse },
.{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse },
.{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse },
.{ .shufps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .none, .sse },
.{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },
.{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .none, .sse },
.{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse },
.{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse },
.{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .none, .sse },
.{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse },
@ -967,6 +971,8 @@ pub const table = [_]Entry{
.{ .punpckldq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 },
.{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, .sse2 },
.{ .shufpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .none, .sse2 },
.{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 },
.{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 },
@ -990,6 +996,10 @@ pub const table = [_]Entry{
.{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 },
// SSE4.1
.{ .extractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .none, .sse4_1 },
.{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 },
.{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
.{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 },
.{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 },
@ -1019,6 +1029,11 @@ pub const table = [_]Entry{
.{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx },
.{ .vbroadcastss, .rm, &.{ .xmm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx },
.{ .vbroadcastss, .rm, &.{ .ymm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx },
.{ .vbroadcastsd, .rm, &.{ .ymm, .m64 }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx },
.{ .vbroadcastf128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x1a }, 0, .vex_256_w0, .avx },
.{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
.{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx },
@ -1039,6 +1054,14 @@ pub const table = [_]Entry{
.{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
.{ .vextractf128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x19 }, 0, .vex_256_w0, .avx },
.{ .vextractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .vex_128_wig, .avx },
.{ .vinsertf128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x18 }, 0, .vex_256_w0, .avx },
.{ .vinsertps, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .vex_128_wig, .avx },
.{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
.{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
@ -1074,6 +1097,8 @@ pub const table = [_]Entry{
.{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
.{ .vmovlhps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .vex_128_wig, .avx },
.{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
.{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
.{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
@ -1150,6 +1175,12 @@ pub const table = [_]Entry{
.{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx },
.{ .vshufpd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .vex_128_wig, .avx },
.{ .vshufpd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .vex_256_wig, .avx },
.{ .vshufps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .vex_128_wig, .avx },
.{ .vshufps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .vex_256_wig, .avx },
.{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx },
.{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx },
@ -1201,6 +1232,10 @@ pub const table = [_]Entry{
.{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w0, .fma },
// AVX2
.{ .vbroadcastss, .rm, &.{ .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx2 },
.{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
.{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 },
.{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 },
.{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },