x86_64: rewrite unsafe int vector multiplication

This commit is contained in:
Jacob Young 2025-02-16 05:07:51 -05:00
parent d7b93c7876
commit cec6867d76
5 changed files with 5810 additions and 255 deletions

File diff suppressed because it is too large Load Diff

View File

@ -459,6 +459,7 @@ pub const Mnemonic = enum {
vhaddpd, vhaddps,
vinsertf128, vinsertps,
vlddqu, vldmxcsr,
vmaskmovpd, vmaskmovps,
vmaxpd, vmaxps, vmaxsd, vmaxss,
vminpd, vminps, vminsd, vminss,
vmovapd, vmovaps,
@ -481,6 +482,7 @@ pub const Mnemonic = enum {
vpblendvb, vpblendw, vpclmulqdq,
vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
vperm2f128, vpermilpd, vpermilps,
vpextrb, vpextrd, vpextrq, vpextrw,
vpinsrb, vpinsrd, vpinsrq, vpinsrw,
vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
@ -521,6 +523,9 @@ pub const Mnemonic = enum {
// AVX2
vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
vextracti128, vinserti128, vpblendd,
vperm2i128, vpermd, vpermpd, vpermps, vpermq,
vpmaskmovd, vpmaskmovq,
vpsllvd, vpsllvq, vpsravd, vpsrlvd, vpsrlvq,
// ADX
adcx, adox,
// AESKLE
@ -997,7 +1002,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
}
const mnemonic_to_encodings_map = init: {
@setEvalBranchQuota(5_600);
@setEvalBranchQuota(5_700);
const mnemonic_count = @typeInfo(Mnemonic).@"enum".fields.len;
var mnemonic_map: [mnemonic_count][]Data = @splat(&.{});
const encodings = @import("encodings.zig");

View File

@ -756,6 +756,8 @@ pub const Inst = struct {
/// Swap GS base register
swapgs,
/// Test condition
/// Logical compare
/// Packed bit test
@"test",
/// Undefined instruction
ud,
@ -1053,6 +1055,7 @@ pub const Inst = struct {
/// Blend scalar single-precision floating-point values
/// Blend packed double-precision floating-point values
/// Blend scalar double-precision floating-point values
/// Blend packed dwords
blend,
/// Variable blend packed single-precision floating-point values
/// Variable blend scalar single-precision floating-point values
@ -1127,20 +1130,37 @@ pub const Inst = struct {
sha256rnds,
// AVX
/// Load with broadcast floating-point data
/// Load integer and broadcast
broadcast,
/// Conditional SIMD packed loads and stores
/// Conditional SIMD integer packed loads and stores
maskmov,
/// Permute floating-point values
/// Permute integer values
perm2,
/// Permute in-lane pairs of double-precision floating-point values
/// Permute in-lane quadruples of single-precision floating-point values
permil,
// BMI
/// Bit field extract
bextr,
/// Extract lowest set isolated bit
/// Get mask up to lowest set bit
/// Reset lowest set bit
bls,
/// Load with broadcast floating-point data
/// Load integer and broadcast
broadcast,
/// Zero high bits starting with specified bit position
bzhi,
/// Count the number of trailing zero bits
tzcnt,
// BMI2
/// Zero high bits starting with specified bit position
bzhi,
/// Parallel bits deposit
pdep,
/// Parallel bits extract
pext,
// F16C
/// Convert 16-bit floating-point values to single-precision floating-point values
cvtph2,
@ -1164,6 +1184,19 @@ pub const Inst = struct {
/// Fused multiply-add of scalar double-precision floating-point values
fmadd231,
// AVX2
/// Permute packed doubleword elements
/// Permute packed qword elements
/// Permute double-precision floating-point elements
/// Permute single-precision floating-point elements
perm,
/// Variable bit shift left logical
sllv,
/// Variable bit shift right arithmetic
srav,
/// Variable bit shift right logical
srlv,
// ADX
/// Unsigned integer addition of two operands with overflow flag
ado,

View File

@ -1932,6 +1932,15 @@ pub const table = [_]Entry{
.{ .vldmxcsr, .m, &.{ .m32 }, &.{ 0x0f, 0xae }, 2, .vex_lz_wig, .avx },
.{ .vmaskmovps, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x2c }, 0, .vex_128_w0, .avx },
.{ .vmaskmovps, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x2c }, 0, .vex_256_w0, .avx },
.{ .vmaskmovpd, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x2d }, 0, .vex_128_w0, .avx },
.{ .vmaskmovpd, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x2d }, 0, .vex_256_w0, .avx },
.{ .vmaskmovps, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x2e }, 0, .vex_128_w0, .avx },
.{ .vmaskmovps, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x2e }, 0, .vex_256_w0, .avx },
.{ .vmaskmovpd, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x2f }, 0, .vex_128_w0, .avx },
.{ .vmaskmovpd, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x2f }, 0, .vex_256_w0, .avx },
.{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
.{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
@ -2097,6 +2106,18 @@ pub const table = [_]Entry{
.{ .vpcmpgtq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_128_wig, .avx },
.{ .vperm2f128, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x06 }, 0, .vex_256_w0, .avx },
.{ .vpermilpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0d }, 0, .vex_128_w0, .avx },
.{ .vpermilpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0d }, 0, .vex_256_w0, .avx },
.{ .vpermilpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x05 }, 0, .vex_128_w0, .avx },
.{ .vpermilpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x05 }, 0, .vex_256_w0, .avx },
.{ .vpermilps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0c }, 0, .vex_128_w0, .avx },
.{ .vpermilps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x04 }, 0, .vex_128_w0, .avx },
.{ .vpermilps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0c }, 0, .vex_256_w0, .avx },
.{ .vpermilps, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x04 }, 0, .vex_256_w0, .avx },
.{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
.{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
.{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
@ -2418,6 +2439,25 @@ pub const table = [_]Entry{
.{ .vpcmpgtq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_256_wig, .avx2 },
.{ .vperm2i128, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x46 }, 0, .vex_256_w0, .avx2 },
.{ .vpermd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x36 }, 0, .vex_256_w0, .avx2 },
.{ .vpermpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x01 }, 0, .vex_256_w1, .avx2 },
.{ .vpermps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x16 }, 0, .vex_256_w0, .avx2 },
.{ .vpermq, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x00 }, 0, .vex_256_w1, .avx2 },
.{ .vpmaskmovd, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w0, .avx2 },
.{ .vpmaskmovd, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w0, .avx2 },
.{ .vpmaskmovq, .rvm, &.{ .xmm, .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w1, .avx2 },
.{ .vpmaskmovq, .rvm, &.{ .ymm, .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w1, .avx2 },
.{ .vpmaskmovd, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_128_w0, .avx2 },
.{ .vpmaskmovd, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_256_w0, .avx2 },
.{ .vpmaskmovq, .mvr, &.{ .m128, .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_128_w1, .avx2 },
.{ .vpmaskmovq, .mvr, &.{ .m256, .ymm, .ymm }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_256_w1, .avx2 },
.{ .vpmaxsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_256_wig, .avx2 },
.{ .vpmaxsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xee }, 0, .vex_256_wig, .avx2 },
.{ .vpmaxsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_256_wig, .avx2 },
@ -2477,11 +2517,19 @@ pub const table = [_]Entry{
.{ .vpslldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 7, .vex_256_wig, .avx2 },
.{ .vpsllvd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_128_w0, .avx2 },
.{ .vpsllvq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_128_w1, .avx2 },
.{ .vpsllvd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_256_w0, .avx2 },
.{ .vpsllvq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_256_w1, .avx2 },
.{ .vpsraw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe1 }, 0, .vex_256_wig, .avx2 },
.{ .vpsraw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 4, .vex_256_wig, .avx2 },
.{ .vpsrad, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe2 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrad, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 4, .vex_256_wig, .avx2 },
.{ .vpsravd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x46 }, 0, .vex_128_w0, .avx2 },
.{ .vpsravd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x46 }, 0, .vex_256_w0, .avx2 },
.{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 },
.{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },
@ -2489,7 +2537,12 @@ pub const table = [_]Entry{
.{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 },
.{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 },
.{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_128_wig, .avx2 },
.{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_256_wig, .avx2 },
.{ .vpsrlvd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_128_w0, .avx2 },
.{ .vpsrlvq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_128_w1, .avx2 },
.{ .vpsrlvd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_256_w0, .avx2 },
.{ .vpsrlvq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_256_w1, .avx2 },
.{ .vpsubb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_256_wig, .avx2 },
.{ .vpsubw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_256_wig, .avx2 },

View File

@ -17962,6 +17962,78 @@ fn binary(comptime op: anytype, comptime opts: struct { compare: Compare = .rela
try testArgs(f128, nan(f128), nan(f128));
}
fn testIntVectors() !void {
try testArgs(@Vector(1, i4), .{
0x1,
}, .{
0x3,
});
try testArgs(@Vector(2, i4), .{
-0x1, 0x7,
}, .{
-0x7, 0x6,
});
try testArgs(@Vector(4, i4), .{
-0x1, 0x2, -0x3, -0x6,
}, .{
-0x2, -0x6, -0x4, 0x1,
});
try testArgs(@Vector(8, i4), .{
-0x4, 0x6, -0x4, -0x1, -0x1, 0x6, 0x5, 0x2,
}, .{
0x2, 0x4, -0x3, -0x6, 0x1, -0x5, -0x1, 0x2,
});
// workaround https://github.com/ziglang/zig/issues/22914
// TODO: try testArgs(@Vector(16, i4), .{
// 0x4, 0x1, -0x7, -0x2, -0x7, 0x4, -0x4, -0x8, -0x1, 0x0, -0x8, 0x5, -0x5, 0x3, 0x3, 0x2,
// }, .{
// 0x7, -0x7, -0x6, -0x1, 0x3, -0x5, -0x3, -0x6, 0x4, 0x4, -0x2, 0x7, -0x2, 0x6, -0x4, -0x1,
// });
try testArgs(@Vector(16, i4), .{
0x7, -0x7, -0x6, -0x1, 0x3, -0x5, -0x3, -0x6, 0x4, 0x4, -0x2, 0x7, -0x2, 0x6, -0x4, -0x1,
}, .{
0x4, 0x1, -0x7, -0x2, -0x7, 0x4, -0x4, -0x8, -0x1, 0x1, -0x8, 0x5, -0x5, 0x3, 0x3, 0x2,
});
try testArgs(@Vector(32, i4), .{
0x0, 0x4, 0x0, -0x6, -0x7, 0x4, -0x3, 0x4, -0x5, 0x2, 0x3, 0x2, -0x6, -0x4, -0x4, -0x3,
0x7, -0x5, -0x3, 0x2, -0x4, 0x4, -0x1, 0x6, -0x7, -0x1, -0x6, -0x2, -0x4, -0x2, 0x5, 0x0,
}, .{
0x5, 0x1, 0x5, 0x7, 0x1, -0x3, 0x3, 0x3, 0x5, 0x4, 0x1, 0x5, 0x4, -0x8, -0x3, -0x6,
-0x2, 0x3, 0x1, 0x2, 0x4, 0x4, -0x8, 0x2, 0x6, -0x1, 0x1, 0x3, -0x1, -0x3, 0x7, -0x7,
});
try testArgs(@Vector(1, u4), .{
0xe,
}, .{
0xc,
});
try testArgs(@Vector(2, u4), .{
0x2, 0x5,
}, .{
0x9, 0xe,
});
try testArgs(@Vector(4, u4), .{
0x2, 0xb, 0xc, 0x7,
}, .{
0x2, 0xa, 0x8, 0x1,
});
try testArgs(@Vector(8, u4), .{
0xf, 0x9, 0x0, 0x6, 0x8, 0x7, 0xd, 0x7,
}, .{
0xb, 0xb, 0x3, 0x6, 0x1, 0x5, 0x4, 0xd,
});
try testArgs(@Vector(16, u4), .{
0x5, 0x1, 0xa, 0x6, 0xb, 0x3, 0x0, 0x7, 0x8, 0x0, 0x9, 0xe, 0x2, 0x9, 0x2, 0x5,
}, .{
0x4, 0x9, 0x4, 0x8, 0x5, 0x7, 0xf, 0x8, 0x3, 0xc, 0x6, 0x9, 0xd, 0xd, 0x2, 0xd,
});
try testArgs(@Vector(32, u4), .{
0xa, 0x5, 0xd, 0x4, 0xe, 0xf, 0xf, 0x2, 0xb, 0x3, 0x9, 0x2, 0x1, 0x9, 0x6, 0x8,
0x7, 0xc, 0x3, 0x5, 0x4, 0xb, 0x5, 0x4, 0x8, 0x2, 0x5, 0x9, 0xf, 0x6, 0x7, 0x7,
}, .{
0xb, 0xf, 0xf, 0xf, 0xb, 0xf, 0xd, 0xc, 0x1, 0xa, 0x1, 0xd, 0x7, 0x4, 0x4, 0x8,
0x2, 0xb, 0xb, 0x4, 0xa, 0x7, 0x6, 0xd, 0xb, 0xb, 0x6, 0xb, 0x1, 0x8, 0xa, 0x6,
});
try testArgs(@Vector(1, i8), .{
-0x54,
}, .{
@ -19013,6 +19085,7 @@ inline fn mulUnsafe(comptime Type: type, lhs: Type, rhs: Type) DoubleBits(Type)
test mulUnsafe {
    // Instantiate the shared binary-operation harness for `mulUnsafe` with
    // default options, then run its integer and integer-vector cases in turn.
    const harness = binary(mulUnsafe, .{});
    try harness.testInts();
    try harness.testIntVectors();
}
inline fn multiply(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs * rhs) {