x86_64: rewrite unsafe int vector multiplication

2025-12-06 14:23:09 +00:00 · 2025-02-16 05:07:51 -05:00 · 2025-02-16 05:07:51 -05:00 · cec6867d76
commit cec6867d76
parent d7b93c7876
5 changed files with 5810 additions and 255 deletions
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
--- a/src/arch/x86_64/Encoding.zig
+++ b/src/arch/x86_64/Encoding.zig
@ -459,6 +459,7 @@ pub const Mnemonic = enum {
    vhaddpd, vhaddps,
    vinsertf128, vinsertps,
    vlddqu, vldmxcsr,
+    vmaskmovpd, vmaskmovps,
    vmaxpd, vmaxps, vmaxsd, vmaxss,
    vminpd, vminps, vminsd, vminss,
    vmovapd, vmovaps,
@ -481,6 +482,7 @@ pub const Mnemonic = enum {
    vpblendvb, vpblendw, vpclmulqdq,
    vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
    vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
+    vperm2f128, vpermilpd, vpermilps,
    vpextrb, vpextrd, vpextrq, vpextrw,
    vpinsrb, vpinsrd, vpinsrq, vpinsrw,
    vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
@ -521,6 +523,9 @@ pub const Mnemonic = enum {
    // AVX2
    vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
    vextracti128, vinserti128, vpblendd,
+    vperm2i128, vpermd, vpermpd, vpermps, vpermq,
+    vpmaskmovd, vpmaskmovq,
+    vpsllvd, vpsllvq, vpsravd, vpsrlvd, vpsrlvq,
    // ADX
    adcx, adox,
    // AESKLE
@ -997,7 +1002,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op
 }

 const mnemonic_to_encodings_map = init: {
-    @setEvalBranchQuota(5_600);
+    @setEvalBranchQuota(5_700);
    const mnemonic_count = @typeInfo(Mnemonic).@"enum".fields.len;
    var mnemonic_map: [mnemonic_count][]Data = @splat(&.{});
    const encodings = @import("encodings.zig");
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@ -756,6 +756,8 @@ pub const Inst = struct {
        /// Swap GS base register
        swapgs,
        /// Test condition
+        /// Logical compare
+        /// Packed bit test
        @"test",
        /// Undefined instruction
        ud,
@ -1053,6 +1055,7 @@ pub const Inst = struct {
        /// Blend scalar single-precision floating-point values
        /// Blend packed double-precision floating-point values
        /// Blend scalar double-precision floating-point values
+        /// Blend packed dwords
        blend,
        /// Variable blend packed single-precision floating-point values
        /// Variable blend scalar single-precision floating-point values
@ -1127,20 +1130,37 @@ pub const Inst = struct {
        sha256rnds,

        // AVX
+        /// Load with broadcast floating-point data
+        /// Load integer and broadcast
+        broadcast,
+        /// Conditional SIMD packed loads and stores
+        /// Conditional SIMD integer packed loads and stores
+        maskmov,
+        /// Permute floating-point values
+        /// Permute integer values
+        perm2,
+        /// Permute in-lane pairs of double-precision floating-point values
+        /// Permute in-lane quadruples of single-precision floating-point values
+        permil,
+
+        // BMI
        /// Bit field extract
        bextr,
        /// Extract lowest set isolated bit
        /// Get mask up to lowest set bit
        /// Reset lowest set bit
        bls,
-        /// Load with broadcast floating-point data
-        /// Load integer and broadcast
-        broadcast,
-        /// Zero high bits starting with specified bit position
-        bzhi,
        /// Count the number of trailing zero bits
        tzcnt,

+        // BMI2
+        /// Zero high bits starting with specified bit position
+        bzhi,
+        /// Parallel bits deposit
+        pdep,
+        /// Parallel bits extract
+        pext,
+
        // F16C
        /// Convert 16-bit floating-point values to single-precision floating-point values
        cvtph2,
@ -1164,6 +1184,19 @@ pub const Inst = struct {
        /// Fused multiply-add of scalar double-precision floating-point values
        fmadd231,

+        // AVX2
+        /// Permute packed doubleword elements
+        /// Permute packed qword elements
+        /// Permute double-precision floating-point elements
+        /// Permute single-precision floating-point elements
+        perm,
+        /// Variable bit shift left logical
+        sllv,
+        /// Variable bit shift right arithmetic
+        srav,
+        /// Variable bit shift right logical
+        srlv,
+
        // ADX
        /// Unsigned integer addition of two operands with overflow flag
        ado,
--- a/src/arch/x86_64/encodings.zig
+++ b/src/arch/x86_64/encodings.zig
@ -1932,6 +1932,15 @@ pub const table = [_]Entry{

    .{ .vldmxcsr, .m, &.{ .m32 }, &.{ 0x0f, 0xae }, 2, .vex_lz_wig, .avx },

+    .{ .vmaskmovps, .rvm, &.{ .xmm,  .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x2c }, 0, .vex_128_w0, .avx },
+    .{ .vmaskmovps, .rvm, &.{ .ymm,  .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x2c }, 0, .vex_256_w0, .avx },
+    .{ .vmaskmovpd, .rvm, &.{ .xmm,  .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x2d }, 0, .vex_128_w0, .avx },
+    .{ .vmaskmovpd, .rvm, &.{ .ymm,  .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x2d }, 0, .vex_256_w0, .avx },
+    .{ .vmaskmovps, .mvr, &.{ .m128, .xmm, .xmm  }, &.{ 0x66, 0x0f, 0x38, 0x2e }, 0, .vex_128_w0, .avx },
+    .{ .vmaskmovps, .mvr, &.{ .m256, .ymm, .ymm  }, &.{ 0x66, 0x0f, 0x38, 0x2e }, 0, .vex_256_w0, .avx },
+    .{ .vmaskmovpd, .mvr, &.{ .m128, .xmm, .xmm  }, &.{ 0x66, 0x0f, 0x38, 0x2f }, 0, .vex_128_w0, .avx },
+    .{ .vmaskmovpd, .mvr, &.{ .m256, .ymm, .ymm  }, &.{ 0x66, 0x0f, 0x38, 0x2f }, 0, .vex_256_w0, .avx },
+
    .{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
    .{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },

@ -2097,6 +2106,18 @@ pub const table = [_]Entry{

    .{ .vpcmpgtq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_128_wig, .avx },

+    .{ .vperm2f128, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x06 }, 0, .vex_256_w0, .avx },
+
+    .{ .vpermilpd, .rvm, &.{ .xmm, .xmm,      .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0d }, 0, .vex_128_w0, .avx },
+    .{ .vpermilpd, .rvm, &.{ .ymm, .ymm,      .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0d }, 0, .vex_256_w0, .avx },
+    .{ .vpermilpd, .rmi, &.{ .xmm, .xmm_m128, .imm8     }, &.{ 0x66, 0x0f, 0x3a, 0x05 }, 0, .vex_128_w0, .avx },
+    .{ .vpermilpd, .rmi, &.{ .ymm, .ymm_m256, .imm8     }, &.{ 0x66, 0x0f, 0x3a, 0x05 }, 0, .vex_256_w0, .avx },
+
+    .{ .vpermilps, .rvm, &.{ .xmm, .xmm,      .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x0c }, 0, .vex_128_w0, .avx },
+    .{ .vpermilps, .rmi, &.{ .xmm, .xmm_m128, .imm8     }, &.{ 0x66, 0x0f, 0x3a, 0x04 }, 0, .vex_128_w0, .avx },
+    .{ .vpermilps, .rvm, &.{ .ymm, .ymm,      .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x0c }, 0, .vex_256_w0, .avx },
+    .{ .vpermilps, .rmi, &.{ .ymm, .ymm_m256, .imm8     }, &.{ 0x66, 0x0f, 0x3a, 0x04 }, 0, .vex_256_w0, .avx },
+
    .{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
    .{ .vpextrd, .mri, &.{ .rm32,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
    .{ .vpextrq, .mri, &.{ .rm64,   .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
@ -2418,6 +2439,25 @@ pub const table = [_]Entry{

    .{ .vpcmpgtq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x37 }, 0, .vex_256_wig, .avx2 },

+    .{ .vperm2i128, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x46 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpermd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x36 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpermpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x01 }, 0, .vex_256_w1, .avx2 },
+
+    .{ .vpermps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x16 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpermq, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x00 }, 0, .vex_256_w1, .avx2 },
+
+    .{ .vpmaskmovd, .rvm, &.{ .xmm,  .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w0, .avx2 },
+    .{ .vpmaskmovd, .rvm, &.{ .ymm,  .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w0, .avx2 },
+    .{ .vpmaskmovq, .rvm, &.{ .xmm,  .xmm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_128_w1, .avx2 },
+    .{ .vpmaskmovq, .rvm, &.{ .ymm,  .ymm, .m256 }, &.{ 0x66, 0x0f, 0x38, 0x8c }, 0, .vex_256_w1, .avx2 },
+    .{ .vpmaskmovd, .mvr, &.{ .m128, .xmm, .xmm  }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_128_w0, .avx2 },
+    .{ .vpmaskmovd, .mvr, &.{ .m256, .ymm, .ymm  }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_256_w0, .avx2 },
+    .{ .vpmaskmovq, .mvr, &.{ .m128, .xmm, .xmm  }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_128_w1, .avx2 },
+    .{ .vpmaskmovq, .mvr, &.{ .m256, .ymm, .ymm  }, &.{ 0x66, 0x0f, 0x38, 0x8e }, 0, .vex_256_w1, .avx2 },
+
    .{ .vpmaxsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_256_wig, .avx2 },
    .{ .vpmaxsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f,       0xee }, 0, .vex_256_wig, .avx2 },
    .{ .vpmaxsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_256_wig, .avx2 },
@ -2477,11 +2517,19 @@ pub const table = [_]Entry{

    .{ .vpslldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 7, .vex_256_wig, .avx2 },

+    .{ .vpsllvd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpsllvq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_128_w1, .avx2 },
+    .{ .vpsllvd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_256_w0, .avx2 },
+    .{ .vpsllvq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x47 }, 0, .vex_256_w1, .avx2 },
+
    .{ .vpsraw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe1 }, 0, .vex_256_wig, .avx2 },
    .{ .vpsraw, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 4, .vex_256_wig, .avx2 },
    .{ .vpsrad, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe2 }, 0, .vex_256_wig, .avx2 },
    .{ .vpsrad, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x72 }, 4, .vex_256_wig, .avx2 },

+    .{ .vpsravd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x46 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpsravd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x46 }, 0, .vex_256_w0, .avx2 },
+
    .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
    .{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 },
    .{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },
@ -2489,7 +2537,12 @@ pub const table = [_]Entry{
    .{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 },
    .{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8     }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 },

-    .{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_128_wig, .avx2 },
+    .{ .vpsrldq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 3, .vex_256_wig, .avx2 },
+
+    .{ .vpsrlvd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpsrlvq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_128_w1, .avx2 },
+    .{ .vpsrlvd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_256_w0, .avx2 },
+    .{ .vpsrlvq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x45 }, 0, .vex_256_w1, .avx2 },

    .{ .vpsubb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_256_wig, .avx2 },
    .{ .vpsubw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_256_wig, .avx2 },
--- a/test/behavior/x86_64/math.zig
+++ b/test/behavior/x86_64/math.zig
@ -17962,6 +17962,78 @@ fn binary(comptime op: anytype, comptime opts: struct { compare: Compare = .rela
            try testArgs(f128, nan(f128), nan(f128));
        }
        fn testIntVectors() !void {
+            try testArgs(@Vector(1, i4), .{
+                0x1,
+            }, .{
+                0x3,
+            });
+            try testArgs(@Vector(2, i4), .{
+                -0x1, 0x7,
+            }, .{
+                -0x7, 0x6,
+            });
+            try testArgs(@Vector(4, i4), .{
+                -0x1, 0x2, -0x3, -0x6,
+            }, .{
+                -0x2, -0x6, -0x4, 0x1,
+            });
+            try testArgs(@Vector(8, i4), .{
+                -0x4, 0x6, -0x4, -0x1, -0x1, 0x6, 0x5, 0x2,
+            }, .{
+                0x2, 0x4, -0x3, -0x6, 0x1, -0x5, -0x1, 0x2,
+            });
+            // workaround https://github.com/ziglang/zig/issues/22914
+            // TODO: try testArgs(@Vector(16, i4), .{
+            //     0x4, 0x1, -0x7, -0x2, -0x7, 0x4, -0x4, -0x8, -0x1, 0x0, -0x8, 0x5, -0x5, 0x3, 0x3, 0x2,
+            // }, .{
+            //     0x7, -0x7, -0x6, -0x1, 0x3, -0x5, -0x3, -0x6, 0x4, 0x4, -0x2, 0x7, -0x2, 0x6, -0x4, -0x1,
+            // });
+            try testArgs(@Vector(16, i4), .{
+                0x7, -0x7, -0x6, -0x1, 0x3, -0x5, -0x3, -0x6, 0x4, 0x4, -0x2, 0x7, -0x2, 0x6, -0x4, -0x1,
+            }, .{
+                0x4, 0x1, -0x7, -0x2, -0x7, 0x4, -0x4, -0x8, -0x1, 0x1, -0x8, 0x5, -0x5, 0x3, 0x3, 0x2,
+            });
+            try testArgs(@Vector(32, i4), .{
+                0x0, 0x4,  0x0,  -0x6, -0x7, 0x4, -0x3, 0x4, -0x5, 0x2,  0x3,  0x2,  -0x6, -0x4, -0x4, -0x3,
+                0x7, -0x5, -0x3, 0x2,  -0x4, 0x4, -0x1, 0x6, -0x7, -0x1, -0x6, -0x2, -0x4, -0x2, 0x5,  0x0,
+            }, .{
+                0x5,  0x1, 0x5, 0x7, 0x1, -0x3, 0x3,  0x3, 0x5, 0x4,  0x1, 0x5, 0x4,  -0x8, -0x3, -0x6,
+                -0x2, 0x3, 0x1, 0x2, 0x4, 0x4,  -0x8, 0x2, 0x6, -0x1, 0x1, 0x3, -0x1, -0x3, 0x7,  -0x7,
+            });
+
+            try testArgs(@Vector(1, u4), .{
+                0xe,
+            }, .{
+                0xc,
+            });
+            try testArgs(@Vector(2, u4), .{
+                0x2, 0x5,
+            }, .{
+                0x9, 0xe,
+            });
+            try testArgs(@Vector(4, u4), .{
+                0x2, 0xb, 0xc, 0x7,
+            }, .{
+                0x2, 0xa, 0x8, 0x1,
+            });
+            try testArgs(@Vector(8, u4), .{
+                0xf, 0x9, 0x0, 0x6, 0x8, 0x7, 0xd, 0x7,
+            }, .{
+                0xb, 0xb, 0x3, 0x6, 0x1, 0x5, 0x4, 0xd,
+            });
+            try testArgs(@Vector(16, u4), .{
+                0x5, 0x1, 0xa, 0x6, 0xb, 0x3, 0x0, 0x7, 0x8, 0x0, 0x9, 0xe, 0x2, 0x9, 0x2, 0x5,
+            }, .{
+                0x4, 0x9, 0x4, 0x8, 0x5, 0x7, 0xf, 0x8, 0x3, 0xc, 0x6, 0x9, 0xd, 0xd, 0x2, 0xd,
+            });
+            try testArgs(@Vector(32, u4), .{
+                0xa, 0x5, 0xd, 0x4, 0xe, 0xf, 0xf, 0x2, 0xb, 0x3, 0x9, 0x2, 0x1, 0x9, 0x6, 0x8,
+                0x7, 0xc, 0x3, 0x5, 0x4, 0xb, 0x5, 0x4, 0x8, 0x2, 0x5, 0x9, 0xf, 0x6, 0x7, 0x7,
+            }, .{
+                0xb, 0xf, 0xf, 0xf, 0xb, 0xf, 0xd, 0xc, 0x1, 0xa, 0x1, 0xd, 0x7, 0x4, 0x4, 0x8,
+                0x2, 0xb, 0xb, 0x4, 0xa, 0x7, 0x6, 0xd, 0xb, 0xb, 0x6, 0xb, 0x1, 0x8, 0xa, 0x6,
+            });
+
            try testArgs(@Vector(1, i8), .{
                -0x54,
            }, .{
@ -19013,6 +19085,7 @@ inline fn mulUnsafe(comptime Type: type, lhs: Type, rhs: Type) DoubleBits(Type)
 test mulUnsafe {
    const test_mul_unsafe = binary(mulUnsafe, .{});
    try test_mul_unsafe.testInts();
+    try test_mul_unsafe.testIntVectors();
 }

 inline fn multiply(comptime Type: type, lhs: Type, rhs: Type) @TypeOf(lhs * rhs) {