From 5a6b6992d887ffee1ea64b4f778c5c07ecbaf270 Mon Sep 17 00:00:00 2001 From: Luuk de Gram Date: Mon, 5 Dec 2022 21:25:32 +0100 Subject: [PATCH 1/6] std: Add Wasm SIMD opcodes and value type This adds the opcodes for both the simd128 and relaxed-simd features. Those instructions are required by the self-hosted WebAssembly backend. Additionally, this also adds the new `v128` Valtype which is required to represent a 128bit simd value. SIMD values that do not have exactly 128 bits will be represented differently. --- lib/std/wasm.zig | 272 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) diff --git a/lib/std/wasm.zig b/lib/std/wasm.zig index 12026f1858..6601e92973 100644 --- a/lib/std/wasm.zig +++ b/lib/std/wasm.zig @@ -237,6 +237,277 @@ pub const PrefixedOpcode = enum(u8) { _, }; +/// Simd opcodes that require a prefix `0xFD`. +/// Each opcode represents a varuint32, meaning +/// they are encoded as leb128 in binary. +pub const SimdOpcode = enum(u32) { + v128_load = 0x00, + v128_load8x8_s = 0x01, + v128_load8x8_u = 0x02, + v128_load16x4_s = 0x03, + v128_load16x4_u = 0x04, + v128_load32x2_s = 0x05, + v128_load32x2_u = 0x06, + v128_load8_splat = 0x07, + v128_load16_splat = 0x08, + v128_load32_splat = 0x09, + v128_load64_splat = 0x0A, + v128_store = 0x0B, + v128_const = 0x0C, + i8x16_shuffle = 0x0D, + i8x16_swizzle = 0x0E, + @"8x16_splat" = 0x0F, + i16x8_splat = 0x10, + i32x4_splat = 0x11, + i64x2_splat = 0x12, + f32x4_splat = 0x13, + f64x2_splat = 0x14, + @"8x16_extract_lane_s" = 0x15, + i8x16_extract_lane_u = 0x16, + i8x16_replace_lane = 0x17, + i16x8_extract_lane_s = 0x18, + i16x8_extract_lane_u = 0x19, + i16x8_replace_lane = 0x1A, + i32x4_extract_lane = 0x1B, + i32x4_replace_lane = 0x1C, + i64x2_extract_lane = 0x1D, + i64x2_replace_lane = 0x1E, + f32x4_extract_lane = 0x1F, + f32x4_replace_lane = 0x20, + f64x2_extract_lane = 0x21, + f64x2_replace_lane = 0x22, + i8x16_eq = 0x23, + i16x8_eq = 0x2D, + i32x4_eq = 0x37, + i8x16_ne = 0x24, + i16x8_ne = 0x2E, + i32x4_ne = 0x38, + i8x16_lt_s = 0x25, + i16x8_lt_s = 0x2F, + i32x4_lt_s = 0x39, + i8x16_lt_u = 0x26, + i16x8_lt_u = 0x30, + i32x4_lt_u = 0x3A, + i8x16_gt_s = 0x27, + i16x8_gt_s = 0x31, + i32x4_gt_s = 0x3B, + i8x16_gt_u = 0x28, + i16x8_gt_u = 0x32, + i32x4_gt_u = 0x3C, + i8x16_le_s = 0x29, + i16x8_le_s = 0x33, + i32x4_le_s = 0x3D, + i8x16_le_u = 0x2A, + i16x8_le_u = 0x34, + i32x4_le_u = 0x3E, + i8x16_ge_s = 0x2B, + i16x8_ge_s = 0x35, + i32x4_ge_s = 0x3F, + i8x16_ge_u = 0x2C, + i16x8_ge_u = 0x36, + i32x4_ge_u = 0x40, + f32x4_eq = 0x41, + f64x2_eq = 0x47, + f32x4_ne = 0x42, + f64x2_ne = 0x48, + f32x4_lt = 0x43, + f64x2_lt = 0x49, + f32x4_gt = 0x44, + f64x2_gt = 0x4A, + f32x4_le = 0x45, + f64x2_le = 0x4B, + f32x4_ge = 0x46, + f64x2_ge = 0x4C, + v128_not = 0x4D, + v128_and = 0x4E, + v128_andnot = 0x4F, + v128_or = 0x50, + v128_xor = 0x51, + v128_bitselect = 0x52, + v128_any_true = 0x53, + v128_load8_lane = 0x54, + v128_load16_lane = 0x55, + v128_load32_lane = 0x56, + v128_load64_lane = 0x57, + v128_store8_lane = 0x58, + v128_store16_lane = 0x59, + v128_store32_lane = 0x5A, + v128_store64_lane = 0x5B, + v128_load32_zero = 0x5C, + v128_load64_zero = 0x5D, + f32x4_demote_f64x2_zero = 0x5E, + f64x2_promote_low_f32x4 = 0x5F, + i8x16_abs = 0x60, + i16x8_abs = 0x80, + i32x4_abs = 0xA0, + i64x2_abs = 0xC0, + i8x16_neg = 0x61, + i16x8_neg = 0x81, + i32x4_neg = 0xA1, + i64x2_neg = 0xC1, + i8x16_popcnt = 0x62, + i16x8_q15mulr_sat_s = 0x82, + i8x16_all_true = 0x63, + i16x8_all_true = 0x83, + i32x4_all_true = 0xA3, + 
i64x2_all_true = 0xC3, + i8x16_bitmask = 0x64, + i16x8_bitmask = 0x84, + i32x4_bitmask = 0xA4, + i64x2_bitmask = 0xC4, + i8x16_narrow_i16x8_s = 0x65, + i16x8_narrow_i32x4_s = 0x85, + i8x16_narrow_i16x8_u = 0x66, + i16x8_narrow_i32x4_u = 0x86, + f32x4_ceil = 0x67, + i16x8_extend_low_i8x16_s = 0x87, + i32x4_extend_low_i16x8_s = 0xA7, + i64x2_extend_low_i32x4_s = 0xC7, + f32x4_floor = 0x68, + i16x8_extend_high_i8x16_s = 0x88, + i32x4_extend_high_i16x8_s = 0xA8, + i64x2_extend_high_i32x4_s = 0xC8, + f32x4_trunc = 0x69, + i16x8_extend_low_i8x16_u = 0x89, + i32x4_extend_low_i16x8_u = 0xA9, + i64x2_extend_low_i32x4_u = 0xC9, + f32x4_nearest = 0x6A, + i16x8_extend_high_i8x16_u = 0x8A, + i32x4_extend_high_i16x8_u = 0xAA, + i64x2_extend_high_i32x4_u = 0xCA, + i8x16_shl = 0x6B, + i16x8_shl = 0x8B, + i32x4_shl = 0xAB, + i64x2_shl = 0xCB, + i8x16_shr_s = 0x6C, + i16x8_shr_s = 0x8C, + i32x4_shr_s = 0xAC, + i64x2_shr_s = 0xCC, + i8x16_shr_u = 0x6D, + i16x8_shr_u = 0x8D, + i32x4_shr_u = 0xAD, + i64x2_shr_u = 0xCD, + i8x16_add = 0x6E, + i16x8_add = 0x8E, + i32x4_add = 0xAE, + i64x2_add = 0xCE, + i8x16_add_sat_s = 0x6F, + i16x8_add_sat_s = 0x8F, + i8x16_add_sat_u = 0x70, + i16x8_add_sat_u = 0x90, + i8x16_sub = 0x71, + i16x8_sub = 0x91, + i32x4_sub = 0xB1, + i64x2_sub = 0xD1, + i8x16_sub_sat_s = 0x72, + i16x8_sub_sat_s = 0x92, + i8x16_sub_sat_u = 0x73, + i16x8_sub_sat_u = 0x93, + f64x2_ceil = 0x74, + f64x2_nearest = 0x94, + f64x2_floor = 0x75, + i16x8_mul = 0x95, + i32x4_mul = 0xB5, + i64x2_mul = 0xD5, + i8x16_min_s = 0x76, + i16x8_min_s = 0x96, + i32x4_min_s = 0xB6, + i64x2_eq = 0xD6, + i8x16_min_u = 0x77, + i16x8_min_u = 0x97, + i32x4_min_u = 0xB7, + i64x2_ne = 0xD7, + i8x16_max_s = 0x78, + i16x8_max_s = 0x98, + i32x4_max_s = 0xB8, + i64x2_lt_s = 0xD8, + i8x16_max_u = 0x79, + i16x8_max_u = 0x99, + i32x4_max_u = 0xB9, + i64x2_gt_s = 0xD9, + f64x2_trunc = 0x7A, + i32x4_dot_i16x8_s = 0xBA, + i64x2_le_s = 0xDA, + i8x16_avgr_u = 0x7B, + i16x8_avgr_u = 0x9B, + i64x2_ge_s = 0xDB, + i16x8_extadd_pairwise_i8x16_s = 0x7C, + i16x8_extmul_low_i8x16_s = 0x9C, + i32x4_extmul_low_i16x8_s = 0xBC, + i64x2_extmul_low_i32x4_s = 0xDC, + i16x8_extadd_pairwise_i8x16_u = 0x7D, + i16x8_extmul_high_i8x16_s = 0x9D, + i32x4_extmul_high_i16x8_s = 0xBD, + i64x2_extmul_high_i32x4_s = 0xDD, + i32x4_extadd_pairwise_i16x8_s = 0x7E, + i16x8_extmul_low_i8x16_u = 0x9E, + i32x4_extmul_low_i16x8_u = 0xBE, + i64x2_extmul_low_i32x4_u = 0xDE, + i32x4_extadd_pairwise_i16x8_u = 0x7F, + i16x8_extmul_high_i8x16_u = 0x9F, + i32x4_extmul_high_i16x8_u = 0xBF, + i64x2_extmul_high_i32x4_u = 0xDF, + f32x4_abs = 0xE0, + f64x2_abs = 0xEC, + f32x4_neg = 0xE1, + f64x2_neg = 0xED, + f32x4_sqrt = 0xE3, + f64x2_sqrt = 0xEF, + f32x4_add = 0xE4, + f64x2_add = 0xF0, + f32x4_sub = 0xE5, + f64x2_sub = 0xF1, + f32x4_mul = 0xE6, + f64x2_mul = 0xF2, + f32x4_div = 0xE7, + f64x2_div = 0xF3, + f32x4_min = 0xE8, + f64x2_min = 0xF4, + f32x4_max = 0xE9, + f64x2_max = 0xF5, + f32x4_pmin = 0xEA, + f64x2_pmin = 0xF6, + f32x4_pmax = 0xEB, + f64x2_pmax = 0xF7, + i32x4_trunc_sat_f32x4_s = 0xF8, + i32x4_trunc_sat_f32x4_u = 0xF9, + f32x4_convert_i32x4_s = 0xFA, + f32x4_convert_i32x4_u = 0xFB, + i32x4_trunc_sat_f64x2_s_zero = 0xFC, + i32x4_trunc_sat_f64x2_u_zero = 0xFD, + f64x2_convert_low_i32x4_s = 0xFE, + f64x2_convert_low_i32x4_u = 0xFF, + + // relaxed-simd opcodes + i8x16_relaxed_swizzle = 0x100, + i32x4_relaxed_trunc_f32x4_s = 0x101, + i32x4_relaxed_trunc_f32x4_u = 0x102, + i32x4_relaxed_trunc_f64x2_s_zero = 0x103, + i32x4_relaxed_trunc_f64x2_u_zero = 0x104, + f32x4_relaxed_madd = 
0x105, + f32x4_relaxed_nmadd = 0x106, + f64x2_relaxed_madd = 0x107, + f64x2_relaxed_nmadd = 0x108, + i8x16_relaxed_laneselect = 0x109, + i16x8_relaxed_laneselect = 0x10a, + i32x4_relaxed_laneselect = 0x10b, + i64x2_relaxed_laneselect = 0x10c, + f32x4_relaxed_min = 0x10d, + f32x4_relaxed_max = 0x10e, + f64x2_relaxed_min = 0x10f, + f64x2_relaxed_max = 0x110, + i16x8_relaxed_q15mulr_s = 0x111, + i16x8_relaxed_dot_i8x16_i7x16_s = 0x112, + i32x4_relaxed_dot_i8x16_i7x16_add_s = 0x113, + f32x4_relaxed_dot_bf16x8_add_f32x4 = 0x114, +}; + +/// Returns the integer value of a `SimdOpcode`. Used by the Zig compiler +/// to write instructions to the wasm binary file. +pub fn simdOpcode(op: SimdOpcode) u32 { + return @enumToInt(op); +} + /// Enum representing all Wasm value types as per spec: /// https://webassembly.github.io/spec/core/binary/types.html pub const Valtype = enum(u8) { @@ -244,6 +515,7 @@ i64 = 0x7E, f32 = 0x7D, f64 = 0x7C, + v128 = 0x7B, }; /// Returns the integer value of a `Valtype` From 63b69e2c55ae0b41915897f15636f5f6dc4689a5 Mon Sep 17 00:00:00 2001 From: Luuk de Gram Date: Tue, 6 Dec 2022 19:19:52 +0100 Subject: [PATCH 2/6] wasm: load+store simd immediate values This implements loading and storing immediate values representing a vector with exactly 128 bits. When the vector is not exactly 128 bits, or when both the simd128 and relaxed-simd features are disabled, the value is treated as an array instead. --- src/arch/wasm/CodeGen.zig | 172 ++++++++++++++++++++++++++++++++++---- src/arch/wasm/Emit.zig | 27 +++++- src/arch/wasm/Mir.zig | 8 +- 3 files changed, 189 insertions(+), 18 deletions(-) diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index faed432a38..0164b17c0f 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -43,6 +43,10 @@ const WValue = union(enum) { imm32: u32, /// An immediate 64bit value imm64: u64, + /// Index into the list of simd128 immediates. This `WValue` is + /// only possible in very rare cases, so it would be + /// a waste of memory to store the value in a 128 bit integer. + imm128: u32, /// A constant 32bit float value float32: f32, /// A constant 64bit float value @@ -116,6 +120,7 @@ const WValue = union(enum) { .i64 => gen.free_locals_i64.append(gen.gpa, local_value) catch return, .f32 => gen.free_locals_f32.append(gen.gpa, local_value) catch return, .f64 => gen.free_locals_f64.append(gen.gpa, local_value) catch return, + .v128 => gen.free_locals_v128.append(gen.gpa, local_value) catch return, } value.* = undefined; } @@ -258,18 +263,18 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { 8 => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_load8_s else return .i32_load8_u, .i64 => if (args.signedness.? == .signed) return .i64_load8_s else return .i64_load8_u, - .f32, .f64 => unreachable, + .f32, .f64, .v128 => unreachable, }, 16 => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_load16_s else return .i32_load16_u, .i64 => if (args.signedness.? == .signed) return .i64_load16_s else return .i64_load16_u, - .f32, .f64 => unreachable, + .f32, .f64, .v128 => unreachable, }, 32 => switch (args.valtype1.?) { .i64 => if (args.signedness.? == .signed) return .i64_load32_s else return .i64_load32_u, .i32 => return .i32_load, .f32 => return .f32_load, - .f64 => unreachable, + .f64, .v128 => unreachable, }, 64 => switch (args.valtype1.?)
{ .i64 => return .i64_load, @@ -282,24 +287,25 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => return .i64_load, .f32 => return .f32_load, .f64 => return .f64_load, + .v128 => unreachable, // handled independently }, .store => if (args.width) |width| { switch (width) { 8 => switch (args.valtype1.?) { .i32 => return .i32_store8, .i64 => return .i64_store8, - .f32, .f64 => unreachable, + .f32, .f64, .v128 => unreachable, }, 16 => switch (args.valtype1.?) { .i32 => return .i32_store16, .i64 => return .i64_store16, - .f32, .f64 => unreachable, + .f32, .f64, .v128 => unreachable, }, 32 => switch (args.valtype1.?) { .i64 => return .i64_store32, .i32 => return .i32_store, .f32 => return .f32_store, - .f64 => unreachable, + .f64, .v128 => unreachable, }, 64 => switch (args.valtype1.?) { .i64 => return .i64_store, @@ -314,6 +320,7 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => return .i64_store, .f32 => return .f32_store, .f64 => return .f64_store, + .v128 => unreachable, // handled independently } }, @@ -325,24 +332,27 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => return .i64_const, .f32 => return .f32_const, .f64 => return .f64_const, + .v128 => unreachable, // handled independently }, .eqz => switch (args.valtype1.?) { .i32 => return .i32_eqz, .i64 => return .i64_eqz, - .f32, .f64 => unreachable, + .f32, .f64, .v128 => unreachable, }, .eq => switch (args.valtype1.?) { .i32 => return .i32_eq, .i64 => return .i64_eq, .f32 => return .f32_eq, .f64 => return .f64_eq, + .v128 => unreachable, // handled independently }, .ne => switch (args.valtype1.?) { .i32 => return .i32_ne, .i64 => return .i64_ne, .f32 => return .f32_ne, .f64 => return .f64_ne, + .v128 => unreachable, // handled independently }, .lt => switch (args.valtype1.?) { @@ -350,40 +360,47 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => if (args.signedness.? == .signed) return .i64_lt_s else return .i64_lt_u, .f32 => return .f32_lt, .f64 => return .f64_lt, + .v128 => unreachable, // handled independently }, .gt => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_gt_s else return .i32_gt_u, .i64 => if (args.signedness.? == .signed) return .i64_gt_s else return .i64_gt_u, .f32 => return .f32_gt, .f64 => return .f64_gt, + .v128 => unreachable, // handled independently }, .le => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_le_s else return .i32_le_u, .i64 => if (args.signedness.? == .signed) return .i64_le_s else return .i64_le_u, .f32 => return .f32_le, .f64 => return .f64_le, + .v128 => unreachable, // handled independently }, .ge => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_ge_s else return .i32_ge_u, .i64 => if (args.signedness.? == .signed) return .i64_ge_s else return .i64_ge_u, .f32 => return .f32_ge, .f64 => return .f64_ge, + .v128 => unreachable, // handled independently }, .clz => switch (args.valtype1.?) { .i32 => return .i32_clz, .i64 => return .i64_clz, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .ctz => switch (args.valtype1.?) { .i32 => return .i32_ctz, .i64 => return .i64_ctz, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .popcnt => switch (args.valtype1.?) { .i32 => return .i32_popcnt, .i64 => return .i64_popcnt, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .add => switch (args.valtype1.?) 
{ @@ -391,18 +408,21 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => return .i64_add, .f32 => return .f32_add, .f64 => return .f64_add, + .v128 => unreachable, // handled independently }, .sub => switch (args.valtype1.?) { .i32 => return .i32_sub, .i64 => return .i64_sub, .f32 => return .f32_sub, .f64 => return .f64_sub, + .v128 => unreachable, // handled independently }, .mul => switch (args.valtype1.?) { .i32 => return .i32_mul, .i64 => return .i64_mul, .f32 => return .f32_mul, .f64 => return .f64_mul, + .v128 => unreachable, // handled independently }, .div => switch (args.valtype1.?) { @@ -410,71 +430,84 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => if (args.signedness.? == .signed) return .i64_div_s else return .i64_div_u, .f32 => return .f32_div, .f64 => return .f64_div, + .v128 => unreachable, // handled independently }, .rem => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_rem_s else return .i32_rem_u, .i64 => if (args.signedness.? == .signed) return .i64_rem_s else return .i64_rem_u, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .@"and" => switch (args.valtype1.?) { .i32 => return .i32_and, .i64 => return .i64_and, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .@"or" => switch (args.valtype1.?) { .i32 => return .i32_or, .i64 => return .i64_or, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .xor => switch (args.valtype1.?) { .i32 => return .i32_xor, .i64 => return .i64_xor, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .shl => switch (args.valtype1.?) { .i32 => return .i32_shl, .i64 => return .i64_shl, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .shr => switch (args.valtype1.?) { .i32 => if (args.signedness.? == .signed) return .i32_shr_s else return .i32_shr_u, .i64 => if (args.signedness.? == .signed) return .i64_shr_s else return .i64_shr_u, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .rotl => switch (args.valtype1.?) { .i32 => return .i32_rotl, .i64 => return .i64_rotl, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .rotr => switch (args.valtype1.?) { .i32 => return .i32_rotr, .i64 => return .i64_rotr, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .abs => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_abs, .f64 => return .f64_abs, + .v128 => unreachable, // handled independently }, .neg => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_neg, .f64 => return .f64_neg, + .v128 => unreachable, // handled independently }, .ceil => switch (args.valtype1.?) { .i64 => unreachable, .i32 => return .f32_ceil, // when valtype is f16, we store it in i32. .f32 => return .f32_ceil, .f64 => return .f64_ceil, + .v128 => unreachable, // handled independently }, .floor => switch (args.valtype1.?) { .i64 => unreachable, .i32 => return .f32_floor, // when valtype is f16, we store it in i32. .f32 => return .f32_floor, .f64 => return .f64_floor, + .v128 => unreachable, // handled independently }, .trunc => switch (args.valtype1.?) { .i32 => if (args.valtype2) |valty| switch (valty) { @@ -482,40 +515,48 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => unreachable, .f32 => if (args.signedness.? == .signed) return .i32_trunc_f32_s else return .i32_trunc_f32_u, .f64 => if (args.signedness.? 
== .signed) return .i32_trunc_f64_s else return .i32_trunc_f64_u, + .v128 => unreachable, // handled independently } else return .f32_trunc, // when no valtype2, it's an f16 instead which is stored in an i32. .i64 => switch (args.valtype2.?) { .i32 => unreachable, .i64 => unreachable, .f32 => if (args.signedness.? == .signed) return .i64_trunc_f32_s else return .i64_trunc_f32_u, .f64 => if (args.signedness.? == .signed) return .i64_trunc_f64_s else return .i64_trunc_f64_u, + .v128 => unreachable, // handled independently }, .f32 => return .f32_trunc, .f64 => return .f64_trunc, + .v128 => unreachable, // handled independently }, .nearest => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_nearest, .f64 => return .f64_nearest, + .v128 => unreachable, // handled independently }, .sqrt => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_sqrt, .f64 => return .f64_sqrt, + .v128 => unreachable, // handled independently }, .min => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_min, .f64 => return .f64_min, + .v128 => unreachable, // handled independently }, .max => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_max, .f64 => return .f64_max, + .v128 => unreachable, // handled independently }, .copysign => switch (args.valtype1.?) { .i32, .i64 => unreachable, .f32 => return .f32_copysign, .f64 => return .f64_copysign, + .v128 => unreachable, // handled independently }, .wrap => switch (args.valtype1.?) { @@ -523,8 +564,10 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i32 => unreachable, .i64 => return .i32_wrap_i64, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .i64, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .convert => switch (args.valtype1.?) { .i32, .i64 => unreachable, @@ -532,12 +575,15 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i32 => if (args.signedness.? == .signed) return .f32_convert_i32_s else return .f32_convert_i32_u, .i64 => if (args.signedness.? == .signed) return .f32_convert_i64_s else return .f32_convert_i64_u, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, .f64 => switch (args.valtype2.?) { .i32 => if (args.signedness.? == .signed) return .f64_convert_i32_s else return .f64_convert_i32_u, .i64 => if (args.signedness.? == .signed) return .f64_convert_i64_s else return .f64_convert_i64_u, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, + .v128 => unreachable, // handled independently }, .demote => if (args.valtype1.? == .f32 and args.valtype2.? == .f64) return .f32_demote_f64 else unreachable, .promote => if (args.valtype1.? == .f64 and args.valtype2.? == .f32) return .f64_promote_f32 else unreachable, @@ -546,6 +592,7 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { .i64 => if (args.valtype2.? == .f64) return .i64_reinterpret_f64 else unreachable, .f32 => if (args.valtype2.? == .i32) return .f32_reinterpret_i32 else unreachable, .f64 => if (args.valtype2.? == .i64) return .f64_reinterpret_i64 else unreachable, + .v128 => unreachable, // handled independently }, .extend => switch (args.valtype1.?) { .i32 => switch (args.width.?) 
{ @@ -560,6 +607,7 @@ fn buildOpcode(args: OpcodeBuildArguments) wasm.Opcode { else => unreachable, }, .f32, .f64 => unreachable, + .v128 => unreachable, // handled independently }, } } @@ -629,6 +677,10 @@ err_msg: *Module.ErrorMsg, /// List of all locals' types generated throughout this declaration /// used to emit locals count at start of 'code' section. locals: std.ArrayListUnmanaged(u8), +/// List of simd128 immediates. Each value is stored as an array of bytes. +/// This list is only populated for 128bit-simd values when the required +/// target features are enabled. +simd_immediates: std.ArrayListUnmanaged([16]u8) = .{}, /// The Target we're emitting (used to call intInfo) target: std.Target, /// Represents the wasm binary file that is being linked. @@ -665,14 +717,17 @@ stack_alignment: u32 = 16, /// It is illegal to store a non-i32 valtype in this list. free_locals_i32: std.ArrayListUnmanaged(u32) = .{}, /// A list of indexes which represents a local of valtype `i64`. -/// It is illegal to store a non-i32 valtype in this list. +/// It is illegal to store a non-i64 valtype in this list. free_locals_i64: std.ArrayListUnmanaged(u32) = .{}, /// A list of indexes which represents a local of valtype `f32`. -/// It is illegal to store a non-i32 valtype in this list. +/// It is illegal to store a non-f32 valtype in this list. free_locals_f32: std.ArrayListUnmanaged(u32) = .{}, /// A list of indexes which represents a local of valtype `f64`. -/// It is illegal to store a non-i32 valtype in this list. +/// It is illegal to store a non-f64 valtype in this list. free_locals_f64: std.ArrayListUnmanaged(u32) = .{}, +/// A list of indexes which represents a local of valtype `v128`. +/// It is illegal to store a non-v128 valtype in this list. +free_locals_v128: std.ArrayListUnmanaged(u32) = .{}, /// When in debug mode, this tracks if no `finishAir` was missed.
/// Forgetting to call `finishAir` will cause the result to not be @@ -699,12 +754,14 @@ pub fn deinit(func: *CodeGen) void { func.branches.deinit(func.gpa); func.blocks.deinit(func.gpa); func.locals.deinit(func.gpa); + func.simd_immediates.deinit(func.gpa); func.mir_instructions.deinit(func.gpa); func.mir_extra.deinit(func.gpa); func.free_locals_i32.deinit(func.gpa); func.free_locals_i64.deinit(func.gpa); func.free_locals_f32.deinit(func.gpa); func.free_locals_f64.deinit(func.gpa); + func.free_locals_v128.deinit(func.gpa); func.* = undefined; } @@ -867,6 +924,17 @@ fn addImm64(func: *CodeGen, imm: u64) error{OutOfMemory}!void { try func.addInst(.{ .tag = .i64_const, .data = .{ .payload = extra_index } }); } +/// Accepts the index into the list of 128bit-immediates +fn addImm128(func: *CodeGen, index: u32) error{OutOfMemory}!void { + const simd_values = func.simd_immediates.items[index]; + const extra_index = @intCast(u32, func.mir_extra.items.len); + // tag + 128bit value + try func.mir_extra.ensureUnusedCapacity(func.gpa, 5); + func.mir_extra.appendAssumeCapacity(std.wasm.simdOpcode(.v128_const)); + func.mir_extra.appendSliceAssumeCapacity(@alignCast(4, mem.bytesAsSlice(u32, &simd_values))); + try func.addInst(.{ .tag = .simd, .data = .{ .payload = extra_index } }); +} + fn addFloat64(func: *CodeGen, float: f64) error{OutOfMemory}!void { const extra_index = try func.addExtra(Mir.Float64.fromFloat64(float)); try func.addInst(.{ .tag = .f64_const, .data = .{ .payload = extra_index } }); @@ -924,6 +992,10 @@ fn typeToValtype(ty: Type, target: std.Target) wasm.Valtype { }, else => wasm.Valtype.i32, }, + .Vector => switch (determineSimdStoreStrategy(ty, target)) { + .direct => wasm.Valtype.v128, + .unrolled => wasm.Valtype.i32, + }, else => wasm.Valtype.i32, // all represented as reference/immediate }; } @@ -950,6 +1022,7 @@ fn emitWValue(func: *CodeGen, value: WValue) InnerError!void { .local => |idx| try func.addLabel(.local_get, idx.value), .imm32 => |val| try func.addImm32(@bitCast(i32, val)), .imm64 => |val| try func.addImm64(val), + .imm128 => |val| try func.addImm128(val), .float32 => |val| try func.addInst(.{ .tag = .f32_const, .data = .{ .float32 = val } }), .float64 => |val| try func.addFloat64(val), .memory => |ptr| { @@ -1016,6 +1089,10 @@ fn allocLocal(func: *CodeGen, ty: Type) InnerError!WValue { log.debug("reusing local ({d}) of type {}\n", .{ index, valtype }); return WValue{ .local = .{ .value = index, .references = 1 } }; }, + .v128 => if (func.free_locals_v128.popOrNull()) |index| { + log.debug("reusing local ({d}) of type {}\n", .{ index, valtype }); + return WValue{ .local = .{ .value = index, .references = 1 } }; + }, } log.debug("new local of type {}\n", .{valtype}); // no local was free to be re-used, so allocate a new local instead @@ -1098,7 +1175,6 @@ pub fn generate( .gpa = bin_file.allocator, .air = air, .liveness = liveness, - // .values = .{}, .code = code, .decl_index = func.owner_decl, .decl = bin_file.options.module.?.declPtr(func.owner_decl), @@ -1481,9 +1557,9 @@ fn memcpy(func: *CodeGen, dst: WValue, src: WValue, len: WValue) !void { .imm64 => |val| val, else => unreachable, }; - // if the size (length) is more than 1024 bytes, we use a runtime loop instead to prevent + // if the size (length) is more than 32 bytes, we use a runtime loop instead to prevent // binary size bloat. 
- if (length > 1024) break :blk; + if (length > 32) break :blk; var offset: u32 = 0; const lhs_base = dst.offset(); const rhs_base = src.offset(); @@ -1612,7 +1688,6 @@ fn isByRef(ty: Type, target: std.Target) bool { => return false, .Array, - .Vector, .Frame, .Union, => return ty.hasRuntimeBitsIgnoreComptime(), @@ -1625,6 +1700,7 @@ fn isByRef(ty: Type, target: std.Target) bool { } return ty.hasRuntimeBitsIgnoreComptime(); }, + .Vector => return determineSimdStoreStrategy(ty, target) == .unrolled, .Int => return ty.intInfo(target).bits > 64, .Float => return ty.floatBits(target) > 64, .ErrorUnion => { @@ -1647,6 +1723,26 @@ fn isByRef(ty: Type, target: std.Target) bool { } } +const SimdStoreStrategy = enum { + direct, + unrolled, +}; + +/// For a given vector type, returns the `SimdStoreStrategy`. +/// This means when a given type is 128 bits and either the simd128 or relaxed-simd +/// features are enabled, the function will return `.direct`. This allows the value +/// to be stored with a single instruction, rather than an unrolled version. +fn determineSimdStoreStrategy(ty: Type, target: std.Target) SimdStoreStrategy { + std.debug.assert(ty.zigTypeTag() == .Vector); + if (ty.bitSize(target) != 128) return .unrolled; + const hasFeature = std.Target.wasm.featureSetHas; + const features = target.cpu.features; + if (hasFeature(features, .relaxed_simd) or hasFeature(features, .simd128)) { + return .direct; + } + return .unrolled; +} + /// Creates a new local for a pointer that points to memory with given offset. /// This can be used to get a pointer to a struct field, error payload, etc. /// By providing `modify` as action, it will modify the given `ptr_value` instead of making a new @@ -2187,10 +2283,29 @@ fn store(func: *CodeGen, lhs: WValue, rhs: WValue, ty: Type, offset: u32) InnerE const len = @intCast(u32, ty.abiSize(func.target)); return func.memcpy(lhs, rhs, .{ .imm32 = len }); }, - .Struct, .Array, .Union, .Vector => if (isByRef(ty, func.target)) { const len = @intCast(u32, ty.abiSize(func.target)); return func.memcpy(lhs, rhs, .{ .imm32 = len }); }, + .Struct, .Array, .Union => if (isByRef(ty, func.target)) { + const len = @intCast(u32, ty.abiSize(func.target)); + return func.memcpy(lhs, rhs, .{ .imm32 = len }); + }, + .Vector => switch (determineSimdStoreStrategy(ty, func.target)) { + .unrolled => { + const len = @intCast(u32, ty.abiSize(func.target)); + return func.memcpy(lhs, rhs, .{ .imm32 = len }); + }, + .direct => { + try func.emitWValue(lhs); + try func.lowerToStack(rhs); + // TODO: Add helper functions for simd opcodes + const extra_index = @intCast(u32, func.mir_extra.items.len); + // stores as := opcode, offset, alignment (opcode::memarg) + try func.mir_extra.appendSlice(func.gpa, &[_]u32{ + std.wasm.simdOpcode(.v128_store), + offset + lhs.offset(), + ty.abiAlignment(func.target), + }); + return func.addInst(.{ .tag = .simd, .data = .{ .payload = extra_index } }); + }, + }, .Pointer => { if (ty.isSlice()) { // store pointer first @@ -2289,6 +2404,19 @@ fn load(func: *CodeGen, operand: WValue, ty: Type, offset: u32) InnerError!WValu // load local's value from memory by its stack position try func.emitWValue(operand); + if (ty.zigTypeTag() == .Vector) { + // TODO: Add helper functions for simd opcodes + const extra_index = @intCast(u32, func.mir_extra.items.len); + // stores as := opcode, offset, alignment (opcode::memarg) + try func.mir_extra.appendSlice(func.gpa, &[_]u32{ + std.wasm.simdOpcode(.v128_load), + offset + operand.offset(), + ty.abiAlignment(func.target), + }); + try func.addInst(.{ .tag = .simd, .data = .{ .payload = extra_index } }); + return WValue{ .stack = {} }; + } + const
abi_size = @intCast(u8, ty.abiSize(func.target)); const opcode = buildOpcode(.{ .valtype1 = typeToValtype(ty, func.target), @@ -2766,10 +2894,24 @@ fn lowerConstant(func: *CodeGen, arg_val: Value, ty: Type) InnerError!WValue { const int_val = Value.initPayload(&payload.base); return func.lowerConstant(int_val, struct_obj.backing_int_ty); }, + .Vector => { + assert(determineSimdStoreStrategy(ty, target) == .direct); + var buf: [16]u8 = undefined; + val.writeToMemory(ty, func.bin_file.base.options.module.?, &buf); + return func.storeSimdImmd(buf); + }, else => |zig_type| return func.fail("Wasm TODO: LowerConstant for zigTypeTag {}", .{zig_type}), } } +/// Stores the value as a 128bit-immediate value by storing it inside +/// the list and returning the index into this list as `WValue`. +fn storeSimdImmd(func: *CodeGen, value: [16]u8) !WValue { + const index = @intCast(u32, func.simd_immediates.items.len); + try func.simd_immediates.append(func.gpa, value); + return WValue{ .imm128 = index }; +} + fn emitUndefined(func: *CodeGen, ty: Type) InnerError!WValue { switch (ty.zigTypeTag()) { .Bool, .ErrorSet => return WValue{ .imm32 = 0xaaaaaaaa }, diff --git a/src/arch/wasm/Emit.zig b/src/arch/wasm/Emit.zig index 8203865aca..ae371458ff 100644 --- a/src/arch/wasm/Emit.zig +++ b/src/arch/wasm/Emit.zig @@ -240,6 +240,7 @@ pub fn emitMir(emit: *Emit) InnerError!void { .i64_ctz => try emit.emitTag(tag), .extended => try emit.emitExtended(inst), + .simd => try emit.emitSimd(inst), } } } @@ -341,11 +342,14 @@ fn emitMemArg(emit: *Emit, tag: Mir.Inst.Tag, inst: Mir.Inst.Index) !void { const extra_index = emit.mir.instructions.items(.data)[inst].payload; const mem_arg = emit.mir.extraData(Mir.MemArg, extra_index).data; try emit.code.append(@enumToInt(tag)); + try encodeMemArg(mem_arg, emit.code.writer()); +} +fn encodeMemArg(mem_arg: Mir.MemArg, writer: anytype) !void { // wasm encodes alignment as power of 2, rather than natural alignment const encoded_alignment = @ctz(mem_arg.alignment); - try leb128.writeULEB128(emit.code.writer(), encoded_alignment); - try leb128.writeULEB128(emit.code.writer(), mem_arg.offset); + try leb128.writeULEB128(writer, encoded_alignment); + try leb128.writeULEB128(writer, mem_arg.offset); } fn emitCall(emit: *Emit, inst: Mir.Inst.Index) !void { @@ -426,6 +430,25 @@ fn emitExtended(emit: *Emit, inst: Mir.Inst.Index) !void { } } +fn emitSimd(emit: *Emit, inst: Mir.Inst.Index) !void { + const extra_index = emit.mir.instructions.items(.data)[inst].payload; + const opcode = emit.mir.extra[extra_index]; + const writer = emit.code.writer(); + try emit.code.append(0xFD); + try leb128.writeULEB128(writer, opcode); + switch (@intToEnum(std.wasm.SimdOpcode, opcode)) { + .v128_store, .v128_load => { + const mem_arg = emit.mir.extraData(Mir.MemArg, extra_index + 1).data; + try encodeMemArg(mem_arg, writer); + }, + .v128_const => { + const simd_value = emit.mir.extra[extra_index + 1 ..][0..4]; + try writer.writeAll(std.mem.asBytes(simd_value)); + }, + else => |tag| return emit.fail("TODO: Implement simd instruction: {s}\n", .{@tagName(tag)}), + } +} + fn emitMemFill(emit: *Emit) !void { try emit.code.append(0xFC); try emit.code.append(0x0B); diff --git a/src/arch/wasm/Mir.zig b/src/arch/wasm/Mir.zig index 6cf43c1e03..0f33dd9350 100644 --- a/src/arch/wasm/Mir.zig +++ b/src/arch/wasm/Mir.zig @@ -518,6 +518,12 @@ pub const Inst = struct { /// /// The `data` field depends on the extension instruction extended = 0xFC, + /// The instruction consists of a simd opcode. 
+ /// The actual simd opcode is found at the payload's index. + /// + /// The `data` field depends on the simd instruction and + /// may contain additional data. + simd = 0xFD, /// Contains a symbol to a function pointer /// uses `label` /// @@ -578,7 +584,7 @@ pub fn deinit(self: *Mir, gpa: std.mem.Allocator) void { self.* = undefined; } -pub fn extraData(self: Mir, comptime T: type, index: usize) struct { data: T, end: usize } { +pub fn extraData(self: *const Mir, comptime T: type, index: usize) struct { data: T, end: usize } { const fields = std.meta.fields(T); var i: usize = index; var result: T = undefined; From db06eed7a3e741a319182b2e4edc889b83787962 Mon Sep 17 00:00:00 2001 From: Luuk de Gram Date: Tue, 6 Dec 2022 20:08:18 +0100 Subject: [PATCH 3/6] codegen: implement generating vector values --- src/codegen.zig | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/codegen.zig b/src/codegen.zig index 985fa8d6c5..bc50f36041 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -808,6 +808,54 @@ pub fn generateSymbol( } return Result{ .appended = {} }; }, + .Vector => switch (typed_value.val.tag()) { + .bytes => { + const bytes = typed_value.val.castTag(.bytes).?.data; + const len = @intCast(usize, typed_value.ty.arrayLen()); + try code.ensureUnusedCapacity(len); + code.appendSliceAssumeCapacity(bytes[0..len]); + return Result{ .appended = {} }; + }, + .aggregate => { + const elem_vals = typed_value.val.castTag(.aggregate).?.data; + const elem_ty = typed_value.ty.elemType(); + const len = @intCast(usize, typed_value.ty.arrayLen()); + for (elem_vals[0..len]) |elem_val| { + switch (try generateSymbol(bin_file, src_loc, .{ + .ty = elem_ty, + .val = elem_val, + }, code, debug_output, reloc_info)) { + .appended => {}, + .externally_managed => |slice| { + code.appendSliceAssumeCapacity(slice); + }, + .fail => |em| return Result{ .fail = em }, + } + } + return Result{ .appended = {} }; + }, + .repeated => { + const array = typed_value.val.castTag(.repeated).?.data; + const elem_ty = typed_value.ty.childType(); + const len = typed_value.ty.arrayLen(); + + var index: u64 = 0; + while (index < len) : (index += 1) { + switch (try generateSymbol(bin_file, src_loc, .{ + .ty = elem_ty, + .val = array, + }, code, debug_output, reloc_info)) { + .appended => {}, + .externally_managed => |slice| { + code.appendSliceAssumeCapacity(slice); + }, + .fail => |em| return Result{ .fail = em }, + } + } + return Result{ .appended = {} }; + }, + else => unreachable, + }, else => |t| { return Result{ .fail = try ErrorMsg.create( From c6d654f73bbe80ed3653be6a31ddcaa4772a4fe2 Mon Sep 17 00:00:00 2001 From: Luuk de Gram Date: Thu, 8 Dec 2022 21:18:11 +0100 Subject: [PATCH 4/6] wasm: implement the 'splat' instruction part 1 This implements `airSplat` for the native WebAssembly backend when the features 'simd128' or 'relaxed-simd' are enabled. The commit supports splat where the value lives in the linear memory segment, as well as on the stack. This saves a significant number of instructions. When it detects the element type is not 8, 16, 32 or 64 bits, the backend will instead use the same strategy as if the features were disabled.
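As an illustration of the lowering this enables (a sketch, not part of the patch; it assumes a wasm32 target with the simd128 feature enabled and uses the Zig 0.10-era two-argument `@splat`):

    const std = @import("std");

    test "128-bit @splat lowers to a single simd instruction" {
        // 4 x f32 is exactly 128 bits, so determineSimdStoreStrategy picks
        // .direct and airSplat emits one splat opcode instead of a store loop.
        var scalar: f32 = 3.5; // runtime-known, forcing the backend to lower it
        const vec: @Vector(4, f32) = @splat(4, scalar);
        try std.testing.expectEqual(@as(f32, 3.5), vec[3]);
    }

With the operand in a local this selects f32x4.splat; if the operand lived in linear memory, the load would be folded into v128.load32_splat instead.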
--- lib/std/wasm.zig | 4 +-- src/arch/wasm/CodeGen.zig | 51 +++++++++++++++++++++++++++++++++++++-- src/arch/wasm/Emit.zig | 15 +++++++++++- 3 files changed, 65 insertions(+), 5 deletions(-) diff --git a/lib/std/wasm.zig b/lib/std/wasm.zig index 6601e92973..0e61b3fb30 100644 --- a/lib/std/wasm.zig +++ b/lib/std/wasm.zig @@ -256,13 +256,13 @@ pub const SimdOpcode = enum(u32) { v128_const = 0x0C, i8x16_shuffle = 0x0D, i8x16_swizzle = 0x0E, - @"8x16_splat" = 0x0F, + i8x16_splat = 0x0F, i16x8_splat = 0x10, i32x4_splat = 0x11, i64x2_splat = 0x12, f32x4_splat = 0x13, f64x2_splat = 0x14, - @"8x16_extract_lane_s" = 0x15, + i8x16_extract_lane_s = 0x15, i8x16_extract_lane_u = 0x16, i8x16_replace_lane = 0x17, i16x8_extract_lane_s = 0x18, diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 0164b17c0f..5e69860fbc 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -4430,9 +4430,56 @@ fn airIntToFloat(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { fn airSplat(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { const ty_op = func.air.instructions.items(.data)[inst].ty_op; const operand = try func.resolveInst(ty_op.operand); + const ty = func.air.typeOfIndex(inst); + const elem_ty = ty.childType(); - _ = operand; - return func.fail("TODO: Implement wasm airSplat", .{}); + const result = try func.allocLocal(ty); + if (determineSimdStoreStrategy(ty, func.target) == .direct) blk: { + switch (operand) { + // when the operand lives in the linear memory section, we can directly + // load and splat the value at once. Meaning we do not first have to load + // the scalar value onto the stack. + .stack_offset, .memory, .memory_offset => { + const opcode = switch (elem_ty.bitSize(func.target)) { + 8 => std.wasm.simdOpcode(.v128_load8_splat), + 16 => std.wasm.simdOpcode(.v128_load16_splat), + 32 => std.wasm.simdOpcode(.v128_load32_splat), + 64 => std.wasm.simdOpcode(.v128_load64_splat), + else => break :blk, // Cannot make use of simd-instructions + }; + try func.emitWValue(operand); + // TODO: Add helper functions for simd opcodes + const extra_index = @intCast(u32, func.mir_extra.items.len); + // stores as := opcode, offset, alignment (opcode::memarg) + try func.mir_extra.appendSlice(func.gpa, &[_]u32{ + opcode, + operand.offset(), + elem_ty.abiAlignment(func.target), + }); + try func.addInst(.{ .tag = .simd, .data = .{ .payload = extra_index } }); + try func.addLabel(.local_set, result.local.value); + return func.finishAir(inst, result, &.{ty_op.operand}); + }, + .local => { + const opcode = switch (elem_ty.bitSize(func.target)) { + 8 => std.wasm.simdOpcode(.i8x16_splat), + 16 => std.wasm.simdOpcode(.i16x8_splat), + 32 => if (elem_ty.isInt()) std.wasm.simdOpcode(.i32x4_splat) else std.wasm.simdOpcode(.f32x4_splat), + 64 => if (elem_ty.isInt()) std.wasm.simdOpcode(.i64x2_splat) else std.wasm.simdOpcode(.f64x2_splat), + else => break :blk, // Cannot make use of simd-instructions + }; + try func.emitWValue(operand); + const extra_index = @intCast(u32, func.mir_extra.items.len); + try func.mir_extra.append(func.gpa, opcode); + try func.addInst(.{ .tag = .simd, .data = .{ .payload = extra_index } }); + try func.addLabel(.local_set, result.local.value); + return func.finishAir(inst, result, &.{ty_op.operand}); + }, + else => unreachable, + } + } + + return func.fail("TODO: Implement wasm airSplat unrolled", .{}); } fn airSelect(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { diff --git a/src/arch/wasm/Emit.zig b/src/arch/wasm/Emit.zig index 
ae371458ff..45e02e0986 100644 --- a/src/arch/wasm/Emit.zig +++ b/src/arch/wasm/Emit.zig @@ -437,7 +437,13 @@ fn emitSimd(emit: *Emit, inst: Mir.Inst.Index) !void { try emit.code.append(0xFD); try leb128.writeULEB128(writer, opcode); switch (@intToEnum(std.wasm.SimdOpcode, opcode)) { - .v128_store, .v128_load => { + .v128_store, + .v128_load, + .v128_load8_splat, + .v128_load16_splat, + .v128_load32_splat, + .v128_load64_splat, + => { const mem_arg = emit.mir.extraData(Mir.MemArg, extra_index + 1).data; try encodeMemArg(mem_arg, writer); }, @@ -445,6 +451,13 @@ fn emitSimd(emit: *Emit, inst: Mir.Inst.Index) !void { const simd_value = emit.mir.extra[extra_index + 1 ..][0..4]; try writer.writeAll(std.mem.asBytes(simd_value)); }, + .i8x16_splat, + .i16x8_splat, + .i32x4_splat, + .i64x2_splat, + .f32x4_splat, + .f64x2_splat, + => {}, // opcode already written else => |tag| return emit.fail("TODO: Implement simd instruction: {s}\n", .{@tagName(tag)}), } } From 355b5929b2c49422b92e7aa7374c7e4f0008f400 Mon Sep 17 00:00:00 2001 From: Luuk de Gram Date: Sun, 11 Dec 2022 15:18:33 +0100 Subject: [PATCH 5/6] wasm: `splat` for vector elements divisible by 8 This implements `@splat` for vectors where the element's bit size is a power of two and divisible by 8. This is fairly simple as we can store the values directly within the virtual stack. For all other sizes, we must first shift and bitwise-or the values so they are packed like a packed struct, rather than an array, before we can store them. --- src/arch/wasm/CodeGen.zig | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 5e69860fbc..f6c380aeb3 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -4433,7 +4433,6 @@ fn airSplat(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { const ty_op = func.air.instructions.items(.data)[inst].ty_op; const operand = try func.resolveInst(ty_op.operand); const ty = func.air.typeOfIndex(inst); const elem_ty = ty.childType(); - const result = try func.allocLocal(ty); if (determineSimdStoreStrategy(ty, func.target) == .direct) blk: { switch (operand) { @@ -4447,6 +4446,7 @@ fn airSplat(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { 64 => std.wasm.simdOpcode(.v128_load64_splat), else => break :blk, // Cannot make use of simd-instructions }; + const result = try func.allocLocal(ty); try func.emitWValue(operand); // TODO: Add helper functions for simd opcodes const extra_index = @intCast(u32, func.mir_extra.items.len); @@ -4468,6 +4468,7 @@ fn airSplat(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { 64 => if (elem_ty.isInt()) std.wasm.simdOpcode(.i64x2_splat) else std.wasm.simdOpcode(.f64x2_splat), else => break :blk, // Cannot make use of simd-instructions }; + const result = try func.allocLocal(ty); try func.emitWValue(operand); const extra_index = @intCast(u32, func.mir_extra.items.len); try func.mir_extra.append(func.gpa, opcode); @@ -4478,8 +4479,22 @@ fn airSplat(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { else => unreachable, } } + const elem_size = elem_ty.bitSize(func.target); + const vector_len = @intCast(usize, ty.vectorLen()); + if ((!std.math.isPowerOfTwo(elem_size) or elem_size % 8 != 0) and vector_len > 1) { + return func.fail("TODO: WebAssembly `@splat` for arbitrary element bitsize {d}", .{elem_size}); + } - return func.fail("TODO: Implement wasm airSplat unrolled", .{}); + const result = try func.allocStack(ty); + const elem_byte_size = @intCast(u32, elem_ty.abiSize(func.target)); + var index: usize = 0; +
var offset: u32 = 0; + while (index < vector_len) : (index += 1) { + try func.store(result, operand, elem_ty, offset); + offset += elem_byte_size; + } + + return func.finishAir(inst, result, &.{ty_op.operand}); } fn airSelect(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { From 37561a920b6dad1231f0b9e5a69eb0978af6f5d0 Mon Sep 17 00:00:00 2001 From: Luuk de Gram Date: Mon, 12 Dec 2022 13:35:05 +0100 Subject: [PATCH 6/6] wasm: enable passing vector tests --- test/behavior/vector.zig | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/behavior/vector.zig b/test/behavior/vector.zig index 687267d656..9bcc5f9b9e 100644 --- a/test/behavior/vector.zig +++ b/test/behavior/vector.zig @@ -138,7 +138,6 @@ test "vector bit operators" { } test "implicit cast vector to array" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO @@ -157,7 +156,6 @@ test "implicit cast vector to array" { } test "array to vector" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO @@ -235,7 +233,6 @@ test "vector casts of sizes not divisible by 8" { } test "vector @splat" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO @@ -284,7 +281,6 @@ test "vector @splat" { } test "load vector elements via comptime index" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO @@ -307,7 +303,6 @@ test "load vector elements via comptime index" { } test "store vector elements via comptime index" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO @@ -336,7 +331,6 @@ test "store vector elements via comptime index" { } test "load vector elements via runtime index" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO @@ -359,7 +353,6 @@ test "load vector elements via runtime index" { } test "store vector elements via runtime index" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return 
error.SkipZigTest; // TODO @@ -383,7 +376,6 @@ test "store vector elements via runtime index" { } test "initialize vector which is a struct field" { - if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
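A closing note on the byte-level encoding these patches emit: a prefixed SIMD instruction is written as the 0xFD prefix byte, then the opcode as a LEB128-encoded varuint32, then any immediates (a memarg for the loads and stores, nothing for the plain splats, a 16-byte literal for v128.const). A minimal sketch of that layout, assuming the `SimdOpcode` enum and `simdOpcode` helper from patch 1 are available in `std.wasm`:

    const std = @import("std");

    test "prefixed simd instruction encoding" {
        var code = std.ArrayList(u8).init(std.testing.allocator);
        defer code.deinit();
        const writer = code.writer();

        // Mirrors Emit.zig's emitSimd: prefix byte, LEB128 opcode, immediates.
        try writer.writeByte(0xFD);
        try std.leb.writeULEB128(writer, std.wasm.simdOpcode(.v128_const));
        const imm = [_]u8{0xAA} ** 16; // the 128-bit immediate operand
        try writer.writeAll(&imm);

        // v128_const (0x0C) fits in one LEB byte: prefix + opcode + 16 bytes.
        try std.testing.expectEqual(@as(usize, 18), code.items.len);
        try std.testing.expectEqual(@as(u8, 0x0C), code.items[1]);
    }

The varuint32 encoding is why `SimdOpcode` is an `enum(u32)` rather than an `enum(u8)`: the relaxed-simd opcodes start at 0x100 and take two LEB bytes.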