Merge pull request #18906 from jacobly0/x86_64-tests

x86_64: pass more tests
2026-02-21 16:54:52 +00:00 · 2024-02-25 21:43:20 -08:00 · 2024-02-25 21:43:20 -08:00 · 91fb211faa
commit 91fb211faa
parent d656c2a7ab 4fcc750ba5
24 changed files with 2224 additions and 651 deletions
--- a/lib/std/crypto/aes.zig
+++ b/lib/std/crypto/aes.zig
@ -6,7 +6,7 @@ const has_aesni = std.Target.x86.featureSetHas(builtin.cpu.features, .aes);
 const has_avx = std.Target.x86.featureSetHas(builtin.cpu.features, .avx);
 const has_armaes = std.Target.aarch64.featureSetHas(builtin.cpu.features, .aes);
 // C backend doesn't currently support passing vectors to inline asm.
-const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and has_aesni and has_avx) impl: {
+const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and has_aesni and has_avx) impl: {
    break :impl @import("aes/aesni.zig");
 } else if (builtin.cpu.arch == .aarch64 and builtin.zig_backend != .stage2_c and has_armaes)
 impl: {
--- a/lib/std/crypto/blake3.zig
+++ b/lib/std/crypto/blake3.zig
@ -200,7 +200,7 @@ const CompressGeneric = struct {
    }
 };

-const compress = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_x86_64)
+const compress = if (builtin.cpu.arch == .x86_64)
    CompressVectorized.compress
 else
    CompressGeneric.compress;
--- a/lib/std/crypto/salsa20.zig
+++ b/lib/std/crypto/salsa20.zig
@ -302,7 +302,10 @@ fn SalsaNonVecImpl(comptime rounds: comptime_int) type {
    };
 }

-const SalsaImpl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_x86_64) SalsaVecImpl else SalsaNonVecImpl;
+const SalsaImpl = if (builtin.cpu.arch == .x86_64)
+    SalsaVecImpl
+else
+    SalsaNonVecImpl;

 fn keyToWords(key: [32]u8) [8]u32 {
    var k: [8]u32 = undefined;
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@ -238,7 +238,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                        return;
                    },
                    // C backend doesn't currently support passing vectors to inline asm.
-                    .x86_64 => if (builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
+                    .x86_64 => if (builtin.zig_backend != .stage2_c and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
                        var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
                        var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
                        const s_v = @as(*[16]v4u32, @ptrCast(&s));
--- a/lib/std/meta.zig
+++ b/lib/std/meta.zig
@ -1286,5 +1286,6 @@ test "hasUniqueRepresentation" {
    try testing.expect(!hasUniqueRepresentation([]u8));
    try testing.expect(!hasUniqueRepresentation([]const u8));

-    try testing.expect(hasUniqueRepresentation(@Vector(4, u16)));
+    try testing.expect(hasUniqueRepresentation(@Vector(std.simd.suggestVectorLength(u8) orelse 1, u8)));
+    try testing.expect(@sizeOf(@Vector(3, u8)) == 3 or !hasUniqueRepresentation(@Vector(3, u8)));
 }
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@ -239,18 +239,19 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
 fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
    var remaining = input;

-    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
-    const Chunk = @Vector(chunk_len, u8);
+    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
+        const Chunk = @Vector(chunk_len, u8);

-    // Fast path. Check for and skip ASCII characters at the start of the input.
-    while (remaining.len >= chunk_len) {
-        const chunk: Chunk = remaining[0..chunk_len].*;
-        const mask: Chunk = @splat(0x80);
-        if (@reduce(.Or, chunk & mask == mask)) {
-            // found a non ASCII byte
-            break;
+        // Fast path. Check for and skip ASCII characters at the start of the input.
+        while (remaining.len >= chunk_len) {
+            const chunk: Chunk = remaining[0..chunk_len].*;
+            const mask: Chunk = @splat(0x80);
+            if (@reduce(.Or, chunk & mask == mask)) {
+                // found a non ASCII byte
+                break;
+            }
+            remaining = remaining[chunk_len..];
        }
-        remaining = remaining[chunk_len..];
    }

    // default lowest and highest continuation byte
@ -601,9 +602,9 @@ fn testUtf8IteratorOnAscii() !void {
    const s = Utf8View.initComptime("abc");

    var it1 = s.iterator();
-    try testing.expect(std.mem.eql(u8, "a", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "b", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "c", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "a", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "b", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "c", it1.nextCodepointSlice().?));
    try testing.expect(it1.nextCodepointSlice() == null);

    var it2 = s.iterator();
@ -631,9 +632,9 @@ fn testUtf8ViewOk() !void {
    const s = Utf8View.initComptime("東京市");

    var it1 = s.iterator();
-    try testing.expect(std.mem.eql(u8, "東", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "京", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "市", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "東", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "京", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "市", it1.nextCodepointSlice().?));
    try testing.expect(it1.nextCodepointSlice() == null);

    var it2 = s.iterator();
@ -771,20 +772,20 @@ fn testUtf8Peeking() !void {
    const s = Utf8View.initComptime("noël");
    var it = s.iterator();

-    try testing.expect(std.mem.eql(u8, "n", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "n", it.nextCodepointSlice().?));

-    try testing.expect(std.mem.eql(u8, "o", it.peek(1)));
-    try testing.expect(std.mem.eql(u8, "oë", it.peek(2)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(3)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(4)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(10)));
+    try testing.expect(mem.eql(u8, "o", it.peek(1)));
+    try testing.expect(mem.eql(u8, "oë", it.peek(2)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(3)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(4)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(10)));

-    try testing.expect(std.mem.eql(u8, "o", it.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "ë", it.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "l", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "o", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "ë", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "l", it.nextCodepointSlice().?));
    try testing.expect(it.nextCodepointSlice() == null);

-    try testing.expect(std.mem.eql(u8, &[_]u8{}, it.peek(1)));
+    try testing.expect(mem.eql(u8, &[_]u8{}, it.peek(1)));
 }

 fn testError(bytes: []const u8, expected_err: anyerror) !void {
@ -926,59 +927,50 @@ test "fmtUtf8" {
 }

 fn utf16LeToUtf8ArrayListImpl(
-    array_list: *std.ArrayList(u8),
+    result: *std.ArrayList(u8),
    utf16le: []const u16,
    comptime surrogates: Surrogates,
 ) (switch (surrogates) {
    .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
    .can_encode_surrogate_half => mem.Allocator.Error,
 })!void {
-    // optimistically guess that it will all be ascii.
-    try array_list.ensureTotalCapacityPrecise(utf16le.len);
+    assert(result.capacity >= utf16le.len);

    var remaining = utf16le;
-    if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
        const Chunk = @Vector(chunk_len, u16);

        // Fast path. Check for and encode ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
-            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
+            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
            if (@reduce(.Or, chunk | mask != mask)) {
                // found a non ASCII code unit
                break;
            }
-            const chunk_byte_len = chunk_len * 2;
-            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
-            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
-            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
+            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
            // We allocated enough space to encode every UTF-16 code unit
            // as ASCII, so if the entire string is ASCII then we are
            // guaranteed to have enough space allocated
-            array_list.appendSliceAssumeCapacity(&ascii_bytes);
+            result.addManyAsArrayAssumeCapacity(chunk_len).* = ascii_chunk;
            remaining = remaining[chunk_len..];
        }
    }

-    var out_index: usize = array_list.items.len;
    switch (surrogates) {
        .cannot_encode_surrogate_half => {
            var it = Utf16LeIterator.init(remaining);
            while (try it.nextCodepoint()) |codepoint| {
                const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
-                try array_list.resize(array_list.items.len + utf8_len);
-                assert((utf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
-                out_index += utf8_len;
+                assert((utf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
            }
        },
        .can_encode_surrogate_half => {
            var it = Wtf16LeIterator.init(remaining);
            while (it.nextCodepoint()) |codepoint| {
                const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
-                try array_list.resize(array_list.items.len + utf8_len);
-                assert((wtf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
-                out_index += utf8_len;
+                assert((wtf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
            }
        },
    }
@ -986,8 +978,9 @@ fn utf16LeToUtf8ArrayListImpl(

 pub const Utf16LeToUtf8AllocError = mem.Allocator.Error || Utf16LeToUtf8Error;

-pub fn utf16LeToUtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
-    return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .cannot_encode_surrogate_half);
+pub fn utf16LeToUtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
+    try result.ensureTotalCapacityPrecise(utf16le.len);
+    return utf16LeToUtf8ArrayListImpl(result, utf16le, .cannot_encode_surrogate_half);
 }

 /// Deprecated; renamed to utf16LeToUtf8Alloc
@ -999,8 +992,7 @@ pub fn utf16LeToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) Utf16L
    var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);
    errdefer result.deinit();

-    try utf16LeToUtf8ArrayList(&result, utf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
    return result.toOwnedSlice();
 }

@ -1013,8 +1005,7 @@ pub fn utf16LeToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) Utf16
    var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);
    errdefer result.deinit();

-    try utf16LeToUtf8ArrayList(&result, utf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
    return result.toOwnedSliceSentinel(0);
 }

@ -1026,27 +1017,24 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
    .cannot_encode_surrogate_half => Utf16LeToUtf8Error,
    .can_encode_surrogate_half => error{},
 })!usize {
-    var end_index: usize = 0;
+    var dest_index: usize = 0;

    var remaining = utf16le;
-    if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
        const Chunk = @Vector(chunk_len, u16);

        // Fast path. Check for and encode ASCII characters at the start of the input.
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
-            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
+            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
            if (@reduce(.Or, chunk | mask != mask)) {
                // found a non ASCII code unit
                break;
            }
-            const chunk_byte_len = chunk_len * 2;
-            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
-            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
-            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
-            @memcpy(utf8[end_index .. end_index + chunk_len], &ascii_bytes);
-            end_index += chunk_len;
+            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
+            utf8[dest_index..][0..chunk_len].* = ascii_chunk;
+            dest_index += chunk_len;
            remaining = remaining[chunk_len..];
        }
    }
@ -1055,7 +1043,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
        .cannot_encode_surrogate_half => {
            var it = Utf16LeIterator.init(remaining);
            while (try it.nextCodepoint()) |codepoint| {
-                end_index += utf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {
+                dest_index += utf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
                    // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
                    // which is within the valid codepoint range.
                    error.CodepointTooLarge => unreachable,
@ -1068,7 +1056,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
        .can_encode_surrogate_half => {
            var it = Wtf16LeIterator.init(remaining);
            while (it.nextCodepoint()) |codepoint| {
-                end_index += wtf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {
+                dest_index += wtf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
                    // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
                    // which is within the valid codepoint range.
                    error.CodepointTooLarge => unreachable,
@ -1076,7 +1064,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
            }
        },
    }
-    return end_index;
+    return dest_index;
 }

 /// Deprecated; renamed to utf16LeToUtf8
@ -1149,14 +1137,12 @@ test utf16LeToUtf8 {
    }
 }

-fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
-    // optimistically guess that it will not require surrogate pairs
-    try array_list.ensureTotalCapacityPrecise(utf8.len);
+fn utf8ToUtf16LeArrayListImpl(result: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
+    assert(result.capacity >= utf8.len);

    var remaining = utf8;
-    // Need support for std.simd.interlace
-    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
        const Chunk = @Vector(chunk_len, u8);

        // Fast path. Check for and encode ASCII characters at the start of the input.
@ -1167,9 +1153,8 @@ fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8,
                // found a non ASCII code unit
                break;
            }
-            const zeroes: Chunk = @splat(0);
-            const utf16_chunk: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
-            array_list.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk));
+            const utf16_chunk = mem.nativeToLittle(@Vector(chunk_len, u16), chunk);
+            result.addManyAsArrayAssumeCapacity(chunk_len).* = utf16_chunk;
            remaining = remaining[chunk_len..];
        }
    }
@ -1181,21 +1166,18 @@ fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8,
    var it = view.iterator();
    while (it.nextCodepoint()) |codepoint| {
        if (codepoint < 0x10000) {
-            const short = @as(u16, @intCast(codepoint));
-            try array_list.append(mem.nativeToLittle(u16, short));
+            try result.append(mem.nativeToLittle(u16, @intCast(codepoint)));
        } else {
            const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
-            var out: [2]u16 = undefined;
-            out[0] = mem.nativeToLittle(u16, high);
-            out[1] = mem.nativeToLittle(u16, low);
-            try array_list.appendSlice(out[0..]);
+            try result.appendSlice(&.{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) });
        }
    }
 }

-pub fn utf8ToUtf16LeArrayList(array_list: *std.ArrayList(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
-    return utf8ToUtf16LeArrayListImpl(array_list, utf8, .cannot_encode_surrogate_half);
+pub fn utf8ToUtf16LeArrayList(result: *std.ArrayList(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
+    try result.ensureTotalCapacityPrecise(utf8.len);
+    return utf8ToUtf16LeArrayListImpl(result, utf8, .cannot_encode_surrogate_half);
 }

 pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![]u16 {
@ -1204,7 +1186,6 @@ pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) error{ Inv
    errdefer result.deinit();

    try utf8ToUtf16LeArrayListImpl(&result, utf8, .cannot_encode_surrogate_half);
-
    return result.toOwnedSlice();
 }

@ -1217,7 +1198,6 @@ pub fn utf8ToUtf16LeAllocZ(allocator: mem.Allocator, utf8: []const u8) error{ In
    errdefer result.deinit();

    try utf8ToUtf16LeArrayListImpl(&result, utf8, .cannot_encode_surrogate_half);
-
    return result.toOwnedSliceSentinel(0);
 }

@ -1228,12 +1208,11 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) error{InvalidUtf8}!usize
 }

 pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: Surrogates) !usize {
-    var dest_i: usize = 0;
+    var dest_index: usize = 0;

    var remaining = utf8;
-    // Need support for std.simd.interlace
-    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
        const Chunk = @Vector(chunk_len, u8);

        // Fast path. Check for and encode ASCII characters at the start of the input.
@ -1244,57 +1223,60 @@ pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates:
                // found a non ASCII code unit
                break;
            }
-            const zeroes: Chunk = @splat(0);
-            const utf16_bytes: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
-            @memcpy(utf16le[dest_i..][0..chunk_len], std.mem.bytesAsSlice(u16, &utf16_bytes));
-            dest_i += chunk_len;
+            const utf16_chunk = mem.nativeToLittle(@Vector(chunk_len, u16), chunk);
+            utf16le[dest_index..][0..chunk_len].* = utf16_chunk;
+            dest_index += chunk_len;
            remaining = remaining[chunk_len..];
        }
    }

-    var src_i: usize = 0;
-    while (src_i < remaining.len) {
-        const n = utf8ByteSequenceLength(remaining[src_i]) catch return switch (surrogates) {
-            .cannot_encode_surrogate_half => error.InvalidUtf8,
-            .can_encode_surrogate_half => error.InvalidWtf8,
-        };
-        const next_src_i = src_i + n;
-        const codepoint = switch (surrogates) {
-            .cannot_encode_surrogate_half => utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8,
-            .can_encode_surrogate_half => wtf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidWtf8,
-        };
+    const view = switch (surrogates) {
+        .cannot_encode_surrogate_half => try Utf8View.init(remaining),
+        .can_encode_surrogate_half => try Wtf8View.init(remaining),
+    };
+    var it = view.iterator();
+    while (it.nextCodepoint()) |codepoint| {
        if (codepoint < 0x10000) {
-            const short = @as(u16, @intCast(codepoint));
-            utf16le[dest_i] = mem.nativeToLittle(u16, short);
-            dest_i += 1;
+            utf16le[dest_index] = mem.nativeToLittle(u16, @intCast(codepoint));
+            dest_index += 1;
        } else {
            const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
            const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
-            utf16le[dest_i] = mem.nativeToLittle(u16, high);
-            utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
-            dest_i += 2;
+            utf16le[dest_index..][0..2].* = .{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) };
+            dest_index += 2;
        }
-        src_i = next_src_i;
    }
-    return dest_i;
+    return dest_index;
 }

 test "utf8ToUtf16Le" {
-    var utf16le: [2]u16 = [_]u16{0} ** 2;
+    var utf16le: [128]u16 = undefined;
    {
        const length = try utf8ToUtf16Le(utf16le[0..], "𐐷");
-        try testing.expectEqual(@as(usize, 2), length);
-        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..]));
+        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..length]));
    }
    {
        const length = try utf8ToUtf16Le(utf16le[0..], "\u{10FFFF}");
-        try testing.expectEqual(@as(usize, 2), length);
-        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..]));
+        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..length]));
    }
    {
        const result = utf8ToUtf16Le(utf16le[0..], "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, result);
    }
+    {
+        const length = try utf8ToUtf16Le(utf16le[0..], "This string has been designed to test the vectorized implementat" ++
+            "ion by beginning with one hundred twenty-seven ASCII characters¡");
+        try testing.expectEqualSlices(u8, &.{
+            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ',  0,
+            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o',  0,
+            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r',  0,
+            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't',  0,
+            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g',  0,
+            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e',  0,
+            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A',  0,
+            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
+        }, mem.sliceAsBytes(utf16le[0..length]));
+    }
 }

 test utf8ToUtf16LeArrayList {
@ -1339,25 +1321,40 @@ test utf8ToUtf16LeAllocZ {
    {
        const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "𐐷");
        defer testing.allocator.free(utf16);
-        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16[0..]));
+        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16));
        try testing.expect(utf16[2] == 0);
    }
    {
        const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "\u{10FFFF}");
        defer testing.allocator.free(utf16);
-        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16[0..]));
+        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16));
        try testing.expect(utf16[2] == 0);
    }
    {
        const result = utf8ToUtf16LeAllocZ(testing.allocator, "\xf4\x90\x80\x80");
        try testing.expectError(error.InvalidUtf8, result);
    }
+    {
+        const utf16 = try utf8ToUtf16LeWithNull(testing.allocator, "This string has been designed to test the vectorized implementat" ++
+            "ion by beginning with one hundred twenty-seven ASCII characters¡");
+        defer testing.allocator.free(utf16);
+        try testing.expectEqualSlices(u8, &.{
+            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ',  0,
+            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o',  0,
+            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r',  0,
+            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't',  0,
+            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g',  0,
+            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e',  0,
+            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A',  0,
+            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
+        }, mem.sliceAsBytes(utf16));
+    }
 }

 /// Converts a UTF-8 string literal into a UTF-16LE string literal.
-pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch unreachable:0]u16 {
+pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
    return comptime blk: {
-        const len: usize = calcUtf16LeLen(utf8) catch |err| @compileError(err);
+        const len: usize = calcUtf16LeLen(utf8) catch unreachable;
        var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
        const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err);
        assert(len == utf16le_len);
@ -1438,12 +1435,12 @@ test "fmtUtf16Le" {
    try expectFmt("", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
    try expectFmt("foo", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
    try expectFmt("𐐷", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("𐐷"))});
-    try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xd7", native_endian)})});
-    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xd8", native_endian)})});
-    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdb", native_endian)})});
-    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xdc", native_endian)})});
-    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdf", native_endian)})});
-    try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xe0", native_endian)})});
+    try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
+    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
+    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
+    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xdc", native_endian)})});
+    try expectFmt("<EFBFBD>", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdf", native_endian)})});
+    try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});
 }

 test "utf8ToUtf16LeStringLiteral" {
@ -1686,8 +1683,9 @@ pub const Wtf8Iterator = struct {
    }
 };

-pub fn wtf16LeToWtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
-    return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .can_encode_surrogate_half);
+pub fn wtf16LeToWtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
+    try result.ensureTotalCapacityPrecise(utf16le.len);
+    return utf16LeToUtf8ArrayListImpl(result, utf16le, .can_encode_surrogate_half);
 }

 /// Caller must free returned memory.
@ -1696,8 +1694,7 @@ pub fn wtf16LeToWtf8Alloc(allocator: mem.Allocator, wtf16le: []const u16) mem.Al
    var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len);
    errdefer result.deinit();

-    try wtf16LeToWtf8ArrayList(&result, wtf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
    return result.toOwnedSlice();
 }

@ -1707,8 +1704,7 @@ pub fn wtf16LeToWtf8AllocZ(allocator: mem.Allocator, wtf16le: []const u16) mem.A
    var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len + 1);
    errdefer result.deinit();

-    try wtf16LeToWtf8ArrayList(&result, wtf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
    return result.toOwnedSliceSentinel(0);
 }

@ -1716,8 +1712,9 @@ pub fn wtf16LeToWtf8(wtf8: []u8, wtf16le: []const u16) usize {
    return utf16LeToUtf8Impl(wtf8, wtf16le, .can_encode_surrogate_half) catch |err| switch (err) {};
 }

-pub fn wtf8ToWtf16LeArrayList(array_list: *std.ArrayList(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
-    return utf8ToUtf16LeArrayListImpl(array_list, wtf8, .can_encode_surrogate_half);
+pub fn wtf8ToWtf16LeArrayList(result: *std.ArrayList(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
+    try result.ensureTotalCapacityPrecise(wtf8.len);
+    return utf8ToUtf16LeArrayListImpl(result, wtf8, .can_encode_surrogate_half);
 }

 pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u16 {
@ -1726,7 +1723,6 @@ pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) error{ Inv
    errdefer result.deinit();

    try utf8ToUtf16LeArrayListImpl(&result, wtf8, .can_encode_surrogate_half);
-
    return result.toOwnedSlice();
 }

@ -1736,7 +1732,6 @@ pub fn wtf8ToWtf16LeAllocZ(allocator: mem.Allocator, wtf8: []const u8) error{ In
    errdefer result.deinit();

    try utf8ToUtf16LeArrayListImpl(&result, wtf8, .can_encode_surrogate_half);
-
    return result.toOwnedSliceSentinel(0);
 }

@ -1895,7 +1890,7 @@ pub const Wtf16LeIterator = struct {

    pub fn init(s: []const u16) Wtf16LeIterator {
        return Wtf16LeIterator{
-            .bytes = std.mem.sliceAsBytes(s),
+            .bytes = mem.sliceAsBytes(s),
            .i = 0,
        };
    }
@ -1908,12 +1903,12 @@ pub const Wtf16LeIterator = struct {
        assert(it.i <= it.bytes.len);
        if (it.i == it.bytes.len) return null;
        var code_units: [2]u16 = undefined;
-        code_units[0] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
+        code_units[0] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
        it.i += 2;
        surrogate_pair: {
            if (utf16IsHighSurrogate(code_units[0])) {
                if (it.i >= it.bytes.len) break :surrogate_pair;
-                code_units[1] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
+                code_units[1] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
                const codepoint = utf16DecodeSurrogatePair(&code_units) catch break :surrogate_pair;
                it.i += 2;
                return codepoint;
@ -2030,31 +2025,31 @@ fn testRoundtripWtf16(wtf16le: []const u16) !void {

 test "well-formed WTF-16 roundtrips" {
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD83D), // high surrogate
-        std.mem.nativeToLittle(u16, 0xDCA9), // low surrogate
+        mem.nativeToLittle(u16, 0xD83D), // high surrogate
+        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
    });
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD83D), // high surrogate
-        std.mem.nativeToLittle(u16, ' '), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDCA9), // low surrogate
+        mem.nativeToLittle(u16, 0xD83D), // high surrogate
+        mem.nativeToLittle(u16, ' '), // not surrogate
+        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
    });
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD800), // high surrogate
-        std.mem.nativeToLittle(u16, 0xDBFF), // high surrogate
+        mem.nativeToLittle(u16, 0xD800), // high surrogate
+        mem.nativeToLittle(u16, 0xDBFF), // high surrogate
    });
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD800), // high surrogate
-        std.mem.nativeToLittle(u16, 0xE000), // not surrogate
+        mem.nativeToLittle(u16, 0xD800), // high surrogate
+        mem.nativeToLittle(u16, 0xE000), // not surrogate
    });
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD7FF), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0xD7FF), // not surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
    });
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0x61), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0x61), // not surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
    });
    try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
    });
 }
--- a/lib/std/zig/c_translation.zig
+++ b/lib/std/zig/c_translation.zig
@ -308,14 +308,12 @@ test "promoteIntLiteral" {

 /// Convert from clang __builtin_shufflevector index to Zig @shuffle index
 /// clang requires __builtin_shufflevector index arguments to be integer constants.
-/// negative values for `this_index` indicate "don't care" so we arbitrarily choose 0
+/// negative values for `this_index` indicate "don't care".
 /// clang enforces that `this_index` is less than the total number of vector elements
 /// See https://ziglang.org/documentation/master/#shuffle
 /// See https://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-shufflevector
 pub fn shuffleVectorIndex(comptime this_index: c_int, comptime source_vector_len: usize) i32 {
-    if (this_index <= 0) return 0;
-
-    const positive_index = @as(usize, @intCast(this_index));
+    const positive_index = std.math.cast(usize, this_index) orelse return undefined;
    if (positive_index < source_vector_len) return @as(i32, @intCast(this_index));
    const b_index = positive_index - source_vector_len;
    return ~@as(i32, @intCast(b_index));
@ -324,7 +322,7 @@ pub fn shuffleVectorIndex(comptime this_index: c_int, comptime source_vector_len
 test "shuffleVectorIndex" {
    const vector_len: usize = 4;

-    try testing.expect(shuffleVectorIndex(-1, vector_len) == 0);
+    _ = shuffleVectorIndex(-1, vector_len);

    try testing.expect(shuffleVectorIndex(0, vector_len) == 0);
    try testing.expect(shuffleVectorIndex(1, vector_len) == 1);
--- a/src/InternPool.zig
+++ b/src/InternPool.zig
@ -3587,6 +3587,7 @@ pub const Alignment = enum(u6) {
    @"8" = 3,
    @"16" = 4,
    @"32" = 5,
+    @"64" = 6,
    none = std.math.maxInt(u6),
    _,

@ -7403,10 +7404,14 @@ pub fn isIntegerType(ip: *const InternPool, ty: Index) bool {
        .c_ulong_type,
        .c_longlong_type,
        .c_ulonglong_type,
-        .c_longdouble_type,
        .comptime_int_type,
        => true,
-        else => ip.indexToKey(ty) == .int_type,
+        else => switch (ip.items.items(.tag)[@intFromEnum(ty)]) {
+            .type_int_signed,
+            .type_int_unsigned,
+            => true,
+            else => false,
+        },
    };
 }

--- a/src/Sema.zig
+++ b/src/Sema.zig
@ -23315,7 +23315,8 @@ fn checkVectorElemType(
    const mod = sema.mod;
    switch (ty.zigTypeTag(mod)) {
        .Int, .Float, .Bool => return,
-        else => if (ty.isPtrAtRuntime(mod)) return,
+        .Optional, .Pointer => if (ty.isPtrAtRuntime(mod)) return,
+        else => {},
    }
    return sema.fail(block, ty_src, "expected integer, float, bool, or pointer for the vector element type; found '{}'", .{ty.fmt(mod)});
 }
@ -28442,7 +28443,7 @@ const CoerceOpts = struct {
    report_err: bool = true,
    /// Ignored if `report_err == false`.
    is_ret: bool = false,
-    /// Should coercion to comptime_int ermit an error message.
+    /// Should coercion to comptime_int emit an error message.
    no_cast_to_comptime_int: bool = false,

    param_src: struct {
@ -31845,6 +31846,34 @@ fn coerceArrayLike(
    }

    const dest_elem_ty = dest_ty.childType(mod);
+    if (dest_ty.isVector(mod) and inst_ty.isVector(mod) and (try sema.resolveValue(inst)) == null) {
+        const inst_elem_ty = inst_ty.childType(mod);
+        switch (dest_elem_ty.zigTypeTag(mod)) {
+            .Int => if (inst_elem_ty.isInt(mod)) {
+                // integer widening
+                const dst_info = dest_elem_ty.intInfo(mod);
+                const src_info = inst_elem_ty.intInfo(mod);
+                if ((src_info.signedness == dst_info.signedness and dst_info.bits >= src_info.bits) or
+                    // small enough unsigned ints can get casted to large enough signed ints
+                    (dst_info.signedness == .signed and dst_info.bits > src_info.bits))
+                {
+                    try sema.requireRuntimeBlock(block, inst_src, null);
+                    return block.addTyOp(.intcast, dest_ty, inst);
+                }
+            },
+            .Float => if (inst_elem_ty.isRuntimeFloat()) {
+                // float widening
+                const src_bits = inst_elem_ty.floatBits(target);
+                const dst_bits = dest_elem_ty.floatBits(target);
+                if (dst_bits >= src_bits) {
+                    try sema.requireRuntimeBlock(block, inst_src, null);
+                    return block.addTyOp(.fpext, dest_ty, inst);
+                }
+            },
+            else => {},
+        }
+    }
+
    const element_vals = try sema.arena.alloc(InternPool.Index, dest_len);
    const element_refs = try sema.arena.alloc(Air.Inst.Ref, dest_len);
    var runtime_src: ?LazySrcLoc = null;
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
--- a/src/arch/x86_64/Encoding.zig
+++ b/src/arch/x86_64/Encoding.zig
@ -324,16 +324,19 @@ pub const Mnemonic = enum {
    // SSE3
    movddup, movshdup, movsldup,
    // SSSE3
-    pabsb, pabsd, pabsw, palignr,
+    pabsb, pabsd, pabsw, palignr, pshufb,
    // SSE4.1
    blendpd, blendps, blendvpd, blendvps,
    extractps,
    insertps,
    packusdw,
+    pblendvb, pblendw,
    pcmpeqq,
    pextrb, pextrd, pextrq,
    pinsrb, pinsrd, pinsrq,
    pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw,
+    pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq,
+    pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq,
    pmulld,
    roundpd, roundps, roundsd, roundss,
    // SSE4.2
@ -377,7 +380,8 @@ pub const Mnemonic = enum {
    vpabsb, vpabsd, vpabsw,
    vpackssdw, vpacksswb, vpackusdw, vpackuswb,
    vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw,
-    vpalignr, vpand, vpandn, vpclmulqdq,
+    vpalignr, vpand, vpandn,
+    vpblendvb, vpblendw, vpclmulqdq,
    vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw,
    vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw,
    vpextrb, vpextrd, vpextrq, vpextrw,
@ -385,9 +389,11 @@ pub const Mnemonic = enum {
    vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw,
    vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw,
    vpmovmskb,
+    vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq,
+    vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq,
    vpmulhw, vpmulld, vpmullw,
    vpor,
-    vpshufd, vpshufhw, vpshuflw,
+    vpshufb, vpshufd, vpshufhw, vpshuflw,
    vpslld, vpslldq, vpsllq, vpsllw,
    vpsrad, vpsraq, vpsraw,
    vpsrld, vpsrldq, vpsrlq, vpsrlw,
@ -409,7 +415,8 @@ pub const Mnemonic = enum {
    vfmadd132sd, vfmadd213sd, vfmadd231sd,
    vfmadd132ss, vfmadd213ss, vfmadd231ss,
    // AVX2
-    vpbroadcastb, vpbroadcastd, vpbroadcasti128, vpbroadcastq, vpbroadcastw,
+    vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw,
+    vextracti128, vinserti128, vpblendd,
    // zig fmt: on
 };

--- a/src/arch/x86_64/Lower.zig
+++ b/src/arch/x86_64/Lower.zig
@ -477,8 +477,9 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
        .rri_s, .rri_u => inst.data.rri.fixes,
        .ri_s, .ri_u => inst.data.ri.fixes,
        .ri64, .rm, .rmi_s, .mr => inst.data.rx.fixes,
-        .mrr, .rrm => inst.data.rrx.fixes,
+        .mrr, .rrm, .rmr => inst.data.rrx.fixes,
        .rmi, .mri => inst.data.rix.fixes,
+        .rrmr => inst.data.rrrx.fixes,
        .rrmi => inst.data.rrix.fixes,
        .mi_u, .mi_s => inst.data.x.fixes,
        .m => inst.data.x.fixes,
@ -565,6 +566,11 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
            .{ .reg = inst.data.rx.r1 },
            .{ .mem = lower.mem(inst.data.rx.payload) },
        },
+        .rmr => &.{
+            .{ .reg = inst.data.rrx.r1 },
+            .{ .mem = lower.mem(inst.data.rrx.payload) },
+            .{ .reg = inst.data.rrx.r2 },
+        },
        .rmi => &.{
            .{ .reg = inst.data.rix.r1 },
            .{ .mem = lower.mem(inst.data.rix.payload) },
@ -597,6 +603,12 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void {
            .{ .reg = inst.data.rrx.r2 },
            .{ .mem = lower.mem(inst.data.rrx.payload) },
        },
+        .rrmr => &.{
+            .{ .reg = inst.data.rrrx.r1 },
+            .{ .reg = inst.data.rrrx.r2 },
+            .{ .mem = lower.mem(inst.data.rrrx.payload) },
+            .{ .reg = inst.data.rrrx.r3 },
+        },
        .rrmi => &.{
            .{ .reg = inst.data.rrix.r1 },
            .{ .reg = inst.data.rrix.r2 },
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@ -230,6 +230,8 @@ pub const Inst = struct {
        v_d,
        /// VEX-Encoded ___ QuadWord
        v_q,
+        /// VEX-Encoded ___ Integer Data
+        v_i128,
        /// VEX-Encoded Packed ___
        vp_,
        /// VEX-Encoded Packed ___ Byte
@ -242,8 +244,6 @@ pub const Inst = struct {
        vp_q,
        /// VEX-Encoded Packed ___ Double Quadword
        vp_dq,
-        /// VEX-Encoded Packed ___ Integer Data
-        vp_i128,
        /// VEX-Encoded ___ Scalar Single-Precision Values
        v_ss,
        /// VEX-Encoded ___ Packed Single-Precision Values
@ -654,10 +654,19 @@ pub const Inst = struct {
        /// Variable blend scalar double-precision floating-point values
        blendv,
        /// Extract packed floating-point values
+        /// Extract packed integer values
        extract,
        /// Insert scalar single-precision floating-point value
        /// Insert packed floating-point values
        insert,
+        /// Packed move with sign extend
+        movsxb,
+        movsxd,
+        movsxw,
+        /// Packed move with zero extend
+        movzxb,
+        movzxd,
+        movzxw,
        /// Round packed single-precision floating-point values
        /// Round scalar single-precision floating-point value
        /// Round packed double-precision floating-point values
@ -688,6 +697,7 @@ pub const Inst = struct {
        sha256rnds2,

        /// Load with broadcast floating-point data
+        /// Load integer and broadcast
        broadcast,

        /// Convert 16-bit floating-point values to single-precision floating-point values
@ -762,8 +772,11 @@ pub const Inst = struct {
        /// Uses `imm` payload.
        rel,
        /// Register, memory operands.
-        /// Uses `rx` payload.
+        /// Uses `rx` payload with extra data of type `Memory`.
        rm,
+        /// Register, memory, register operands.
+        /// Uses `rrx` payload with extra data of type `Memory`.
+        rmr,
        /// Register, memory, immediate (word) operands.
        /// Uses `rix` payload with extra data of type `Memory`.
        rmi,
@ -776,6 +789,9 @@ pub const Inst = struct {
        /// Register, register, memory.
        /// Uses `rrix` payload with extra data of type `Memory`.
        rrm,
+        /// Register, register, memory, register.
+        /// Uses `rrrx` payload with extra data of type `Memory`.
+        rrmr,
        /// Register, register, memory, immediate (byte) operands.
        /// Uses `rrix` payload with extra data of type `Memory`.
        rrmi,
@ -953,6 +969,14 @@ pub const Inst = struct {
            r2: Register,
            payload: u32,
        },
+        /// Register, register, register, followed by Custom payload found in extra.
+        rrrx: struct {
+            fixes: Fixes = ._,
+            r1: Register,
+            r2: Register,
+            r3: Register,
+            payload: u32,
+        },
        /// Register, byte immediate, followed by Custom payload found in extra.
        rix: struct {
            fixes: Fixes = ._,
--- a/src/arch/x86_64/encodings.zig
+++ b/src/arch/x86_64/encodings.zig
@ -1185,6 +1185,8 @@ pub const table = [_]Entry{

    .{ .palignr, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .none, .ssse3 },

+    .{ .pshufb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .none, .ssse3 },
+
    // SSE4.1
    .{ .blendpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 },

@ -1202,6 +1204,11 @@ pub const table = [_]Entry{

    .{ .packusdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .none, .sse4_1 },

+    .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128        }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 },
+    .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 },
+
+    .{ .pblendw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .none, .sse4_1 },
+
    .{ .pcmpeqq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x29 }, 0, .none, .sse4_1 },

    .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
@ -1228,6 +1235,20 @@ pub const table = [_]Entry{

    .{ .pminud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .none, .sse4_1 },

+    .{ .pmovsxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .none, .sse4_1 },
+    .{ .pmovsxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .none, .sse4_1 },
+    .{ .pmovsxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .none, .sse4_1 },
+    .{ .pmovsxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .none, .sse4_1 },
+    .{ .pmovsxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .none, .sse4_1 },
+    .{ .pmovsxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .none, .sse4_1 },
+
+    .{ .pmovzxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .none, .sse4_1 },
+    .{ .pmovzxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .none, .sse4_1 },
+    .{ .pmovzxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .none, .sse4_1 },
+    .{ .pmovzxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .none, .sse4_1 },
+    .{ .pmovzxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .none, .sse4_1 },
+    .{ .pmovzxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .none, .sse4_1 },
+
    .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 },

    .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
@ -1528,6 +1549,10 @@ pub const table = [_]Entry{

    .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx },

+    .{ .vpblendvb, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_128_w0, .avx },
+
+    .{ .vpblendw, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_128_wig, .avx },
+
    .{ .vpclmulqdq, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .vex_128_wig, .@"pclmul avx" },

    .{ .vpcmpeqb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_128_wig, .avx },
@ -1576,6 +1601,20 @@ pub const table = [_]Entry{
    .{ .vpmovmskb, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_128_wig, .avx },
    .{ .vpmovmskb, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_128_wig, .avx },

+    .{ .vpmovsxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovsxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpmovzxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_128_wig, .avx },
+    .{ .vpmovzxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_128_wig, .avx },
+
    .{ .vpmulhw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx },

    .{ .vpmulld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
@ -1584,6 +1623,8 @@ pub const table = [_]Entry{

    .{ .vpor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx },

+    .{ .vpshufb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx },
+
    .{ .vpshufd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },

    .{ .vpshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_128_wig, .avx },
@ -1728,6 +1769,10 @@ pub const table = [_]Entry{
    .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 },
    .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 },

+    .{ .vextracti128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x39 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vinserti128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x38 }, 0, .vex_256_w0, .avx2 },
+
    .{ .vpabsb, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1c }, 0, .vex_256_wig, .avx2 },
    .{ .vpabsd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1e }, 0, .vex_256_wig, .avx2 },
    .{ .vpabsw, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1d }, 0, .vex_256_wig, .avx2 },
@ -1756,6 +1801,13 @@ pub const table = [_]Entry{

    .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 },

+    .{ .vpblendd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_128_w0, .avx2 },
+    .{ .vpblendd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpblendvb, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_256_w0, .avx2 },
+
+    .{ .vpblendw, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_256_wig, .avx2 },
+
    .{ .vpbroadcastb,    .rm, &.{ .xmm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_128_w0, .avx2 },
    .{ .vpbroadcastb,    .rm, &.{ .ymm, .xmm_m8  }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_256_w0, .avx2 },
    .{ .vpbroadcastw,    .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x79 }, 0, .vex_128_w0, .avx2 },
@ -1764,7 +1816,7 @@ pub const table = [_]Entry{
    .{ .vpbroadcastd,    .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x58 }, 0, .vex_256_w0, .avx2 },
    .{ .vpbroadcastq,    .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_128_w0, .avx2 },
    .{ .vpbroadcastq,    .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_256_w0, .avx2 },
-    .{ .vpbroadcasti128, .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },
+    .{ .vbroadcasti128,  .rm, &.{ .ymm, .m128    }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 },

    .{ .vpcmpeqb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_256_wig, .avx2 },
    .{ .vpcmpeqw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_256_wig, .avx2 },
@ -1799,6 +1851,20 @@ pub const table = [_]Entry{
    .{ .vpmovmskb, .rm, &.{ .r32, .ymm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_256_wig, .avx2 },
    .{ .vpmovmskb, .rm, &.{ .r64, .ymm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_256_wig, .avx2 },

+    .{ .vpmovsxbw, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxbd, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxbq, .rm, &.{ .ymm, .xmm_m32  }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxwd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxwq, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovsxdq, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .vex_256_wig, .avx2 },
+
+    .{ .vpmovzxbw, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxbd, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxbq, .rm, &.{ .ymm, .xmm_m32  }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxwd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxwq, .rm, &.{ .ymm, .xmm_m64  }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_256_wig, .avx2 },
+    .{ .vpmovzxdq, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_256_wig, .avx2 },
+
    .{ .vpmulhw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 },

    .{ .vpmulld, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
@ -1807,6 +1873,7 @@ pub const table = [_]Entry{

    .{ .vpor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 },

+    .{ .vpshufb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 },
    .{ .vpshufd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },

    .{ .vpshufhw, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 },
--- a/src/codegen.zig
+++ b/src/codegen.zig
@ -405,7 +405,7 @@ pub fn generateSymbol(
            .vector_type => |vector_type| {
                const abi_size = math.cast(usize, typed_value.ty.abiSize(mod)) orelse
                    return error.Overflow;
-                if (Type.fromInterned(vector_type.child).bitSize(mod) == 1) {
+                if (vector_type.child == .bool_type) {
                    const bytes = try code.addManyAsSlice(abi_size);
                    @memset(bytes, 0xaa);
                    var index: usize = 0;
@ -443,37 +443,34 @@ pub fn generateSymbol(
                            },
                        }) byte.* |= mask else byte.* &= ~mask;
                    }
-                } else switch (aggregate.storage) {
-                    .bytes => |bytes| try code.appendSlice(bytes),
-                    .elems, .repeated_elem => {
-                        var index: u64 = 0;
-                        while (index < vector_type.len) : (index += 1) {
-                            switch (try generateSymbol(bin_file, src_loc, .{
-                                .ty = Type.fromInterned(vector_type.child),
-                                .val = Value.fromInterned(switch (aggregate.storage) {
-                                    .bytes => unreachable,
-                                    .elems => |elems| elems[
-                                        math.cast(usize, index) orelse return error.Overflow
-                                    ],
-                                    .repeated_elem => |elem| elem,
-                                }),
-                            }, code, debug_output, reloc_info)) {
-                                .ok => {},
-                                .fail => |em| return .{ .fail = em },
+                } else {
+                    switch (aggregate.storage) {
+                        .bytes => |bytes| try code.appendSlice(bytes),
+                        .elems, .repeated_elem => {
+                            var index: u64 = 0;
+                            while (index < vector_type.len) : (index += 1) {
+                                switch (try generateSymbol(bin_file, src_loc, .{
+                                    .ty = Type.fromInterned(vector_type.child),
+                                    .val = Value.fromInterned(switch (aggregate.storage) {
+                                        .bytes => unreachable,
+                                        .elems => |elems| elems[
+                                            math.cast(usize, index) orelse return error.Overflow
+                                        ],
+                                        .repeated_elem => |elem| elem,
+                                    }),
+                                }, code, debug_output, reloc_info)) {
+                                    .ok => {},
+                                    .fail => |em| return .{ .fail = em },
+                                }
                            }
-                        }
-                    },
-                }
+                        },
+                    }

-                const padding = abi_size - (math.cast(usize, math.divCeil(
-                    u64,
-                    Type.fromInterned(vector_type.child).bitSize(mod) * vector_type.len,
-                    8,
-                ) catch |err| switch (err) {
-                    error.DivisionByZero => unreachable,
-                    else => |e| return e,
-                }) orelse return error.Overflow);
-                if (padding > 0) try code.appendNTimes(0, padding);
+                    const padding = abi_size -
+                        (math.cast(usize, Type.fromInterned(vector_type.child).abiSize(mod) * vector_type.len) orelse
+                        return error.Overflow);
+                    if (padding > 0) try code.appendNTimes(0, padding);
+                }
            },
            .anon_struct_type => |tuple| {
                const struct_begin = code.items.len;
--- a/src/codegen/c.zig
+++ b/src/codegen/c.zig
@ -4140,9 +4140,7 @@ fn airCmpOp(
    if (need_cast) try writer.writeAll("(void*)");
    try f.writeCValue(writer, lhs, .Other);
    try v.elem(f, writer);
-    try writer.writeByte(' ');
    try writer.writeAll(compareOperatorC(operator));
-    try writer.writeByte(' ');
    if (need_cast) try writer.writeAll("(void*)");
    try f.writeCValue(writer, rhs, .Other);
    try v.elem(f, writer);
@ -4181,41 +4179,28 @@ fn airEquality(
    const writer = f.object.writer();
    const inst_ty = f.typeOfIndex(inst);
    const local = try f.allocLocal(inst, inst_ty);
+    const a = try Assignment.start(f, writer, inst_ty);
    try f.writeCValue(writer, local, .Other);
-    try writer.writeAll(" = ");
+    try a.assign(f, writer);

    if (operand_ty.zigTypeTag(mod) == .Optional and !operand_ty.optionalReprIsPayload(mod)) {
-        // (A && B)  || (C && (A == B))
-        // A = lhs.is_null  ;  B = rhs.is_null  ;  C = rhs.payload == lhs.payload
-
-        switch (operator) {
-            .eq => {},
-            .neq => try writer.writeByte('!'),
-            else => unreachable,
-        }
-        try writer.writeAll("((");
+        try f.writeCValueMember(writer, lhs, .{ .identifier = "is_null" });
+        try writer.writeAll(" || ");
+        try f.writeCValueMember(writer, rhs, .{ .identifier = "is_null" });
+        try writer.writeAll(" ? ");
+        try f.writeCValueMember(writer, lhs, .{ .identifier = "is_null" });
+        try writer.writeAll(compareOperatorC(operator));
+        try f.writeCValueMember(writer, rhs, .{ .identifier = "is_null" });
+        try writer.writeAll(" : ");
+        try f.writeCValueMember(writer, lhs, .{ .identifier = "payload" });
+        try writer.writeAll(compareOperatorC(operator));
+        try f.writeCValueMember(writer, rhs, .{ .identifier = "payload" });
+    } else {
        try f.writeCValue(writer, lhs, .Other);
-        try writer.writeAll(".is_null && ");
+        try writer.writeAll(compareOperatorC(operator));
        try f.writeCValue(writer, rhs, .Other);
-        try writer.writeAll(".is_null) || (");
-        try f.writeCValue(writer, lhs, .Other);
-        try writer.writeAll(".payload == ");
-        try f.writeCValue(writer, rhs, .Other);
-        try writer.writeAll(".payload && ");
-        try f.writeCValue(writer, lhs, .Other);
-        try writer.writeAll(".is_null == ");
-        try f.writeCValue(writer, rhs, .Other);
-        try writer.writeAll(".is_null));\n");
-
-        return local;
    }
-
-    try f.writeCValue(writer, lhs, .Other);
-    try writer.writeByte(' ');
-    try writer.writeAll(compareOperatorC(operator));
-    try writer.writeByte(' ');
-    try f.writeCValue(writer, rhs, .Other);
-    try writer.writeAll(";\n");
+    try a.end(f, writer);

    return local;
 }
@ -6109,41 +6094,48 @@ fn airFloatCast(f: *Function, inst: Air.Inst.Index) !CValue {
    const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;

    const inst_ty = f.typeOfIndex(inst);
+    const inst_scalar_ty = inst_ty.scalarType(mod);
    const operand = try f.resolveInst(ty_op.operand);
    try reap(f, inst, &.{ty_op.operand});
    const operand_ty = f.typeOf(ty_op.operand);
+    const scalar_ty = operand_ty.scalarType(mod);
    const target = f.object.dg.module.getTarget();
-    const operation = if (inst_ty.isRuntimeFloat() and operand_ty.isRuntimeFloat())
-        if (inst_ty.floatBits(target) < operand_ty.floatBits(target)) "trunc" else "extend"
-    else if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat())
-        if (inst_ty.isSignedInt(mod)) "fix" else "fixuns"
-    else if (inst_ty.isRuntimeFloat() and operand_ty.isInt(mod))
-        if (operand_ty.isSignedInt(mod)) "float" else "floatun"
+    const operation = if (inst_scalar_ty.isRuntimeFloat() and scalar_ty.isRuntimeFloat())
+        if (inst_scalar_ty.floatBits(target) < scalar_ty.floatBits(target)) "trunc" else "extend"
+    else if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat())
+        if (inst_scalar_ty.isSignedInt(mod)) "fix" else "fixuns"
+    else if (inst_scalar_ty.isRuntimeFloat() and scalar_ty.isInt(mod))
+        if (scalar_ty.isSignedInt(mod)) "float" else "floatun"
    else
        unreachable;

    const writer = f.object.writer();
    const local = try f.allocLocal(inst, inst_ty);
+    const v = try Vectorize.start(f, inst, writer, operand_ty);
+    const a = try Assignment.start(f, writer, scalar_ty);
    try f.writeCValue(writer, local, .Other);
-
-    try writer.writeAll(" = ");
-    if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) {
+    try v.elem(f, writer);
+    try a.assign(f, writer);
+    if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) {
        try writer.writeAll("zig_wrap_");
-        try f.object.dg.renderTypeForBuiltinFnName(writer, inst_ty);
+        try f.object.dg.renderTypeForBuiltinFnName(writer, inst_scalar_ty);
        try writer.writeByte('(');
    }
    try writer.writeAll("zig_");
    try writer.writeAll(operation);
-    try writer.writeAll(compilerRtAbbrev(operand_ty, mod));
-    try writer.writeAll(compilerRtAbbrev(inst_ty, mod));
+    try writer.writeAll(compilerRtAbbrev(scalar_ty, mod));
+    try writer.writeAll(compilerRtAbbrev(inst_scalar_ty, mod));
    try writer.writeByte('(');
    try f.writeCValue(writer, operand, .FunctionArgument);
+    try v.elem(f, writer);
    try writer.writeByte(')');
-    if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) {
-        try f.object.dg.renderBuiltinInfo(writer, inst_ty, .bits);
+    if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) {
+        try f.object.dg.renderBuiltinInfo(writer, inst_scalar_ty, .bits);
        try writer.writeByte(')');
    }
-    try writer.writeAll(";\n");
+    try a.end(f, writer);
+    try v.end(f, inst, writer);
+
    return local;
 }

@ -6315,7 +6307,7 @@ fn airCmpBuiltinCall(
    try v.elem(f, writer);
    try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info);
    try writer.writeByte(')');
-    if (!ref_ret) try writer.print(" {s} {}", .{
+    if (!ref_ret) try writer.print("{s}{}", .{
        compareOperatorC(operator),
        try f.fmtIntLiteral(Type.i32, try mod.intValue(Type.i32, 0)),
    });
@ -7661,12 +7653,12 @@ fn compareOperatorAbbrev(operator: std.math.CompareOperator) []const u8 {

 fn compareOperatorC(operator: std.math.CompareOperator) []const u8 {
    return switch (operator) {
-        .lt => "<",
-        .lte => "<=",
-        .eq => "==",
-        .gte => ">=",
-        .gt => ">",
-        .neq => "!=",
+        .lt => " < ",
+        .lte => " <= ",
+        .eq => " == ",
+        .gte => " >= ",
+        .gt => " > ",
+        .neq => " != ",
    };
 }

--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@ -8646,8 +8646,6 @@ pub const FuncGen = struct {
        const operand_ty = self.typeOf(ty_op.operand);
        const dest_ty = self.typeOfIndex(inst);
        const target = mod.getTarget();
-        const dest_bits = dest_ty.floatBits(target);
-        const src_bits = operand_ty.floatBits(target);

        if (intrinsicsAllowed(dest_ty, target) and intrinsicsAllowed(operand_ty, target)) {
            return self.wip.cast(.fpext, operand, try o.lowerType(dest_ty), "");
@ -8655,11 +8653,19 @@ pub const FuncGen = struct {
            const operand_llvm_ty = try o.lowerType(operand_ty);
            const dest_llvm_ty = try o.lowerType(dest_ty);

+            const dest_bits = dest_ty.scalarType(mod).floatBits(target);
+            const src_bits = operand_ty.scalarType(mod).floatBits(target);
            const fn_name = try o.builder.fmt("__extend{s}f{s}f2", .{
                compilerRtFloatAbbrev(src_bits), compilerRtFloatAbbrev(dest_bits),
            });

            const libc_fn = try self.getLibcFunction(fn_name, &.{operand_llvm_ty}, dest_llvm_ty);
+            if (dest_ty.isVector(mod)) return self.buildElementwiseCall(
+                libc_fn,
+                &.{operand},
+                try o.builder.poisonValue(dest_llvm_ty),
+                dest_ty.vectorLen(mod),
+            );
            return self.wip.call(
                .normal,
                .ccc,
--- a/src/type.zig
+++ b/src/type.zig
@ -905,11 +905,32 @@ pub const Type = struct {
                    return Type.fromInterned(array_type.child).abiAlignmentAdvanced(mod, strat);
                },
                .vector_type => |vector_type| {
-                    const bits_u64 = try bitSizeAdvanced(Type.fromInterned(vector_type.child), mod, opt_sema);
-                    const bits: u32 = @intCast(bits_u64);
-                    const bytes = ((bits * vector_type.len) + 7) / 8;
-                    const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
-                    return .{ .scalar = Alignment.fromByteUnits(alignment) };
+                    if (vector_type.len == 0) return .{ .scalar = .@"1" };
+                    switch (mod.comp.getZigBackend()) {
+                        else => {
+                            const elem_bits: u32 = @intCast(try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema));
+                            if (elem_bits == 0) return .{ .scalar = .@"1" };
+                            const bytes = ((elem_bits * vector_type.len) + 7) / 8;
+                            const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
+                            return .{ .scalar = Alignment.fromByteUnits(alignment) };
+                        },
+                        .stage2_x86_64 => {
+                            if (vector_type.child == .bool_type) {
+                                if (vector_type.len > 256 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" };
+                                if (vector_type.len > 128 and std.Target.x86.featureSetHas(target.cpu.features, .avx2)) return .{ .scalar = .@"32" };
+                                if (vector_type.len > 64) return .{ .scalar = .@"16" };
+                                const bytes = std.math.divCeil(u32, vector_type.len, 8) catch unreachable;
+                                const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes);
+                                return .{ .scalar = Alignment.fromByteUnits(alignment) };
+                            }
+                            const elem_bytes: u32 = @intCast((try Type.fromInterned(vector_type.child).abiSizeAdvanced(mod, strat)).scalar);
+                            if (elem_bytes == 0) return .{ .scalar = .@"1" };
+                            const bytes = elem_bytes * vector_type.len;
+                            if (bytes > 32 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" };
+                            if (bytes > 16 and std.Target.x86.featureSetHas(target.cpu.features, .avx)) return .{ .scalar = .@"32" };
+                            return .{ .scalar = .@"16" };
+                        },
+                    }
                },

                .opt_type => return abiAlignmentAdvancedOptional(ty, mod, strat),
@ -1237,9 +1258,6 @@ pub const Type = struct {
                            .storage = .{ .lazy_size = ty.toIntern() },
                        } }))) },
                    };
-                    const elem_bits = try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema);
-                    const total_bits = elem_bits * vector_type.len;
-                    const total_bytes = (total_bits + 7) / 8;
                    const alignment = switch (try ty.abiAlignmentAdvanced(mod, strat)) {
                        .scalar => |x| x,
                        .val => return .{ .val = Value.fromInterned((try mod.intern(.{ .int = .{
@ -1247,6 +1265,18 @@ pub const Type = struct {
                            .storage = .{ .lazy_size = ty.toIntern() },
                        } }))) },
                    };
+                    const total_bytes = switch (mod.comp.getZigBackend()) {
+                        else => total_bytes: {
+                            const elem_bits = try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema);
+                            const total_bits = elem_bits * vector_type.len;
+                            break :total_bytes (total_bits + 7) / 8;
+                        },
+                        .stage2_x86_64 => total_bytes: {
+                            if (vector_type.child == .bool_type) break :total_bytes std.math.divCeil(u32, vector_type.len, 8) catch unreachable;
+                            const elem_bytes: u32 = @intCast((try Type.fromInterned(vector_type.child).abiSizeAdvanced(mod, strat)).scalar);
+                            break :total_bytes elem_bytes * vector_type.len;
+                        },
+                    };
                    return AbiSizeAdvanced{ .scalar = alignment.forward(total_bytes) };
                },

@ -2108,7 +2138,8 @@ pub const Type = struct {

    /// Returns true if and only if the type is a fixed-width integer.
    pub fn isInt(self: Type, mod: *const Module) bool {
-        return self.isSignedInt(mod) or self.isUnsignedInt(mod);
+        return self.toIntern() != .comptime_int_type and
+            mod.intern_pool.isIntegerType(self.toIntern());
    }

    /// Returns true if and only if the type is a fixed-width, signed integer.
--- a/test/behavior/bitcast.zig
+++ b/test/behavior/bitcast.zig
@ -336,7 +336,7 @@ test "comptime @bitCast packed struct to int and back" {
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest;
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
-    if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;

    if (builtin.zig_backend == .stage2_llvm and native_endian == .big) {
        // https://github.com/ziglang/zig/issues/13782
--- a/test/behavior/cast.zig
+++ b/test/behavior/cast.zig
@ -601,25 +601,25 @@ test "cast *[1][*]const u8 to [*]const ?[*]const u8" {

 test "@intCast on vector" {
    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;

    const S = struct {
        fn doTheTest() !void {
            // Upcast (implicit, equivalent to @intCast)
            var up0: @Vector(2, u8) = [_]u8{ 0x55, 0xaa };
            _ = &up0;
-            const up1 = @as(@Vector(2, u16), up0);
-            const up2 = @as(@Vector(2, u32), up0);
-            const up3 = @as(@Vector(2, u64), up0);
+            const up1: @Vector(2, u16) = up0;
+            const up2: @Vector(2, u32) = up0;
+            const up3: @Vector(2, u64) = up0;
            // Downcast (safety-checked)
            var down0 = up3;
            _ = &down0;
-            const down1 = @as(@Vector(2, u32), @intCast(down0));
-            const down2 = @as(@Vector(2, u16), @intCast(down0));
-            const down3 = @as(@Vector(2, u8), @intCast(down0));
+            const down1: @Vector(2, u32) = @intCast(down0);
+            const down2: @Vector(2, u16) = @intCast(down0);
+            const down3: @Vector(2, u8) = @intCast(down0);

            try expect(mem.eql(u16, &@as([2]u16, up1), &[2]u16{ 0x55, 0xaa }));
            try expect(mem.eql(u32, &@as([2]u32, up2), &[2]u32{ 0x55, 0xaa }));
@ -629,20 +629,10 @@ test "@intCast on vector" {
            try expect(mem.eql(u16, &@as([2]u16, down2), &[2]u16{ 0x55, 0xaa }));
            try expect(mem.eql(u8, &@as([2]u8, down3), &[2]u8{ 0x55, 0xaa }));
        }
-
-        fn doTheTestFloat() !void {
-            var vec: @Vector(2, f32) = @splat(1234.0);
-            _ = &vec;
-            const wider: @Vector(2, f64) = vec;
-            try expect(wider[0] == 1234.0);
-            try expect(wider[1] == 1234.0);
-        }
    };

    try S.doTheTest();
    try comptime S.doTheTest();
-    try S.doTheTestFloat();
-    try comptime S.doTheTestFloat();
 }

 test "@floatCast cast down" {
@ -2340,10 +2330,31 @@ test "@floatCast on vector" {

    const S = struct {
        fn doTheTest() !void {
-            var a: @Vector(3, f64) = .{ 1.5, 2.5, 3.5 };
-            _ = &a;
-            const b: @Vector(3, f32) = @floatCast(a);
-            try expectEqual(@Vector(3, f32){ 1.5, 2.5, 3.5 }, b);
+            {
+                var a: @Vector(2, f64) = .{ 1.5, 2.5 };
+                _ = &a;
+                const b: @Vector(2, f32) = @floatCast(a);
+                try expectEqual(@Vector(2, f32){ 1.5, 2.5 }, b);
+            }
+            {
+                var a: @Vector(2, f32) = .{ 3.25, 4.25 };
+                _ = &a;
+                const b: @Vector(2, f64) = @floatCast(a);
+                try expectEqual(@Vector(2, f64){ 3.25, 4.25 }, b);
+            }
+            {
+                var a: @Vector(2, f32) = .{ 5.75, 6.75 };
+                _ = &a;
+                const b: @Vector(2, f64) = a;
+                try expectEqual(@Vector(2, f64){ 5.75, 6.75 }, b);
+            }
+            {
+                var vec: @Vector(2, f32) = @splat(1234.0);
+                _ = &vec;
+                const wider: @Vector(2, f64) = vec;
+                try expect(wider[0] == 1234.0);
+                try expect(wider[1] == 1234.0);
+            }
        }
    };

@ -2441,6 +2452,7 @@ test "@intFromBool on vector" {
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;

    const S = struct {
        fn doTheTest() !void {
--- a/test/behavior/optional.zig
+++ b/test/behavior/optional.zig
@ -110,44 +110,89 @@ test "nested optional field in struct" {
    try expect(s.x.?.y == 127);
 }

-test "equality compare optional with non-optional" {
+test "equality compare optionals and non-optionals" {
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO

-    try test_cmp_optional_non_optional();
-    try comptime test_cmp_optional_non_optional();
+    const S = struct {
+        fn doTheTest() !void {
+            var five: isize = 5;
+            var ten: isize = 10;
+            var opt_null: ?isize = null;
+            var opt_ten: ?isize = 10;
+            _ = .{ &five, &ten, &opt_null, &opt_ten };
+            try expect(opt_null != five);
+            try expect(opt_null != ten);
+            try expect(opt_ten != five);
+            try expect(opt_ten == ten);
+
+            var opt_int: ?isize = null;
+            try expect(opt_int != five);
+            try expect(opt_int != ten);
+            try expect(opt_int == opt_null);
+            try expect(opt_int != opt_ten);
+
+            opt_int = 10;
+            try expect(opt_int != five);
+            try expect(opt_int == ten);
+            try expect(opt_int != opt_null);
+            try expect(opt_int == opt_ten);
+
+            opt_int = five;
+            try expect(opt_int == five);
+            try expect(opt_int != ten);
+            try expect(opt_int != opt_null);
+            try expect(opt_int != opt_ten);
+
+            // test evaluation is always lexical
+            // ensure that the optional isn't always computed before the non-optional
+            var mutable_state: i32 = 0;
+            _ = blk1: {
+                mutable_state += 1;
+                break :blk1 @as(?f64, 10.0);
+            } != blk2: {
+                try expect(mutable_state == 1);
+                break :blk2 @as(f64, 5.0);
+            };
+            _ = blk1: {
+                mutable_state += 1;
+                break :blk1 @as(f64, 10.0);
+            } != blk2: {
+                try expect(mutable_state == 2);
+                break :blk2 @as(?f64, 5.0);
+            };
+        }
+    };
+
+    try S.doTheTest();
+    try comptime S.doTheTest();
 }

-fn test_cmp_optional_non_optional() !void {
-    var ten: i32 = 10;
-    var opt_ten: ?i32 = 10;
-    var five: i32 = 5;
-    var int_n: ?i32 = null;
+test "compare optionals with modified payloads" {
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest;

-    _ = .{ &ten, &opt_ten, &five, &int_n };
+    var lhs: ?bool = false;
+    const lhs_payload = &lhs.?;
+    var rhs: ?bool = true;
+    const rhs_payload = &rhs.?;
+    try expect(lhs != rhs and !(lhs == rhs));

-    try expect(int_n != ten);
-    try expect(opt_ten == ten);
-    try expect(opt_ten != five);
+    lhs = null;
+    lhs_payload.* = false;
+    rhs = false;
+    try expect(lhs != rhs and !(lhs == rhs));

-    // test evaluation is always lexical
-    // ensure that the optional isn't always computed before the non-optional
-    var mutable_state: i32 = 0;
-    _ = blk1: {
-        mutable_state += 1;
-        break :blk1 @as(?f64, 10.0);
-    } != blk2: {
-        try expect(mutable_state == 1);
-        break :blk2 @as(f64, 5.0);
-    };
-    _ = blk1: {
-        mutable_state += 1;
-        break :blk1 @as(f64, 10.0);
-    } != blk2: {
-        try expect(mutable_state == 2);
-        break :blk2 @as(?f64, 5.0);
-    };
+    lhs = true;
+    rhs = null;
+    rhs_payload.* = true;
+    try expect(lhs != rhs and !(lhs == rhs));
+
+    lhs = null;
+    lhs_payload.* = false;
+    rhs = null;
+    rhs_payload.* = true;
+    try expect(lhs == rhs and !(lhs != rhs));
 }

 test "unwrap function call with optional pointer return value" {
--- a/test/behavior/select.zig
+++ b/test/behavior/select.zig
@ -5,7 +5,6 @@ const expect = std.testing.expect;

 test "@select vectors" {
    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@ -36,11 +35,12 @@ fn selectVectors() !void {

 test "@select arrays" {
    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest;
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) return error.SkipZigTest;

    try comptime selectArrays();
    try selectArrays();
--- a/test/behavior/shuffle.zig
+++ b/test/behavior/shuffle.zig
@ -4,10 +4,11 @@ const mem = std.mem;
 const expect = std.testing.expect;

 test "@shuffle int" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest;

    const S = struct {
        fn doTheTest() !void {
--- a/test/behavior/vector.zig
+++ b/test/behavior/vector.zig
@ -29,7 +29,7 @@ test "vector wrap operators" {
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest;

    const S = struct {
        fn doTheTest() !void {
@ -906,22 +906,26 @@ test "vector @reduce comptime" {
 }

 test "mask parameter of @shuffle is comptime scope" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest;

    const __v4hi = @Vector(4, i16);
-    var v4_a = __v4hi{ 0, 0, 0, 0 };
-    var v4_b = __v4hi{ 0, 0, 0, 0 };
+    var v4_a = __v4hi{ 1, 2, 3, 4 };
+    var v4_b = __v4hi{ 5, 6, 7, 8 };
    _ = .{ &v4_a, &v4_b };
    const shuffled: __v4hi = @shuffle(i16, v4_a, v4_b, @Vector(4, i32){
        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
-        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
-        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
-        std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len),
+        std.zig.c_translation.shuffleVectorIndex(2, @typeInfo(@TypeOf(v4_a)).Vector.len),
+        std.zig.c_translation.shuffleVectorIndex(4, @typeInfo(@TypeOf(v4_a)).Vector.len),
+        std.zig.c_translation.shuffleVectorIndex(6, @typeInfo(@TypeOf(v4_a)).Vector.len),
    });
-    _ = shuffled;
+    try expect(shuffled[0] == 1);
+    try expect(shuffled[1] == 3);
+    try expect(shuffled[2] == 5);
+    try expect(shuffled[3] == 7);
 }

 test "saturating add" {
@ -1177,10 +1181,22 @@ test "@shlWithOverflow" {
 }

 test "alignment of vectors" {
-    try expect(@alignOf(@Vector(2, u8)) == 2);
-    try expect(@alignOf(@Vector(2, u1)) == 1);
-    try expect(@alignOf(@Vector(1, u1)) == 1);
-    try expect(@alignOf(@Vector(2, u16)) == 4);
+    try expect(@alignOf(@Vector(2, u8)) == switch (builtin.zig_backend) {
+        else => 2,
+        .stage2_x86_64 => 16,
+    });
+    try expect(@alignOf(@Vector(2, u1)) == switch (builtin.zig_backend) {
+        else => 1,
+        .stage2_x86_64 => 16,
+    });
+    try expect(@alignOf(@Vector(1, u1)) == switch (builtin.zig_backend) {
+        else => 1,
+        .stage2_x86_64 => 16,
+    });
+    try expect(@alignOf(@Vector(2, u16)) == switch (builtin.zig_backend) {
+        else => 4,
+        .stage2_x86_64 => 16,
+    });
 }

 test "loading the second vector from a slice of vectors" {
@ -1316,10 +1332,10 @@ test "modRem with zero divisor" {

 test "array operands to shuffle are coerced to vectors" {
    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest;

    const mask = [5]i32{ -1, 0, 1, 2, 3 };