diff --git a/lib/std/crypto/aes.zig b/lib/std/crypto/aes.zig
index f5752888fc..5e5ae04b58 100644
--- a/lib/std/crypto/aes.zig
+++ b/lib/std/crypto/aes.zig
@@ -6,7 +6,7 @@
 const has_aesni = std.Target.x86.featureSetHas(builtin.cpu.features, .aes);
 const has_avx = std.Target.x86.featureSetHas(builtin.cpu.features, .avx);
 const has_armaes = std.Target.aarch64.featureSetHas(builtin.cpu.features, .aes);
 // C backend doesn't currently support passing vectors to inline asm.
-const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and has_aesni and has_avx) impl: {
+const impl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and has_aesni and has_avx) impl: {
     break :impl @import("aes/aesni.zig");
 } else if (builtin.cpu.arch == .aarch64 and builtin.zig_backend != .stage2_c and has_armaes) impl: {
diff --git a/lib/std/crypto/blake3.zig b/lib/std/crypto/blake3.zig
index d87211fb1e..585c338417 100644
--- a/lib/std/crypto/blake3.zig
+++ b/lib/std/crypto/blake3.zig
@@ -200,7 +200,7 @@ const CompressGeneric = struct {
     }
 };
 
-const compress = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_x86_64)
+const compress = if (builtin.cpu.arch == .x86_64)
     CompressVectorized.compress
 else
     CompressGeneric.compress;
diff --git a/lib/std/crypto/salsa20.zig b/lib/std/crypto/salsa20.zig
index 7f4c1b0157..c791c6b773 100644
--- a/lib/std/crypto/salsa20.zig
+++ b/lib/std/crypto/salsa20.zig
@@ -302,7 +302,10 @@ fn SalsaNonVecImpl(comptime rounds: comptime_int) type {
     };
 }
 
-const SalsaImpl = if (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_x86_64) SalsaVecImpl else SalsaNonVecImpl;
+const SalsaImpl = if (builtin.cpu.arch == .x86_64)
+    SalsaVecImpl
+else
+    SalsaNonVecImpl;
 
 fn keyToWords(key: [32]u8) [8]u32 {
     var k: [8]u32 = undefined;
diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index 10909cfaec..31884c7381 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -238,7 +238,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                 return;
             },
             // C backend doesn't currently support passing vectors to inline asm.
-            .x86_64 => if (builtin.zig_backend != .stage2_c and builtin.zig_backend != .stage2_x86_64 and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
+            .x86_64 => if (builtin.zig_backend != .stage2_c and comptime std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sha, .avx2 })) {
                 var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
                 var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
                 const s_v = @as(*[16]v4u32, @ptrCast(&s));
diff --git a/lib/std/meta.zig b/lib/std/meta.zig
index e7dd4e5652..17df0650f3 100644
--- a/lib/std/meta.zig
+++ b/lib/std/meta.zig
@@ -1286,5 +1286,6 @@ test "hasUniqueRepresentation" {
     try testing.expect(!hasUniqueRepresentation([]u8));
     try testing.expect(!hasUniqueRepresentation([]const u8));
 
-    try testing.expect(hasUniqueRepresentation(@Vector(4, u16)));
+    try testing.expect(hasUniqueRepresentation(@Vector(std.simd.suggestVectorLength(u8) orelse 1, u8)));
+    try testing.expect(@sizeOf(@Vector(3, u8)) == 3 or !hasUniqueRepresentation(@Vector(3, u8)));
 }
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 224b0b3801..b2067c4f8f 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -239,18 +239,19 @@ pub fn utf8ValidateSlice(input: []const u8) bool {
 fn utf8ValidateSliceImpl(input: []const u8, comptime surrogates: Surrogates) bool {
     var remaining = input;
 
-    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
-    const Chunk = @Vector(chunk_len, u8);
+    if (std.simd.suggestVectorLength(u8)) |chunk_len| {
+        const Chunk = @Vector(chunk_len, u8);
 
-    // Fast path. Check for and skip ASCII characters at the start of the input.
-    while (remaining.len >= chunk_len) {
-        const chunk: Chunk = remaining[0..chunk_len].*;
-        const mask: Chunk = @splat(0x80);
-        if (@reduce(.Or, chunk & mask == mask)) {
-            // found a non ASCII byte
-            break;
+        // Fast path. Check for and skip ASCII characters at the start of the input.
+        while (remaining.len >= chunk_len) {
+            const chunk: Chunk = remaining[0..chunk_len].*;
+            const mask: Chunk = @splat(0x80);
+            if (@reduce(.Or, chunk & mask == mask)) {
+                // found a non ASCII byte
+                break;
+            }
+            remaining = remaining[chunk_len..];
         }
-        remaining = remaining[chunk_len..];
     }
 
     // default lowest and highest continuation byte
@@ -601,9 +602,9 @@ fn testUtf8IteratorOnAscii() !void {
     const s = Utf8View.initComptime("abc");
 
     var it1 = s.iterator();
-    try testing.expect(std.mem.eql(u8, "a", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "b", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "c", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "a", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "b", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "c", it1.nextCodepointSlice().?));
     try testing.expect(it1.nextCodepointSlice() == null);
 
     var it2 = s.iterator();
@@ -631,9 +632,9 @@ fn testUtf8ViewOk() !void {
     const s = Utf8View.initComptime("東京市");
 
     var it1 = s.iterator();
-    try testing.expect(std.mem.eql(u8, "東", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "京", it1.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "市", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "東", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "京", it1.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "市", it1.nextCodepointSlice().?));
     try testing.expect(it1.nextCodepointSlice() == null);
 
     var it2 = s.iterator();
@@ -771,20 +772,20 @@ fn testUtf8Peeking() !void {
     const s = Utf8View.initComptime("noël");
     var it = s.iterator();
 
-    try testing.expect(std.mem.eql(u8, "n", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "n", it.nextCodepointSlice().?));
 
-    try testing.expect(std.mem.eql(u8, "o", it.peek(1)));
-    try testing.expect(std.mem.eql(u8, "oë", it.peek(2)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(3)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(4)));
-    try testing.expect(std.mem.eql(u8, "oël", it.peek(10)));
+    try testing.expect(mem.eql(u8, "o", it.peek(1)));
+    try testing.expect(mem.eql(u8, "oë", it.peek(2)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(3)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(4)));
+    try testing.expect(mem.eql(u8, "oël", it.peek(10)));
 
-    try testing.expect(std.mem.eql(u8, "o", it.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "ë", it.nextCodepointSlice().?));
-    try testing.expect(std.mem.eql(u8, "l", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "o", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "ë", it.nextCodepointSlice().?));
+    try testing.expect(mem.eql(u8, "l", it.nextCodepointSlice().?));
     try testing.expect(it.nextCodepointSlice() == null);
 
-    try testing.expect(std.mem.eql(u8, &[_]u8{}, it.peek(1)));
+    try testing.expect(mem.eql(u8, &[_]u8{}, it.peek(1)));
 }
 
 fn testError(bytes: []const u8, expected_err: anyerror) !void {
@@ -926,59 +927,50 @@ test "fmtUtf8" {
 }
 
 fn utf16LeToUtf8ArrayListImpl(
-    array_list: *std.ArrayList(u8),
+    result: *std.ArrayList(u8),
     utf16le: []const u16,
     comptime surrogates: Surrogates,
 ) (switch (surrogates) {
     .cannot_encode_surrogate_half => Utf16LeToUtf8AllocError,
     .can_encode_surrogate_half => mem.Allocator.Error,
 })!void {
-    // optimistically guess that it will all be ascii.
-    try array_list.ensureTotalCapacityPrecise(utf16le.len);
+    assert(result.capacity >= utf16le.len);
 
     var remaining = utf16le;
-    if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
         while (remaining.len >= chunk_len) {
             const chunk: Chunk = remaining[0..chunk_len].*;
-            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
+            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
             if (@reduce(.Or, chunk | mask != mask)) {
                 // found a non ASCII code unit
                 break;
             }
-            const chunk_byte_len = chunk_len * 2;
-            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
-            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
-            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
+            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
             // We allocated enough space to encode every UTF-16 code unit
             // as ASCII, so if the entire string is ASCII then we are
             // guaranteed to have enough space allocated
-            array_list.appendSliceAssumeCapacity(&ascii_bytes);
+            result.addManyAsArrayAssumeCapacity(chunk_len).* = ascii_chunk;
             remaining = remaining[chunk_len..];
         }
     }
 
-    var out_index: usize = array_list.items.len;
     switch (surrogates) {
         .cannot_encode_surrogate_half => {
             var it = Utf16LeIterator.init(remaining);
             while (try it.nextCodepoint()) |codepoint| {
                 const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
-                try array_list.resize(array_list.items.len + utf8_len);
-                assert((utf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
-                out_index += utf8_len;
+                assert((utf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
             }
         },
         .can_encode_surrogate_half => {
            var it = Wtf16LeIterator.init(remaining);
            while (it.nextCodepoint()) |codepoint| {
                 const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable;
-                try array_list.resize(array_list.items.len + utf8_len);
-                assert((wtf8Encode(codepoint, array_list.items[out_index..]) catch unreachable) == utf8_len);
-                out_index += utf8_len;
+                assert((wtf8Encode(codepoint, try result.addManyAsSlice(utf8_len)) catch unreachable) == utf8_len);
             }
         },
     }
@@ -986,8 +978,9 @@ fn utf16LeToUtf8ArrayListImpl(
 
 pub const Utf16LeToUtf8AllocError = mem.Allocator.Error || Utf16LeToUtf8Error;
 
-pub fn utf16LeToUtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
-    return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .cannot_encode_surrogate_half);
+pub fn utf16LeToUtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) Utf16LeToUtf8AllocError!void {
+    try result.ensureTotalCapacityPrecise(utf16le.len);
+    return utf16LeToUtf8ArrayListImpl(result, utf16le, .cannot_encode_surrogate_half);
 }
 
 /// Deprecated; renamed to utf16LeToUtf8Alloc
@@ -999,8 +992,7 @@ pub fn utf16LeToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) Utf16L
     var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayList(&result, utf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
     return result.toOwnedSlice();
 }
 
@@ -1013,8 +1005,7 @@ pub fn utf16LeToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) Utf16
     var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1);
     errdefer result.deinit();
 
-    try utf16LeToUtf8ArrayList(&result, utf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, utf16le, .cannot_encode_surrogate_half);
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1026,27 +1017,24 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
     .cannot_encode_surrogate_half => Utf16LeToUtf8Error,
     .can_encode_surrogate_half => error{},
 })!usize {
-    var end_index: usize = 0;
+    var dest_index: usize = 0;
 
     var remaining = utf16le;
-    if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
         while (remaining.len >= chunk_len) {
             const chunk: Chunk = remaining[0..chunk_len].*;
-            const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F));
+            const mask: Chunk = @splat(mem.nativeToLittle(u16, 0x7F));
             if (@reduce(.Or, chunk | mask != mask)) {
                 // found a non ASCII code unit
                 break;
             }
-            const chunk_byte_len = chunk_len * 2;
-            const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*;
-            const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes);
-            const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0];
-            @memcpy(utf8[end_index .. end_index + chunk_len], &ascii_bytes);
-            end_index += chunk_len;
+            const ascii_chunk: @Vector(chunk_len, u8) = @truncate(mem.nativeToLittle(Chunk, chunk));
+            utf8[dest_index..][0..chunk_len].* = ascii_chunk;
+            dest_index += chunk_len;
             remaining = remaining[chunk_len..];
         }
     }
@@ -1055,7 +1043,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
         .cannot_encode_surrogate_half => {
             var it = Utf16LeIterator.init(remaining);
             while (try it.nextCodepoint()) |codepoint| {
-                end_index += utf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {
+                dest_index += utf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
                     // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
                     // which is within the valid codepoint range.
                     error.CodepointTooLarge => unreachable,
@@ -1068,7 +1056,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
         .can_encode_surrogate_half => {
             var it = Wtf16LeIterator.init(remaining);
             while (it.nextCodepoint()) |codepoint| {
-                end_index += wtf8Encode(codepoint, utf8[end_index..]) catch |err| switch (err) {
+                dest_index += wtf8Encode(codepoint, utf8[dest_index..]) catch |err| switch (err) {
                     // The maximum possible codepoint encoded by UTF-16 is U+10FFFF,
                     // which is within the valid codepoint range.
                     error.CodepointTooLarge => unreachable,
@@ -1076,7 +1064,7 @@ fn utf16LeToUtf8Impl(utf8: []u8, utf16le: []const u16, comptime surrogates: Surr
             }
         },
     }
-    return end_index;
+    return dest_index;
 }
 
 /// Deprecated; renamed to utf16LeToUtf8
@@ -1149,14 +1137,12 @@ test utf16LeToUtf8 {
     }
 }
 
-fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
-    // optimistically guess that it will not require surrogate pairs
-    try array_list.ensureTotalCapacityPrecise(utf8.len);
+fn utf8ToUtf16LeArrayListImpl(result: *std.ArrayList(u16), utf8: []const u8, comptime surrogates: Surrogates) !void {
+    assert(result.capacity >= utf8.len);
 
     var remaining = utf8;
-    // Need support for std.simd.interlace
-    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u8);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -1167,9 +1153,8 @@ fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8,
                 // found a non ASCII code unit
                 break;
             }
-            const zeroes: Chunk = @splat(0);
-            const utf16_chunk: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
-            array_list.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk));
+            const utf16_chunk = mem.nativeToLittle(@Vector(chunk_len, u16), chunk);
+            result.addManyAsArrayAssumeCapacity(chunk_len).* = utf16_chunk;
             remaining = remaining[chunk_len..];
         }
     }
@@ -1181,21 +1166,18 @@ fn utf8ToUtf16LeArrayListImpl(array_list: *std.ArrayList(u16), utf8: []const u8,
     var it = view.iterator();
     while (it.nextCodepoint()) |codepoint| {
         if (codepoint < 0x10000) {
-            const short = @as(u16, @intCast(codepoint));
-            try array_list.append(mem.nativeToLittle(u16, short));
+            try result.append(mem.nativeToLittle(u16, @intCast(codepoint)));
         } else {
             const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
             const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
-            var out: [2]u16 = undefined;
-            out[0] = mem.nativeToLittle(u16, high);
-            out[1] = mem.nativeToLittle(u16, low);
-            try array_list.appendSlice(out[0..]);
+            try result.appendSlice(&.{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) });
         }
     }
 }
 
-pub fn utf8ToUtf16LeArrayList(array_list: *std.ArrayList(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
-    return utf8ToUtf16LeArrayListImpl(array_list, utf8, .cannot_encode_surrogate_half);
+pub fn utf8ToUtf16LeArrayList(result: *std.ArrayList(u16), utf8: []const u8) error{ InvalidUtf8, OutOfMemory }!void {
+    try result.ensureTotalCapacityPrecise(utf8.len);
+    return utf8ToUtf16LeArrayListImpl(result, utf8, .cannot_encode_surrogate_half);
 }
 
 pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) error{ InvalidUtf8, OutOfMemory }![]u16 {
@@ -1204,7 +1186,6 @@ pub fn utf8ToUtf16LeAlloc(allocator: mem.Allocator, utf8: []const u8) error{ Inv
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, utf8, .cannot_encode_surrogate_half);
-
     return result.toOwnedSlice();
 }
 
@@ -1217,7 +1198,6 @@ pub fn utf8ToUtf16LeAllocZ(allocator: mem.Allocator, utf8: []const u8) error{ In
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, utf8, .cannot_encode_surrogate_half);
-
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1228,12 +1208,11 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) error{InvalidUtf8}!usize
 }
 
 pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates: Surrogates) !usize {
-    var dest_i: usize = 0;
+    var dest_index: usize = 0;
 
     var remaining = utf8;
-    // Need support for std.simd.interlace
-    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
+    vectorized: {
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse break :vectorized;
         const Chunk = @Vector(chunk_len, u8);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -1244,57 +1223,60 @@ pub fn utf8ToUtf16LeImpl(utf16le: []u16, utf8: []const u8, comptime surrogates:
                 // found a non ASCII code unit
                 break;
             }
-            const zeroes: Chunk = @splat(0);
-            const utf16_bytes: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes });
-            @memcpy(utf16le[dest_i..][0..chunk_len], std.mem.bytesAsSlice(u16, &utf16_bytes));
-            dest_i += chunk_len;
+            const utf16_chunk = mem.nativeToLittle(@Vector(chunk_len, u16), chunk);
+            utf16le[dest_index..][0..chunk_len].* = utf16_chunk;
+            dest_index += chunk_len;
             remaining = remaining[chunk_len..];
         }
     }
 
-    var src_i: usize = 0;
-    while (src_i < remaining.len) {
-        const n = utf8ByteSequenceLength(remaining[src_i]) catch return switch (surrogates) {
-            .cannot_encode_surrogate_half => error.InvalidUtf8,
-            .can_encode_surrogate_half => error.InvalidWtf8,
-        };
-        const next_src_i = src_i + n;
-        const codepoint = switch (surrogates) {
-            .cannot_encode_surrogate_half => utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8,
-            .can_encode_surrogate_half => wtf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidWtf8,
-        };
+    const view = switch (surrogates) {
+        .cannot_encode_surrogate_half => try Utf8View.init(remaining),
+        .can_encode_surrogate_half => try Wtf8View.init(remaining),
+    };
+    var it = view.iterator();
+    while (it.nextCodepoint()) |codepoint| {
         if (codepoint < 0x10000) {
-            const short = @as(u16, @intCast(codepoint));
-            utf16le[dest_i] = mem.nativeToLittle(u16, short);
-            dest_i += 1;
+            utf16le[dest_index] = mem.nativeToLittle(u16, @intCast(codepoint));
+            dest_index += 1;
         } else {
             const high = @as(u16, @intCast((codepoint - 0x10000) >> 10)) + 0xD800;
             const low = @as(u16, @intCast(codepoint & 0x3FF)) + 0xDC00;
-            utf16le[dest_i] = mem.nativeToLittle(u16, high);
-            utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
-            dest_i += 2;
+            utf16le[dest_index..][0..2].* = .{ mem.nativeToLittle(u16, high), mem.nativeToLittle(u16, low) };
+            dest_index += 2;
         }
-        src_i = next_src_i;
     }
-    return dest_i;
+    return dest_index;
 }
 
 test "utf8ToUtf16Le" {
-    var utf16le: [2]u16 = [_]u16{0} ** 2;
+    var utf16le: [128]u16 = undefined;
     {
         const length = try utf8ToUtf16Le(utf16le[0..], "𐐷");
-        try testing.expectEqual(@as(usize, 2), length);
-        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..]));
+        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16le[0..length]));
     }
     {
         const length = try utf8ToUtf16Le(utf16le[0..], "\u{10FFFF}");
-        try testing.expectEqual(@as(usize, 2), length);
-        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..]));
+        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16le[0..length]));
     }
     {
         const result = utf8ToUtf16Le(utf16le[0..], "\xf4\x90\x80\x80");
         try testing.expectError(error.InvalidUtf8, result);
     }
+    {
+        const length = try utf8ToUtf16Le(utf16le[0..], "This string has been designed to test the vectorized implementat" ++
+            "ion by beginning with one hundred twenty-seven ASCII characters¡");
+        try testing.expectEqualSlices(u8, &.{
+            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ', 0,
+            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o', 0,
+            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r', 0,
+            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't', 0,
+            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g', 0,
+            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e', 0,
+            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A', 0,
+            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
+        }, mem.sliceAsBytes(utf16le[0..length]));
+    }
 }
 
 test utf8ToUtf16LeArrayList {
@@ -1339,25 +1321,40 @@ test utf8ToUtf16LeAllocZ {
     {
         const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "𐐷");
         defer testing.allocator.free(utf16);
-        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16[0..]));
+        try testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", mem.sliceAsBytes(utf16));
         try testing.expect(utf16[2] == 0);
     }
     {
         const utf16 = try utf8ToUtf16LeAllocZ(testing.allocator, "\u{10FFFF}");
         defer testing.allocator.free(utf16);
-        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16[0..]));
+        try testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", mem.sliceAsBytes(utf16));
         try testing.expect(utf16[2] == 0);
     }
     {
         const result = utf8ToUtf16LeAllocZ(testing.allocator, "\xf4\x90\x80\x80");
         try testing.expectError(error.InvalidUtf8, result);
     }
+    {
+        const utf16 = try utf8ToUtf16LeWithNull(testing.allocator, "This string has been designed to test the vectorized implementat" ++
+            "ion by beginning with one hundred twenty-seven ASCII characters¡");
+        defer testing.allocator.free(utf16);
+        try testing.expectEqualSlices(u8, &.{
+            'T', 0, 'h', 0, 'i', 0, 's', 0, ' ', 0, 's', 0, 't', 0, 'r', 0, 'i', 0, 'n', 0, 'g', 0, ' ', 0, 'h', 0, 'a', 0, 's', 0, ' ', 0,
+            'b', 0, 'e', 0, 'e', 0, 'n', 0, ' ', 0, 'd', 0, 'e', 0, 's', 0, 'i', 0, 'g', 0, 'n', 0, 'e', 0, 'd', 0, ' ', 0, 't', 0, 'o', 0,
+            ' ', 0, 't', 0, 'e', 0, 's', 0, 't', 0, ' ', 0, 't', 0, 'h', 0, 'e', 0, ' ', 0, 'v', 0, 'e', 0, 'c', 0, 't', 0, 'o', 0, 'r', 0,
+            'i', 0, 'z', 0, 'e', 0, 'd', 0, ' ', 0, 'i', 0, 'm', 0, 'p', 0, 'l', 0, 'e', 0, 'm', 0, 'e', 0, 'n', 0, 't', 0, 'a', 0, 't', 0,
+            'i', 0, 'o', 0, 'n', 0, ' ', 0, 'b', 0, 'y', 0, ' ', 0, 'b', 0, 'e', 0, 'g', 0, 'i', 0, 'n', 0, 'n', 0, 'i', 0, 'n', 0, 'g', 0,
+            ' ', 0, 'w', 0, 'i', 0, 't', 0, 'h', 0, ' ', 0, 'o', 0, 'n', 0, 'e', 0, ' ', 0, 'h', 0, 'u', 0, 'n', 0, 'd', 0, 'r', 0, 'e', 0,
+            'd', 0, ' ', 0, 't', 0, 'w', 0, 'e', 0, 'n', 0, 't', 0, 'y', 0, '-', 0, 's', 0, 'e', 0, 'v', 0, 'e', 0, 'n', 0, ' ', 0, 'A', 0,
+            'S', 0, 'C', 0, 'I', 0, 'I', 0, ' ', 0, 'c', 0, 'h', 0, 'a', 0, 'r', 0, 'a', 0, 'c', 0, 't', 0, 'e', 0, 'r', 0, 's', 0, '¡', 0,
+        }, mem.sliceAsBytes(utf16));
+    }
 }
 
 /// Converts a UTF-8 string literal into a UTF-16LE string literal.
-pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch unreachable:0]u16 {
+pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch |err| @compileError(err):0]u16 {
     return comptime blk: {
-        const len: usize = calcUtf16LeLen(utf8) catch |err| @compileError(err);
+        const len: usize = calcUtf16LeLen(utf8) catch unreachable;
         var utf16le: [len:0]u16 = [_:0]u16{0} ** len;
         const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err);
         assert(len == utf16le_len);
@@ -1438,12 +1435,12 @@ test "fmtUtf16Le" {
     try expectFmt("", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral(""))});
     try expectFmt("foo", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("foo"))});
     try expectFmt("𐐷", "{}", .{fmtUtf16Le(utf8ToUtf16LeStringLiteral("𐐷"))});
-    try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xd7", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xd8", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdb", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xdc", native_endian)})});
-    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\xff\xdf", native_endian)})});
-    try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{std.mem.readInt(u16, "\x00\xe0", native_endian)})});
+    try expectFmt("퟿", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xd7", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xd8", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdb", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xdc", native_endian)})});
+    try expectFmt("�", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\xff\xdf", native_endian)})});
+    try expectFmt("", "{}", .{fmtUtf16Le(&[_]u16{mem.readInt(u16, "\x00\xe0", native_endian)})});
 }
 
 test "utf8ToUtf16LeStringLiteral" {
@@ -1686,8 +1683,9 @@ pub const Wtf8Iterator = struct {
     }
 };
 
-pub fn wtf16LeToWtf8ArrayList(array_list: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
-    return utf16LeToUtf8ArrayListImpl(array_list, utf16le, .can_encode_surrogate_half);
+pub fn wtf16LeToWtf8ArrayList(result: *std.ArrayList(u8), utf16le: []const u16) mem.Allocator.Error!void {
+    try result.ensureTotalCapacityPrecise(utf16le.len);
+    return utf16LeToUtf8ArrayListImpl(result, utf16le, .can_encode_surrogate_half);
 }
 
 /// Caller must free returned memory.
@@ -1696,8 +1694,7 @@ pub fn wtf16LeToWtf8Alloc(allocator: mem.Allocator, wtf16le: []const u16) mem.Al
     var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len);
     errdefer result.deinit();
 
-    try wtf16LeToWtf8ArrayList(&result, wtf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
    return result.toOwnedSlice();
 }
 
@@ -1707,8 +1704,7 @@ pub fn wtf16LeToWtf8AllocZ(allocator: mem.Allocator, wtf16le: []const u16) mem.A
     var result = try std.ArrayList(u8).initCapacity(allocator, wtf16le.len + 1);
     errdefer result.deinit();
 
-    try wtf16LeToWtf8ArrayList(&result, wtf16le);
-
+    try utf16LeToUtf8ArrayListImpl(&result, wtf16le, .can_encode_surrogate_half);
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1716,8 +1712,9 @@ pub fn wtf16LeToWtf8(wtf8: []u8, wtf16le: []const u16) usize {
     return utf16LeToUtf8Impl(wtf8, wtf16le, .can_encode_surrogate_half) catch |err| switch (err) {};
 }
 
-pub fn wtf8ToWtf16LeArrayList(array_list: *std.ArrayList(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
-    return utf8ToUtf16LeArrayListImpl(array_list, wtf8, .can_encode_surrogate_half);
+pub fn wtf8ToWtf16LeArrayList(result: *std.ArrayList(u16), wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }!void {
+    try result.ensureTotalCapacityPrecise(wtf8.len);
+    return utf8ToUtf16LeArrayListImpl(result, wtf8, .can_encode_surrogate_half);
 }
 
 pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) error{ InvalidWtf8, OutOfMemory }![]u16 {
@@ -1726,7 +1723,6 @@ pub fn wtf8ToWtf16LeAlloc(allocator: mem.Allocator, wtf8: []const u8) error{ Inv
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, wtf8, .can_encode_surrogate_half);
-
     return result.toOwnedSlice();
 }
 
@@ -1736,7 +1732,6 @@ pub fn wtf8ToWtf16LeAllocZ(allocator: mem.Allocator, wtf8: []const u8) error{ In
     errdefer result.deinit();
 
     try utf8ToUtf16LeArrayListImpl(&result, wtf8, .can_encode_surrogate_half);
-
     return result.toOwnedSliceSentinel(0);
 }
 
@@ -1895,7 +1890,7 @@ pub const Wtf16LeIterator = struct {
 
     pub fn init(s: []const u16) Wtf16LeIterator {
         return Wtf16LeIterator{
-            .bytes = std.mem.sliceAsBytes(s),
+            .bytes = mem.sliceAsBytes(s),
             .i = 0,
         };
     }
@@ -1908,12 +1903,12 @@ pub const Wtf16LeIterator = struct {
         assert(it.i <= it.bytes.len);
         if (it.i == it.bytes.len) return null;
         var code_units: [2]u16 = undefined;
-        code_units[0] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
+        code_units[0] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
         it.i += 2;
         surrogate_pair: {
             if (utf16IsHighSurrogate(code_units[0])) {
                 if (it.i >= it.bytes.len) break :surrogate_pair;
-                code_units[1] = std.mem.readInt(u16, it.bytes[it.i..][0..2], .little);
+                code_units[1] = mem.readInt(u16, it.bytes[it.i..][0..2], .little);
                 const codepoint = utf16DecodeSurrogatePair(&code_units) catch break :surrogate_pair;
                 it.i += 2;
                 return codepoint;
@@ -2030,31 +2025,31 @@ fn testRoundtripWtf16(wtf16le: []const u16) !void {
 
 test "well-formed WTF-16 roundtrips" {
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD83D), // high surrogate
-        std.mem.nativeToLittle(u16, 0xDCA9), // low surrogate
+        mem.nativeToLittle(u16, 0xD83D), // high surrogate
+        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD83D), // high surrogate
-        std.mem.nativeToLittle(u16, ' '), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDCA9), // low surrogate
+        mem.nativeToLittle(u16, 0xD83D), // high surrogate
+        mem.nativeToLittle(u16, ' '), // not surrogate
+        mem.nativeToLittle(u16, 0xDCA9), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD800), // high surrogate
-        std.mem.nativeToLittle(u16, 0xDBFF), // high surrogate
+        mem.nativeToLittle(u16, 0xD800), // high surrogate
+        mem.nativeToLittle(u16, 0xDBFF), // high surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD800), // high surrogate
-        std.mem.nativeToLittle(u16, 0xE000), // not surrogate
+        mem.nativeToLittle(u16, 0xD800), // high surrogate
+        mem.nativeToLittle(u16, 0xE000), // not surrogate
    });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xD7FF), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0xD7FF), // not surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0x61), // not surrogate
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0x61), // not surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
     });
     try testRoundtripWtf16(&[_]u16{
-        std.mem.nativeToLittle(u16, 0xDC00), // low surrogate
+        mem.nativeToLittle(u16, 0xDC00), // low surrogate
     });
 }
diff --git a/lib/std/zig/c_translation.zig b/lib/std/zig/c_translation.zig
index dfa888e94b..337149e97d 100644
--- a/lib/std/zig/c_translation.zig
+++ b/lib/std/zig/c_translation.zig
@@ -308,14 +308,12 @@ test "promoteIntLiteral" {
 
 /// Convert from clang __builtin_shufflevector index to Zig @shuffle index
 /// clang requires __builtin_shufflevector index arguments to be integer constants.
-/// negative values for `this_index` indicate "don't care" so we arbitrarily choose 0
+/// negative values for `this_index` indicate "don't care".
 /// clang enforces that `this_index` is less than the total number of vector elements
 /// See https://ziglang.org/documentation/master/#shuffle
 /// See https://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-shufflevector
 pub fn shuffleVectorIndex(comptime this_index: c_int, comptime source_vector_len: usize) i32 {
-    if (this_index <= 0) return 0;
-
-    const positive_index = @as(usize, @intCast(this_index));
+    const positive_index = std.math.cast(usize, this_index) orelse return undefined;
     if (positive_index < source_vector_len) return @as(i32, @intCast(this_index));
     const b_index = positive_index - source_vector_len;
     return ~@as(i32, @intCast(b_index));
@@ -324,7 +322,7 @@ pub fn shuffleVectorIndex(comptime this_index: c_int, comptime source_vector_len
 test "shuffleVectorIndex" {
     const vector_len: usize = 4;
 
-    try testing.expect(shuffleVectorIndex(-1, vector_len) == 0);
+    _ = shuffleVectorIndex(-1, vector_len);
 
     try testing.expect(shuffleVectorIndex(0, vector_len) == 0);
     try testing.expect(shuffleVectorIndex(1, vector_len) == 1);
diff --git a/src/InternPool.zig b/src/InternPool.zig
index a9f2f68d4f..46676097bf 100644
--- a/src/InternPool.zig
+++ b/src/InternPool.zig
@@ -3587,6 +3587,7 @@ pub const Alignment = enum(u6) {
     @"8" = 3,
     @"16" = 4,
     @"32" = 5,
+    @"64" = 6,
     none = std.math.maxInt(u6),
     _,
 
@@ -7403,10 +7404,14 @@ pub fn isIntegerType(ip: *const InternPool, ty: Index) bool {
         .c_ulong_type,
         .c_longlong_type,
         .c_ulonglong_type,
-        .c_longdouble_type,
         .comptime_int_type,
         => true,
-        else => ip.indexToKey(ty) == .int_type,
+        else => switch (ip.items.items(.tag)[@intFromEnum(ty)]) {
+            .type_int_signed,
+            .type_int_unsigned,
+            => true,
+            else => false,
+        },
     };
 }
diff --git a/src/Sema.zig b/src/Sema.zig
index 309e968482..bfc9aa3adc 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -23315,7 +23315,8 @@ fn checkVectorElemType(
     const mod = sema.mod;
     switch (ty.zigTypeTag(mod)) {
         .Int, .Float, .Bool => return,
-        else => if (ty.isPtrAtRuntime(mod)) return,
+        .Optional, .Pointer => if (ty.isPtrAtRuntime(mod)) return,
+        else => {},
     }
     return sema.fail(block, ty_src, "expected integer, float, bool, or pointer for the vector element type; found '{}'", .{ty.fmt(mod)});
 }
@@ -28442,7 +28443,7 @@ const CoerceOpts = struct {
     report_err: bool = true,
     /// Ignored if `report_err == false`.
     is_ret: bool = false,
-    /// Should coercion to comptime_int ermit an error message.
+    /// Should coercion to comptime_int emit an error message.
     no_cast_to_comptime_int: bool = false,
 
     param_src: struct {
@@ -31845,6 +31846,34 @@ fn coerceArrayLike(
     }
     const dest_elem_ty = dest_ty.childType(mod);
 
+    if (dest_ty.isVector(mod) and inst_ty.isVector(mod) and (try sema.resolveValue(inst)) == null) {
+        const inst_elem_ty = inst_ty.childType(mod);
+        switch (dest_elem_ty.zigTypeTag(mod)) {
+            .Int => if (inst_elem_ty.isInt(mod)) {
+                // integer widening
+                const dst_info = dest_elem_ty.intInfo(mod);
+                const src_info = inst_elem_ty.intInfo(mod);
+                if ((src_info.signedness == dst_info.signedness and dst_info.bits >= src_info.bits) or
+                    // small enough unsigned ints can get casted to large enough signed ints
+                    (dst_info.signedness == .signed and dst_info.bits > src_info.bits))
+                {
+                    try sema.requireRuntimeBlock(block, inst_src, null);
+                    return block.addTyOp(.intcast, dest_ty, inst);
+                }
+            },
+            .Float => if (inst_elem_ty.isRuntimeFloat()) {
+                // float widening
+                const src_bits = inst_elem_ty.floatBits(target);
+                const dst_bits = dest_elem_ty.floatBits(target);
+                if (dst_bits >= src_bits) {
+                    try sema.requireRuntimeBlock(block, inst_src, null);
+                    return block.addTyOp(.fpext, dest_ty, inst);
+                }
+            },
+            else => {},
+        }
+    }
+
     const element_vals = try sema.arena.alloc(InternPool.Index, dest_len);
     const element_refs = try sema.arena.alloc(Air.Inst.Ref, dest_len);
     var runtime_src: ?LazySrcLoc = null;
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index f9a291f40b..4ca2ae44bb 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -1547,6 +1547,27 @@ fn asmRegisterRegisterMemory(
     });
 }
 
+fn asmRegisterRegisterMemoryRegister(
+    self: *Self,
+    tag: Mir.Inst.FixedTag,
+    reg1: Register,
+    reg2: Register,
+    m: Memory,
+    reg3: Register,
+) !void {
+    _ = try self.addInst(.{
+        .tag = tag[1],
+        .ops = .rrmr,
+        .data = .{ .rrrx = .{
+            .fixes = tag[0],
+            .r1 = reg1,
+            .r2 = reg2,
+            .r3 = reg3,
+            .payload = try self.addExtra(Mir.Memory.encode(m)),
+        } },
+    });
+}
+
 fn asmMemory(self: *Self, tag: Mir.Inst.FixedTag, m: Memory) !void {
     _ = try self.addInst(.{
         .tag = tag[1],
@@ -1570,6 +1591,25 @@ fn asmRegisterMemory(self: *Self, tag: Mir.Inst.FixedTag, reg: Register, m: Memo
     });
 }
 
+fn asmRegisterMemoryRegister(
+    self: *Self,
+    tag: Mir.Inst.FixedTag,
+    reg1: Register,
+    m: Memory,
+    reg2: Register,
+) !void {
+    _ = try self.addInst(.{
+        .tag = tag[1],
+        .ops = .rmr,
+        .data = .{ .rrx = .{
+            .fixes = tag[0],
+            .r1 = reg1,
+            .r2 = reg2,
+            .payload = try self.addExtra(Mir.Memory.encode(m)),
+        } },
+    });
+}
+
 fn asmRegisterMemoryImmediate(
     self: *Self,
     tag: Mir.Inst.FixedTag,
@@ -2570,7 +2610,8 @@ fn restoreState(self: *Self, state: State, deaths: []const Air.Inst.Index, compt
     const ExpectedContents = [@typeInfo(RegisterManager.TrackedRegisters).Array.len]RegisterLock;
     var stack align(@max(@alignOf(ExpectedContents), @alignOf(std.heap.StackFallbackAllocator(0)))) =
-        if (opts.update_tracking) ({}) else std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
+        if (opts.update_tracking)
+    {} else std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa);
 
     var reg_locks = if (opts.update_tracking) {} else try std.ArrayList(RegisterLock).initCapacity(
         stack.get(),
@@ -2812,11 +2853,14 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
 }
 
 fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
+    const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
     const dst_ty = self.typeOfIndex(inst);
-    const dst_bits = dst_ty.floatBits(self.target.*);
+    const dst_scalar_ty = dst_ty.scalarType(mod);
+    const dst_bits = dst_scalar_ty.floatBits(self.target.*);
     const src_ty = self.typeOf(ty_op.operand);
-    const src_bits = src_ty.floatBits(self.target.*);
+    const src_scalar_ty = src_ty.scalarType(mod);
+    const src_bits = src_scalar_ty.floatBits(self.target.*);
 
     const result = result: {
         if (switch (src_bits) {
@@ -2840,94 +2884,290 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
             },
             else => unreachable,
         }) {
+            if (dst_ty.isVector(mod)) break :result null;
             var callee_buf: ["__extend?f?f2".len]u8 = undefined;
             break :result try self.genCall(.{ .lib = .{
-                .return_type = self.floatCompilerRtAbiType(dst_ty, src_ty).toIntern(),
-                .param_types = &.{self.floatCompilerRtAbiType(src_ty, dst_ty).toIntern()},
+                .return_type = self.floatCompilerRtAbiType(dst_scalar_ty, src_scalar_ty).toIntern(),
+                .param_types = &.{self.floatCompilerRtAbiType(src_scalar_ty, dst_scalar_ty).toIntern()},
                 .callee = std.fmt.bufPrint(&callee_buf, "__extend{c}f{c}f2", .{
                     floatCompilerRtAbiName(src_bits),
                     floatCompilerRtAbiName(dst_bits),
                 }) catch unreachable,
-            } }, &.{src_ty}, &.{.{ .air_ref = ty_op.operand }});
+            } }, &.{src_scalar_ty}, &.{.{ .air_ref = ty_op.operand }});
         }
 
+        const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
         const src_mcv = try self.resolveInst(ty_op.operand);
         const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
             src_mcv
         else
            try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv);
-        const dst_reg = dst_mcv.getReg().?.to128();
+        const dst_reg = dst_mcv.getReg().?;
+        const dst_alias = registerAlias(dst_reg, @intCast(@max(dst_ty.abiSize(mod), 16)));
         const dst_lock = self.register_manager.lockReg(dst_reg);
         defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
+        const vec_len = if (dst_ty.isVector(mod)) dst_ty.vectorLen(mod) else 1;
         if (src_bits == 16) {
             assert(self.hasFeature(.f16c));
             const mat_src_reg = if (src_mcv.isRegister())
                 src_mcv.getReg().?
             else
                 try self.copyToTmpRegister(src_ty, src_mcv);
-            try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, mat_src_reg.to128());
+            try self.asmRegisterRegister(
+                .{ .v_ps, .cvtph2 },
+                dst_alias,
+                registerAlias(mat_src_reg, src_abi_size),
+            );
             switch (dst_bits) {
                 32 => {},
                 64 => try self.asmRegisterRegisterRegister(
                     .{ .v_sd, .cvtss2 },
-                    dst_reg,
-                    dst_reg,
-                    dst_reg,
+                    dst_alias,
+                    dst_alias,
+                    dst_alias,
                 ),
                 else => unreachable,
             }
         } else {
             assert(src_bits == 32 and dst_bits == 64);
-            if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
-                .{ .v_sd, .cvtss2 },
-                dst_reg,
-                dst_reg,
-                try src_mcv.mem(self, .dword),
-            ) else try self.asmRegisterRegisterRegister(
-                .{ .v_sd, .cvtss2 },
-                dst_reg,
-                dst_reg,
-                (if (src_mcv.isRegister())
-                    src_mcv.getReg().?
-                else
-                    try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
-            ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
-                .{ ._sd, .cvtss2 },
-                dst_reg,
-                try src_mcv.mem(self, .dword),
+            if (self.hasFeature(.avx)) switch (vec_len) {
+                1 => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
+                    .{ .v_sd, .cvtss2 },
+                    dst_alias,
+                    dst_alias,
+                    try src_mcv.mem(self, self.memSize(src_ty)),
+                ) else try self.asmRegisterRegisterRegister(
+                    .{ .v_sd, .cvtss2 },
+                    dst_alias,
+                    dst_alias,
+                    registerAlias(if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
+                ),
+                2...4 => if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                    .{ .v_pd, .cvtps2 },
+                    dst_alias,
+                    try src_mcv.mem(self, self.memSize(src_ty)),
+                ) else try self.asmRegisterRegister(
+                    .{ .v_pd, .cvtps2 },
+                    dst_alias,
+                    registerAlias(if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
+                ),
+                else => break :result null,
+            } else if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                switch (vec_len) {
+                    1 => .{ ._sd, .cvtss2 },
+                    2 => .{ ._pd, .cvtps2 },
+                    else => break :result null,
+                },
+                dst_alias,
+                try src_mcv.mem(self, self.memSize(src_ty)),
             ) else try self.asmRegisterRegister(
-                .{ ._sd, .cvtss2 },
-                dst_reg,
-                (if (src_mcv.isRegister())
+                switch (vec_len) {
+                    1 => .{ ._sd, .cvtss2 },
+                    2 => .{ ._pd, .cvtps2 },
+                    else => break :result null,
+                },
+                dst_alias,
+                registerAlias(if (src_mcv.isRegister())
                     src_mcv.getReg().?
                 else
-                    try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+                    try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
             );
         }
 
         break :result dst_mcv;
-    };
+    } orelse return self.fail("TODO implement airFpext from {} to {}", .{
+        src_ty.fmt(mod), dst_ty.fmt(mod),
+    });
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
 fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
     const mod = self.bin_file.comp.module.?;
     const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op;
-    const result: MCValue = result: {
-        const src_ty = self.typeOf(ty_op.operand);
+    const src_ty = self.typeOf(ty_op.operand);
+    const dst_ty = self.typeOfIndex(inst);
+
+    const result = @as(?MCValue, result: {
+        const dst_abi_size: u32 = @intCast(dst_ty.abiSize(mod));
+
         const src_int_info = src_ty.intInfo(mod);
-
-        const dst_ty = self.typeOfIndex(inst);
         const dst_int_info = dst_ty.intInfo(mod);
-        const abi_size: u32 = @intCast(dst_ty.abiSize(mod));
-
-        const min_ty = if (dst_int_info.bits < src_int_info.bits) dst_ty else src_ty;
         const extend = switch (src_int_info.signedness) {
             .signed => dst_int_info,
             .unsigned => src_int_info,
         }.signedness;
 
         const src_mcv = try self.resolveInst(ty_op.operand);
+        if (dst_ty.isVector(mod)) {
+            const src_abi_size: u32 = @intCast(src_ty.abiSize(mod));
+            const max_abi_size = @max(dst_abi_size, src_abi_size);
+            if (max_abi_size > @as(u32, if (self.hasFeature(.avx2)) 32 else 16)) break :result null;
+            const has_avx = self.hasFeature(.avx);
+
+            const dst_elem_abi_size = dst_ty.childType(mod).abiSize(mod);
+            const src_elem_abi_size = src_ty.childType(mod).abiSize(mod);
+            switch (math.order(dst_elem_abi_size, src_elem_abi_size)) {
+                .lt => {
+                    const mir_tag: Mir.Inst.FixedTag = switch (dst_elem_abi_size) {
+                        else => break :result null,
+                        1 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            2 => switch (dst_int_info.signedness) {
+                                .signed => if (has_avx) .{ .vp_b, .ackssw } else .{ .p_b, .ackssw },
+                                .unsigned => if (has_avx) .{ .vp_b, .ackusw } else .{ .p_b, .ackusw },
+                            },
+                        },
+                        2 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            4 => switch (dst_int_info.signedness) {
+                                .signed => if (has_avx) .{ .vp_w, .ackssd } else .{ .p_w, .ackssd },
+                                .unsigned => if (has_avx)
+                                    .{ .vp_w, .ackusd }
+                                else if (self.hasFeature(.sse4_1))
+                                    .{ .p_w, .ackusd }
+                                else
+                                    break :result null,
+                            },
+                        },
+                    };
+
+                    const dst_mcv: MCValue = if (src_mcv.isRegister() and
+                        self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                        src_mcv
+                    else if (has_avx and src_mcv.isRegister())
+                        .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) }
+                    else
+                        try self.copyToRegisterWithInstTracking(inst, src_ty, src_mcv);
+                    const dst_reg = dst_mcv.getReg().?;
+                    const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+                    if (has_avx) try self.asmRegisterRegisterRegister(
+                        mir_tag,
+                        dst_alias,
+                        registerAlias(if (src_mcv.isRegister())
+                            src_mcv.getReg().?
+                        else
+                            dst_reg, src_abi_size),
+                        dst_alias,
+                    ) else try self.asmRegisterRegister(
+                        mir_tag,
+                        dst_alias,
+                        dst_alias,
+                    );
+                    break :result dst_mcv;
+                },
+                .eq => if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                    break :result src_mcv
+                else {
+                    const dst_mcv = try self.allocRegOrMem(inst, true);
+                    try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
+                    break :result dst_mcv;
+                },
+                .gt => if (self.hasFeature(.sse4_1)) {
+                    const mir_tag: Mir.Inst.FixedTag = .{ switch (dst_elem_abi_size) {
+                        else => break :result null,
+                        2 => if (has_avx) .vp_w else .p_w,
+                        4 => if (has_avx) .vp_d else .p_d,
+                        8 => if (has_avx) .vp_q else .p_q,
+                    }, switch (src_elem_abi_size) {
+                        else => break :result null,
+                        1 => switch (extend) {
+                            .signed => .movsxb,
+                            .unsigned => .movzxb,
+                        },
+                        2 => switch (extend) {
+                            .signed => .movsxw,
+                            .unsigned => .movzxw,
+                        },
+                        4 => switch (extend) {
+                            .signed => .movsxd,
+                            .unsigned => .movzxd,
+                        },
+                    } };
+
+                    const dst_mcv: MCValue = if (src_mcv.isRegister() and
+                        self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                        src_mcv
+                    else
+                        .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) };
+                    const dst_reg = dst_mcv.getReg().?;
+                    const dst_alias = registerAlias(dst_reg, dst_abi_size);
+
+                    if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                        mir_tag,
+                        dst_alias,
+                        try src_mcv.mem(self, self.memSize(src_ty)),
+                    ) else try self.asmRegisterRegister(
+                        mir_tag,
+                        dst_alias,
+                        registerAlias(if (src_mcv.isRegister())
+                            src_mcv.getReg().?
+                        else
+                            try self.copyToTmpRegister(src_ty, src_mcv), src_abi_size),
+                    );
+                    break :result dst_mcv;
+                } else {
+                    const mir_tag: Mir.Inst.FixedTag = switch (dst_elem_abi_size) {
+                        else => break :result null,
+                        2 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            1 => .{ .p_, .unpcklbw },
+                        },
+                        4 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            2 => .{ .p_, .unpcklwd },
+                        },
+                        8 => switch (src_elem_abi_size) {
+                            else => break :result null,
+                            2 => .{ .p_, .unpckldq },
+                        },
+                    };
+
+                    const dst_mcv: MCValue = if (src_mcv.isRegister() and
+                        self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
+                        src_mcv
+                    else
+                        try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv);
+                    const dst_reg = dst_mcv.getReg().?;
+
+                    const ext_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                    const ext_alias = registerAlias(ext_reg, src_abi_size);
+                    const ext_lock = self.register_manager.lockRegAssumeUnused(ext_reg);
+                    defer self.register_manager.unlockReg(ext_lock);
+
+                    try self.asmRegisterRegister(.{ .p_, .xor }, ext_alias, ext_alias);
+                    switch (extend) {
+                        .signed => try self.asmRegisterRegister(
+                            .{ switch (src_elem_abi_size) {
+                                else => unreachable,
+                                1 => .p_b,
+                                2 => .p_w,
+                                4 => .p_d,
+                            }, .cmpgt },
+                            ext_alias,
+                            registerAlias(dst_reg, src_abi_size),
+                        ),
+                        .unsigned => {},
+                    }
+                    try self.asmRegisterRegister(
+                        mir_tag,
+                        registerAlias(dst_reg, dst_abi_size),
+                        registerAlias(ext_reg, dst_abi_size),
+                    );
+                    break :result dst_mcv;
+                },
+            }
+            @compileError("unreachable");
+        }
+
+        const min_ty = if (dst_int_info.bits < src_int_info.bits) dst_ty else src_ty;
+
         const src_storage_bits: u16 = switch (src_mcv) {
             .register, .register_offset => 64,
             .register_pair => 128,
@@ -2945,13 +3185,13 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
         };
 
         if (dst_int_info.bits <= src_int_info.bits) break :result if (dst_mcv.isRegister())
-            .{ .register = registerAlias(dst_mcv.getReg().?, abi_size) }
+            .{ .register = registerAlias(dst_mcv.getReg().?, dst_abi_size) }
         else
             dst_mcv;
 
         if (dst_mcv.isRegister()) {
             try self.truncateRegister(src_ty, dst_mcv.getReg().?);
-            break :result .{ .register = registerAlias(dst_mcv.getReg().?, abi_size) };
+            break :result .{ .register = registerAlias(dst_mcv.getReg().?, dst_abi_size) };
         }
 
         const src_limbs_len = math.divCeil(u16, src_int_info.bits, 64) catch unreachable;
@@ -2999,7 +3239,9 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void {
         );
 
         break :result dst_mcv;
-    };
+    }) orelse return self.fail("TODO implement airIntCast from {} to {}", .{
+        src_ty.fmt(mod), dst_ty.fmt(mod),
+    });
     return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
 }
 
@@ -3022,7 +3264,7 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
         src_mcv
     else if (dst_abi_size <= 8)
         try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv)
-    else if (dst_abi_size <= 16) dst: {
+    else if (dst_abi_size <= 16 and !dst_ty.isVector(mod)) dst: {
         const dst_regs =
             try self.register_manager.allocRegs(2, .{ inst, inst }, abi.RegisterClass.gp);
         const dst_mcv: MCValue = .{ .register_pair = dst_regs };
@@ -3032,26 +3274,29 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
         try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
         break :dst dst_mcv;
     } else dst: {
-        const dst_mcv = try self.allocRegOrMem(inst, true);
-        try self.genCopy(dst_ty, dst_mcv, src_mcv, .{});
+        const dst_mcv = try self.allocRegOrMemAdvanced(src_ty, inst, true);
+        try self.genCopy(src_ty, dst_mcv, src_mcv, .{});
         break :dst dst_mcv;
     };
 
     if (dst_ty.zigTypeTag(mod) == .Vector) {
         assert(src_ty.zigTypeTag(mod) == .Vector and dst_ty.vectorLen(mod) == src_ty.vectorLen(mod));
-        const dst_info = dst_ty.childType(mod).intInfo(mod);
-        const src_info = src_ty.childType(mod).intInfo(mod);
-        const mir_tag = @as(?Mir.Inst.FixedTag, switch (dst_info.bits) {
-            8 => switch (src_info.bits) {
-                16 => switch (dst_ty.vectorLen(mod)) {
+        const dst_elem_ty = dst_ty.childType(mod);
+        const dst_elem_abi_size: u32 = @intCast(dst_elem_ty.abiSize(mod));
+        const src_elem_ty = src_ty.childType(mod);
+        const src_elem_abi_size: u32 = @intCast(src_elem_ty.abiSize(mod));
+
+        const mir_tag = @as(?Mir.Inst.FixedTag, switch (dst_elem_abi_size) {
+            1 => switch (src_elem_abi_size) {
+                2 => switch (dst_ty.vectorLen(mod)) {
                     1...8 => if (self.hasFeature(.avx)) .{ .vp_b, .ackusw } else .{ .p_b, .ackusw },
                     9...16 => if (self.hasFeature(.avx2)) .{ .vp_b, .ackusw } else null,
                     else => null,
                 },
                 else => null,
             },
-            16 => switch (src_info.bits) {
-                32 => switch (dst_ty.vectorLen(mod)) {
+            2 => switch (src_elem_abi_size) {
+                4 => switch (dst_ty.vectorLen(mod)) {
                     1...4 => if (self.hasFeature(.avx))
                         .{ .vp_w, .ackusd }
                     else if (self.hasFeature(.sse4_1))
@@ -3066,12 +3311,14 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
             else => null,
         }) orelse return self.fail("TODO implement airTrunc for {}", .{dst_ty.fmt(mod)});
 
-        const elem_ty = src_ty.childType(mod);
-        const mask_val = try mod.intValue(elem_ty, @as(u64, math.maxInt(u64)) >> @intCast(64 - dst_info.bits));
+        const dst_info = dst_elem_ty.intInfo(mod);
+        const src_info = src_elem_ty.intInfo(mod);
+
+        const mask_val = try mod.intValue(src_elem_ty, @as(u64, math.maxInt(u64)) >> @intCast(64 - dst_info.bits));
 
         const splat_ty = try mod.vectorType(.{
             .len = @intCast(@divExact(@as(u64, if (src_abi_size > 16) 256 else 128), src_info.bits)),
-            .child = elem_ty.ip_index,
+            .child = src_elem_ty.ip_index,
         });
         const splat_abi_size: u32 = @intCast(splat_ty.abiSize(mod));
 
@@ -3086,22 +3333,40 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void {
             else => .{ .register = try self.copyToTmpRegister(Type.usize, splat_mcv.address()) },
         };
 
-        const dst_reg = registerAlias(dst_mcv.getReg().?, src_abi_size);
+        const dst_reg = dst_mcv.getReg().?;
+        const dst_alias = registerAlias(dst_reg, src_abi_size);
         if (self.hasFeature(.avx)) {
             try self.asmRegisterRegisterMemory(
                 .{ .vp_, .@"and" },
-                dst_reg,
-                dst_reg,
+                dst_alias,
+                dst_alias,
                 try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
             );
-            try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg);
+            if (src_abi_size > 16) {
+                const temp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse);
+                const temp_lock = self.register_manager.lockRegAssumeUnused(temp_reg);
+                defer self.register_manager.unlockReg(temp_lock);
+
+                try self.asmRegisterRegisterImmediate(
+                    .{ if (self.hasFeature(.avx2)) .v_i128 else .v_f128, .extract },
+                    registerAlias(temp_reg, dst_abi_size),
+                    dst_alias,
+                    Immediate.u(1),
+                );
+                try self.asmRegisterRegisterRegister(
+                    mir_tag,
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(dst_reg, dst_abi_size),
+                    registerAlias(temp_reg, dst_abi_size),
+                );
+            } else try self.asmRegisterRegisterRegister(mir_tag, dst_alias, dst_alias, dst_alias);
         } else {
             try self.asmRegisterMemory(
                 .{ .p_, .@"and" },
-                dst_reg,
+                dst_alias,
                 try splat_addr_mcv.deref().mem(self, Memory.Size.fromSize(splat_abi_size)),
             );
-            try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg);
+            try self.asmRegisterRegister(mir_tag, dst_alias, dst_alias);
         }
         break :result dst_mcv;
     }
@@ -4045,7 +4310,7 @@ fn airMulWithOverflow(self: *Self, inst: Air.Inst.Index) !void {
         if (dst_info.bits > 128 and dst_info.signedness == .unsigned) {
             const slow_inc = self.hasFeature(.slow_incdec);
             const abi_size: u32 = @intCast(dst_ty.abiSize(mod));
-            const limb_len = std.math.divCeil(u32, abi_size, 8) catch unreachable;
+            const limb_len = math.divCeil(u32, abi_size, 8) catch unreachable;
 
             try self.spillRegisters(&.{ .rax, .rcx, .rdx });
             const reg_locks = self.register_manager.lockRegsAssumeUnused(3, .{ .rax, .rcx, .rdx });
@@ -4534,7 +4799,7 @@ fn airShlShrBinOp(self: *Self, inst: Air.Inst.Index) !void {
     switch (lhs_ty.zigTypeTag(mod)) {
         .Int => {
             try self.spillRegisters(&.{.rcx});
-            try self.register_manager.getReg(.rcx, null);
+            try self.register_manager.getKnownReg(.rcx, null);
 
             const lhs_mcv = try self.resolveInst(bin_op.lhs);
             const rhs_mcv = try self.resolveInst(bin_op.rhs);
@@ -6560,7 +6825,7 @@ fn floatSign(self: *Self, inst: Air.Inst.Index, operand: Air.Inst.Ref, ty: Type)
 
             const dst_mcv: MCValue = .{ .register = .st0 };
             if (!std.meta.eql(src_mcv, dst_mcv) or !self.reuseOperand(inst, operand, 0, src_mcv))
-                try self.register_manager.getReg(.st0, inst);
+                try self.register_manager.getKnownReg(.st0, inst);
             try self.genCopy(ty, dst_mcv, src_mcv, .{});
 
             switch (tag) {
@@ -6894,7 +7159,7 @@ fn airAbs(self: *Self, inst: Air.Inst.Index) !void {
                 },
                 else => {
                     const abi_size: u31 = @intCast(ty.abiSize(mod));
-                    const limb_len = std.math.divCeil(u31, abi_size, 8) catch unreachable;
+                    const limb_len = math.divCeil(u31, abi_size, 8) catch unreachable;
 
                     const tmp_regs =
                         try self.register_manager.allocRegs(3, .{null} ** 3, abi.RegisterClass.gp);
@@ -8181,7 +8446,7 @@ fn genShiftBinOpMir(
                     try self.asmRegisterImmediate(
                         .{ ._, .@"and" },
                         .cl,
-                        Immediate.u(std.math.maxInt(u6)),
+                        Immediate.u(math.maxInt(u6)),
                     );
                     try self.asmRegisterImmediate(
                         .{ ._r, .sh },
@@ -8218,7 +8483,7 @@ fn genShiftBinOpMir(
                     try self.asmRegisterImmediate(
                         .{ ._, .@"and" },
                         .cl,
-                        Immediate.u(std.math.maxInt(u6)),
+                        Immediate.u(math.maxInt(u6)),
                    );
                     try self.asmRegisterImmediate(
                         .{ ._r, .sh },
@@ -8283,7 +8548,7 @@ fn genShiftBinOpMir(
                     }, .sh },
                     temp_regs[2].to64(),
                     temp_regs[3].to64(),
-                    Immediate.u(shift_imm & std.math.maxInt(u6)),
+                    Immediate.u(shift_imm & math.maxInt(u6)),
                 ),
                 else => try self.asmRegisterRegisterRegister(.{ switch (tag[0]) {
                     ._l => ._ld,
@@ -8338,7 +8603,7 @@ fn genShiftBinOpMir(
                 .immediate => |shift_imm| try self.asmRegisterImmediate(
                     tag,
                     temp_regs[2].to64(),
-                    Immediate.u(shift_imm & std.math.maxInt(u6)),
+                    Immediate.u(shift_imm & math.maxInt(u6)),
                 ),
                 else => try self.asmRegisterRegister(tag, temp_regs[2].to64(), .cl),
             }
@@ -8794,7 +9059,7 @@ fn genShiftBinOp(
             lhs_ty.fmt(mod),
         });
 
-    try self.register_manager.getReg(.rcx, null);
+    try self.register_manager.getKnownReg(.rcx, null);
     const rcx_lock = self.register_manager.lockReg(.rcx);
     defer if (rcx_lock) |lock| self.register_manager.unlockReg(lock);
 
@@ -8933,7 +9198,7 @@ fn genMulDivBinOp(
     switch (tag) {
         .mul, .mul_wrap => {
             const slow_inc = self.hasFeature(.slow_incdec);
            const limb_len = std.math.divCeil(u32, src_abi_size, 8) catch unreachable;
-            const limb_len = std.math.divCeil(u32, src_abi_size, 8) catch unreachable;
+            const limb_len = math.divCeil(u32, src_abi_size, 8) catch unreachable;
 
             try self.spillRegisters(&.{ .rax, .rcx, .rdx });
             const reg_locks = self.register_manager.lockRegs(3, .{ .rax, .rcx, .rdx });
@@ -9117,8 +9382,8 @@ fn genMulDivBinOp(
         .rem => maybe_inst,
         else => null,
     };
-    try self.register_manager.getReg(.rax, track_inst_rax);
-    try self.register_manager.getReg(.rdx, track_inst_rdx);
+    try self.register_manager.getKnownReg(.rax, track_inst_rax);
+    try self.register_manager.getKnownReg(.rdx, track_inst_rdx);
 
     try self.genIntMulDivOpMir(switch (signedness) {
         .signed => switch (tag) {
@@ -9158,8 +9423,11 @@ fn genMulDivBinOp(
         },
 
         .mod => {
-            try self.register_manager.getReg(.rax, null);
-            try self.register_manager.getReg(.rdx, if (signedness == .unsigned) maybe_inst else null);
+            try self.register_manager.getKnownReg(.rax, null);
+            try self.register_manager.getKnownReg(
+                .rdx,
+                if (signedness == .unsigned) maybe_inst else null,
+            );
 
             switch (signedness) {
                 .signed => {
@@ -9200,8 +9468,11 @@ fn genMulDivBinOp(
         },
 
         .div_floor => {
-            try self.register_manager.getReg(.rax, if (signedness == .unsigned) maybe_inst else null);
-            try self.register_manager.getReg(.rdx, null);
+            try self.register_manager.getKnownReg(
+                .rax,
+                if (signedness == .unsigned) maybe_inst else null,
+            );
+            try self.register_manager.getKnownReg(.rdx, null);
 
             const lhs_lock: ?RegisterLock = switch (lhs_mcv) {
                 .register => |reg| self.register_manager.lockRegAssumeUnused(reg),
@@ -9445,7 +9716,7 @@ fn genBinOp(
         .rem, .mod => unreachable,
         .max, .min => if (lhs_ty.scalarType(mod).isRuntimeFloat()) registerAlias(
             if (!self.hasFeature(.avx) and self.hasFeature(.sse4_1)) mask: {
-                try self.register_manager.getReg(.xmm0, null);
+                try self.register_manager.getKnownReg(.xmm0, null);
                 break :mask .xmm0;
             } else try self.register_manager.allocReg(null, abi.RegisterClass.sse),
             abi_size,
@@ -10820,96 +11091,35 @@ fn genBinOp(
                     lhs_copy_reg.?,
                     mask_reg,
                 ) else {
-                    try self.asmRegisterRegister(
-                        @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                                32 => .{ ._ps, .@"and" },
-                                64 => .{ ._pd, .@"and" },
+                    const mir_fixes = @as(?Mir.Inst.Fixes, switch (lhs_ty.zigTypeTag(mod)) {
+                        .Float => switch (lhs_ty.floatBits(self.target.*)) {
+                            32 => ._ps,
+                            64 => ._pd,
+                            16, 80, 128 => null,
+                            else => unreachable,
+                        },
+                        .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
+                            .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
+                                32 => switch (lhs_ty.vectorLen(mod)) {
+                                    1...4 => ._ps,
+                                    else => null,
+                                },
+                                64 => switch (lhs_ty.vectorLen(mod)) {
+                                    1...2 => ._pd,
+                                    else => null,
+                                },
                                 16, 80, 128 => null,
                                 else => unreachable,
                             },
-                            .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                                .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                    32 => switch (lhs_ty.vectorLen(mod)) {
-                                        1...4 => .{ ._ps, .@"and" },
-                                        else => null,
-                                    },
-                                    64 => switch (lhs_ty.vectorLen(mod)) {
-                                        1...2 => .{ ._pd, .@"and" },
-                                        else => null,
-                                    },
-                                    16, 80, 128 => null,
-                                    else => unreachable,
-                                },
-                                else => unreachable,
-                            },
                             else => unreachable,
-                        }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                            @tagName(air_tag), lhs_ty.fmt(mod),
-                        }),
-                        dst_reg,
-                        mask_reg,
-                    );
-                    try self.asmRegisterRegister(
-                        @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                                32 => .{ ._ps, .andn },
-                                64 => .{ ._pd, .andn },
-                                16, 80, 128 => null,
-                                else => unreachable,
-                            },
-                            .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                                .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                    32 => switch (lhs_ty.vectorLen(mod)) {
-                                        1...4 => .{ ._ps, .andn },
-                                        else => null,
-                                    },
-                                    64 => switch (lhs_ty.vectorLen(mod)) {
-                                        1...2 => .{ ._pd, .andn },
-                                        else => null,
-                                    },
-                                    16, 80, 128 => null,
-                                    else => unreachable,
-                                },
-                                else => unreachable,
-                            },
-                            else => unreachable,
-                        }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                            @tagName(air_tag), lhs_ty.fmt(mod),
-                        }),
-                        mask_reg,
-                        lhs_copy_reg.?,
-                    );
-                    try self.asmRegisterRegister(
-                        @as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag(mod)) {
-                            .Float => switch (lhs_ty.floatBits(self.target.*)) {
-                                32 => .{ ._ps, .@"or" },
-                                64 => .{ ._pd, .@"or" },
-                                16, 80, 128 => null,
-                                else => unreachable,
-                            },
-                            .Vector => switch (lhs_ty.childType(mod).zigTypeTag(mod)) {
-                                .Float => switch (lhs_ty.childType(mod).floatBits(self.target.*)) {
-                                    32 => switch (lhs_ty.vectorLen(mod)) {
-                                        1...4 => .{ ._ps, .@"or" },
-                                        else => null,
-                                    },
-                                    64 => switch (lhs_ty.vectorLen(mod)) {
-                                        1...2 => .{ ._pd, .@"or" },
-                                        else => null,
-                                    },
-                                    16, 80, 128 => null,
-                                    else => unreachable,
-                                },
-                                else => unreachable,
-                            },
-                            else => unreachable,
-                        }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
-                            @tagName(air_tag), lhs_ty.fmt(mod),
-                        }),
-                        dst_reg,
-                        mask_reg,
-                    );
+                        },
+                        else => unreachable,
+                    }) orelse return self.fail("TODO implement genBinOp for {s} {}", .{
+                        @tagName(air_tag), lhs_ty.fmt(mod),
+                    });
+                    try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_reg, mask_reg);
+                    try self.asmRegisterRegister(.{ mir_fixes, .andn }, mask_reg, lhs_copy_reg.?);
+                    try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_reg, mask_reg);
                 }
             },
             .cmp_lt, .cmp_lte, .cmp_eq, .cmp_gte, .cmp_gt, .cmp_neq => {
@@ -12192,9 +12402,36 @@ fn airRetLoad(self: *Self, inst: Air.Inst.Index) !void {
 
 fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
     const mod = self.bin_file.comp.module.?;
     const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op;
-    const ty = self.typeOf(bin_op.lhs);
+    var ty = self.typeOf(bin_op.lhs);
+    var null_compare: ?Mir.Inst.Index = null;
 
     const result: Condition = result: {
+        try self.spillEflagsIfOccupied();
+
+        const lhs_mcv = try self.resolveInst(bin_op.lhs);
+        const lhs_locks: [2]?RegisterLock = switch (lhs_mcv) {
+            .register => |lhs_reg| .{ self.register_manager.lockRegAssumeUnused(lhs_reg), null },
+            .register_pair => |lhs_regs| locks: {
+                const locks = self.register_manager.lockRegsAssumeUnused(2, lhs_regs);
+                break :locks .{ locks[0], locks[1] };
+            },
+            .register_offset => |lhs_ro| .{
+                self.register_manager.lockRegAssumeUnused(lhs_ro.reg),
+                null,
+            },
+            else => .{null} ** 2,
+        };
+        defer for (lhs_locks) |lhs_lock| if (lhs_lock) |lock| self.register_manager.unlockReg(lock);
+
+        const rhs_mcv = try self.resolveInst(bin_op.rhs);
+        const rhs_locks: [2]?RegisterLock = switch (rhs_mcv) {
+            .register => |rhs_reg| .{ self.register_manager.lockReg(rhs_reg), null },
+            .register_pair => |rhs_regs| self.register_manager.lockRegs(2, rhs_regs),
+            .register_offset => |rhs_ro| .{ self.register_manager.lockReg(rhs_ro.reg), null },
+            else => .{null} ** 2,
+        };
+        defer for (rhs_locks) |rhs_lock| if (rhs_lock) |lock| self.register_manager.unlockReg(lock);
+
         switch (ty.zigTypeTag(mod)) {
             .Float => {
                 const float_bits = ty.floatBits(self.target.*);
@@ -12231,35 +12468,67 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
                     };
                 }
             },
+            .Optional => if (!ty.optionalReprIsPayload(mod)) {
+                const opt_ty = ty;
+                const opt_abi_size: u31 = @intCast(opt_ty.abiSize(mod));
+                ty = opt_ty.optionalChild(mod);
+                const payload_abi_size: u31 = @intCast(ty.abiSize(mod));
+
+                const temp_lhs_reg = try self.register_manager.allocReg(null, abi.RegisterClass.gp);
+                const temp_lhs_lock = self.register_manager.lockRegAssumeUnused(temp_lhs_reg);
+                defer self.register_manager.unlockReg(temp_lhs_lock);
+
if (lhs_mcv.isMemory()) try self.asmRegisterMemory( + .{ ._, .mov }, + temp_lhs_reg.to8(), + try lhs_mcv.address().offset(payload_abi_size).deref().mem(self, .byte), + ) else { + try self.genSetReg(temp_lhs_reg, opt_ty, lhs_mcv, .{}); + try self.asmRegisterImmediate( + .{ ._r, .sh }, + registerAlias(temp_lhs_reg, opt_abi_size), + Immediate.u(payload_abi_size * 8), + ); + } + + const payload_compare = payload_compare: { + if (rhs_mcv.isMemory()) { + const rhs_mem = + try rhs_mcv.address().offset(payload_abi_size).deref().mem(self, .byte); + try self.asmMemoryRegister(.{ ._, .@"test" }, rhs_mem, temp_lhs_reg.to8()); + const payload_compare = try self.asmJccReloc(.nz, undefined); + try self.asmRegisterMemory(.{ ._, .cmp }, temp_lhs_reg.to8(), rhs_mem); + break :payload_compare payload_compare; + } + + const temp_rhs_reg = try self.copyToTmpRegister(opt_ty, rhs_mcv); + const temp_rhs_lock = self.register_manager.lockRegAssumeUnused(temp_rhs_reg); + defer self.register_manager.unlockReg(temp_rhs_lock); + + try self.asmRegisterImmediate( + .{ ._r, .sh }, + registerAlias(temp_rhs_reg, opt_abi_size), + Immediate.u(payload_abi_size * 8), + ); + try self.asmRegisterRegister( + .{ ._, .@"test" }, + temp_lhs_reg.to8(), + temp_rhs_reg.to8(), + ); + const payload_compare = try self.asmJccReloc(.nz, undefined); + try self.asmRegisterRegister( + .{ ._, .cmp }, + temp_lhs_reg.to8(), + temp_rhs_reg.to8(), + ); + break :payload_compare payload_compare; + }; + null_compare = try self.asmJmpReloc(undefined); + self.performReloc(payload_compare); + }, else => {}, } - try self.spillEflagsIfOccupied(); - - const lhs_mcv = try self.resolveInst(bin_op.lhs); - const lhs_locks: [2]?RegisterLock = switch (lhs_mcv) { - .register => |lhs_reg| .{ self.register_manager.lockRegAssumeUnused(lhs_reg), null }, - .register_pair => |lhs_regs| locks: { - const locks = self.register_manager.lockRegsAssumeUnused(2, lhs_regs); - break :locks .{ locks[0], locks[1] }; - }, - .register_offset => |lhs_ro| .{ - self.register_manager.lockRegAssumeUnused(lhs_ro.reg), - null, - }, - else => .{null} ** 2, - }; - defer for (lhs_locks) |lhs_lock| if (lhs_lock) |lock| self.register_manager.unlockReg(lock); - - const rhs_mcv = try self.resolveInst(bin_op.rhs); - const rhs_locks: [2]?RegisterLock = switch (rhs_mcv) { - .register => |rhs_reg| .{ self.register_manager.lockReg(rhs_reg), null }, - .register_pair => |rhs_regs| self.register_manager.lockRegs(2, rhs_regs), - .register_offset => |rhs_ro| .{ self.register_manager.lockReg(rhs_ro.reg), null }, - else => .{null} ** 2, - }; - defer for (rhs_locks) |rhs_lock| if (rhs_lock) |lock| self.register_manager.unlockReg(lock); - switch (ty.zigTypeTag(mod)) { else => { const abi_size: u16 = @intCast(ty.abiSize(mod)); @@ -12571,6 +12840,7 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void { } }; + if (null_compare) |reloc| self.performReloc(reloc); self.eflags_inst = inst; return self.finishAir(inst, .{ .eflags = result }, .{ bin_op.lhs, bin_op.rhs, .none }); } @@ -13521,6 +13791,7 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void { } else if (constraint.len == 1 and std.ascii.isDigit(constraint[0])) arg: { const index = std.fmt.charToDigit(constraint[0], 10) catch unreachable; if (index >= args.items.len) return self.fail("constraint out of bounds: '{s}'", .{constraint}); + try self.genCopy(ty, args.items[index], input_mcv, .{}); break :arg args.items[index]; } else return self.fail("invalid constraint: '{s}'", .{constraint}); if (arg_mcv.getReg()) |reg| if 
(RegisterManager.indexOfRegIntoTracked(reg)) |_| { @@ -13619,25 +13890,26 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void { label_gop.value_ptr.target = @intCast(self.mir_instructions.len); } else continue; - var mnem_size: ?Memory.Size = null; - const mnem_tag = mnem: { - mnem_size = if (mem.endsWith(u8, mnem_str, "b")) - .byte - else if (mem.endsWith(u8, mnem_str, "w")) - .word - else if (mem.endsWith(u8, mnem_str, "l")) - .dword - else if (mem.endsWith(u8, mnem_str, "q")) - .qword - else if (mem.endsWith(u8, mnem_str, "t")) - .tbyte - else - break :mnem null; - break :mnem std.meta.stringToEnum(Instruction.Mnemonic, mnem_str[0 .. mnem_str.len - 1]); - } orelse mnem: { + var mnem_size: ?Memory.Size = if (mem.endsWith(u8, mnem_str, "b")) + .byte + else if (mem.endsWith(u8, mnem_str, "w")) + .word + else if (mem.endsWith(u8, mnem_str, "l")) + .dword + else if (mem.endsWith(u8, mnem_str, "q") and + (std.mem.indexOfScalar(u8, "vp", mnem_str[0]) == null or !mem.endsWith(u8, mnem_str, "dq"))) + .qword + else if (mem.endsWith(u8, mnem_str, "t")) + .tbyte + else + null; + const mnem_tag = while (true) break std.meta.stringToEnum( + Instruction.Mnemonic, + mnem_str[0 .. mnem_str.len - @intFromBool(mnem_size != null)], + ) orelse if (mnem_size) |_| { mnem_size = null; - break :mnem std.meta.stringToEnum(Instruction.Mnemonic, mnem_str); - } orelse return self.fail("invalid mnemonic: '{s}'", .{mnem_str}); + continue; + } else return self.fail("invalid mnemonic: '{s}'", .{mnem_str}); if (@as(?Memory.Size, switch (mnem_tag) { .clflush => .byte, .fldenv, .fnstenv, .fstenv => .none, @@ -14135,30 +14407,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo else => {}, }, .Int => switch (ty.childType(mod).intInfo(mod).bits) { - 8 => switch (ty.vectorLen(mod)) { - 1 => if (self.hasFeature(.avx)) return .{ .vex_insert_extract = .{ - .insert = .{ .vp_b, .insr }, - .extract = .{ .vp_b, .extr }, - } } else if (self.hasFeature(.sse4_2)) return .{ .insert_extract = .{ - .insert = .{ .p_b, .insr }, - .extract = .{ .p_b, .extr }, - } }, - 2 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ - .insert = .{ .vp_w, .insr }, - .extract = .{ .vp_w, .extr }, - } } else .{ .insert_extract = .{ - .insert = .{ .p_w, .insr }, - .extract = .{ .p_w, .extr }, - } }, - 3...4 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_d, .mov } - else - .{ ._d, .mov } }, - 5...8 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_q, .mov } - else - .{ ._q, .mov } }, - 9...16 => return .{ .move = if (self.hasFeature(.avx)) + 1...8 => switch (ty.vectorLen(mod)) { + 1...16 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 17...32 => if (self.hasFeature(.avx)) @@ -14168,23 +14418,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo .{ .v_, .movdqu } }, else => {}, }, - 16 => switch (ty.vectorLen(mod)) { - 1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ - .insert = .{ .vp_w, .insr }, - .extract = .{ .vp_w, .extr }, - } } else .{ .insert_extract = .{ - .insert = .{ .p_w, .insr }, - .extract = .{ .p_w, .extr }, - } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_d, .mov } - else - .{ ._d, .mov } }, - 3...4 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_q, .mov } - else - .{ ._q, .mov } }, - 5...8 => return .{ .move = if (self.hasFeature(.avx)) + 9...16 => switch (ty.vectorLen(mod)) { + 1...8 => 
return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 9...16 => if (self.hasFeature(.avx)) @@ -14194,16 +14429,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo .{ .v_, .movdqu } }, else => {}, }, - 32 => switch (ty.vectorLen(mod)) { - 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_d, .mov } - else - .{ ._d, .mov } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_q, .mov } - else - .{ ._q, .mov } }, - 3...4 => return .{ .move = if (self.hasFeature(.avx)) + 17...32 => switch (ty.vectorLen(mod)) { + 1...4 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 5...8 => if (self.hasFeature(.avx)) @@ -14213,12 +14440,8 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo .{ .v_, .movdqu } }, else => {}, }, - 64 => switch (ty.vectorLen(mod)) { - 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_q, .mov } - else - .{ ._q, .mov } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) + 33...64 => switch (ty.vectorLen(mod)) { + 1...2 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 3...4 => if (self.hasFeature(.avx)) @@ -14228,7 +14451,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo .{ .v_, .movdqu } }, else => {}, }, - 128 => switch (ty.vectorLen(mod)) { + 65...128 => switch (ty.vectorLen(mod)) { 1 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, @@ -14239,7 +14462,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo .{ .v_, .movdqu } }, else => {}, }, - 256 => switch (ty.vectorLen(mod)) { + 129...256 => switch (ty.vectorLen(mod)) { 1 => if (self.hasFeature(.avx)) return .{ .move = if (aligned) .{ .v_, .movdqa } @@ -14251,11 +14474,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo }, .Pointer, .Optional => if (ty.childType(mod).isPtrAtRuntime(mod)) switch (ty.vectorLen(mod)) { - 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_q, .mov } - else - .{ ._q, .mov } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) + 1...2 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 3...4 => if (self.hasFeature(.avx)) @@ -14269,22 +14488,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo unreachable, .Float => switch (ty.childType(mod).floatBits(self.target.*)) { 16 => switch (ty.vectorLen(mod)) { - 1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ - .insert = .{ .vp_w, .insr }, - .extract = .{ .vp_w, .extr }, - } } else .{ .insert_extract = .{ - .insert = .{ .p_w, .insr }, - .extract = .{ .p_w, .extr }, - } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_d, .mov } - else - .{ ._d, .mov } }, - 3...4 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_q, .mov } - else - .{ ._q, .mov } }, - 5...8 => return .{ .move = if (self.hasFeature(.avx)) + 1...8 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } else if (aligned) .{ ._, .movdqa } else 
.{ ._, .movdqu } }, 9...16 => if (self.hasFeature(.avx)) @@ -14295,15 +14499,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo else => {}, }, 32 => switch (ty.vectorLen(mod)) { - 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_ss, .mov } - else - .{ ._ss, .mov } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } - else - .{ ._sd, .mov } }, - 3...4 => return .{ .move = if (self.hasFeature(.avx)) + 1...4 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, 5...8 => if (self.hasFeature(.avx)) @@ -14314,11 +14510,7 @@ fn moveStrategy(self: *Self, ty: Type, class: Register.Class, aligned: bool) !Mo else => {}, }, 64 => switch (ty.vectorLen(mod)) { - 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } - else - .{ ._sd, .mov } }, - 2 => return .{ .move = if (self.hasFeature(.avx)) + 1...2 => return .{ .move = if (self.hasFeature(.avx)) if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu } else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } }, 3...4 => if (self.hasFeature(.avx)) @@ -14633,7 +14825,7 @@ fn genSetReg( ty, dst_reg.class(), self.getFrameAddrAlignment(frame_addr).compare(.gte, Alignment.fromLog2Units( - std.math.log2_int_ceil(u10, @divExact(dst_reg.bitSize(), 8)), + math.log2_int_ceil(u10, @divExact(dst_reg.bitSize(), 8)), )), ), .lea_frame => .{ .move = .{ ._, .lea } }, @@ -16296,7 +16488,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void { }, 65...128 => switch (vector_len) { else => null, - 1...2 => .{ .vp_i128, .broadcast }, + 1...2 => .{ .v_i128, .broadcast }, }, }) orelse break :avx2; @@ -16310,7 +16502,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void { registerAlias(dst_reg, @intCast(vector_ty.abiSize(mod))), try src_mcv.mem(self, self.memSize(scalar_ty)), ) else { - if (mir_tag[0] == .vp_i128) break :avx2; + if (mir_tag[0] == .v_i128) break :avx2; try self.genSetReg(dst_reg, scalar_ty, src_mcv, .{}); try self.asmRegisterRegister( mir_tag, @@ -16352,7 +16544,7 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void { .{ if (self.hasFeature(.avx)) .vp_w else .p_w, .shufl }, dst_alias, dst_alias, - Immediate.u(0), + Immediate.u(0b00_00_00_00), ); if (switch (scalar_bits) { 1...8 => vector_len > 4, @@ -16563,18 +16755,1158 @@ fn airSplat(self: *Self, inst: Air.Inst.Index) !void { } fn airSelect(self: *Self, inst: Air.Inst.Index) !void { + const mod = self.bin_file.comp.module.?; const pl_op = self.air.instructions.items(.data)[@intFromEnum(inst)].pl_op; const extra = self.air.extraData(Air.Bin, pl_op.payload).data; - _ = extra; - return self.fail("TODO implement airSelect for x86_64", .{}); - //return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); + const ty = self.typeOfIndex(inst); + const vec_len = ty.vectorLen(mod); + const elem_ty = ty.childType(mod); + const elem_abi_size: u32 = @intCast(elem_ty.abiSize(mod)); + const abi_size: u32 = @intCast(ty.abiSize(mod)); + const pred_ty = self.typeOf(pl_op.operand); + + const result = result: { + const has_blend = self.hasFeature(.sse4_1); + const has_avx = self.hasFeature(.avx); + const need_xmm0 = has_blend and !has_avx; + const pred_mcv = try self.resolveInst(pl_op.operand); + const mask_reg = mask: { + switch (pred_mcv) { + .register => |pred_reg| switch (pred_reg.class()) { + .general_purpose => {}, + .sse => if (need_xmm0 and pred_reg.id() != comptime Register.xmm0.id()) { + try 
self.register_manager.getKnownReg(.xmm0, null); + try self.genSetReg(.xmm0, pred_ty, pred_mcv, .{}); + break :mask .xmm0; + } else break :mask if (has_blend) + pred_reg + else + try self.copyToTmpRegister(pred_ty, pred_mcv), + else => unreachable, + }, + else => {}, + } + const mask_reg: Register = if (need_xmm0) mask_reg: { + try self.register_manager.getKnownReg(.xmm0, null); + break :mask_reg .xmm0; + } else try self.register_manager.allocReg(null, abi.RegisterClass.sse); + const mask_alias = registerAlias(mask_reg, abi_size); + const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg); + defer self.register_manager.unlockReg(mask_lock); + + const pred_fits_in_elem = vec_len <= elem_abi_size; + if (self.hasFeature(.avx2) and abi_size <= 32) { + if (pred_mcv.isRegister()) broadcast: { + try self.asmRegisterRegister( + .{ .v_d, .mov }, + mask_reg.to128(), + pred_mcv.getReg().?.to32(), + ); + if (pred_fits_in_elem and vec_len > 1) try self.asmRegisterRegister( + .{ switch (elem_abi_size) { + 1 => .vp_b, + 2 => .vp_w, + 3...4 => .vp_d, + 5...8 => .vp_q, + 9...16 => { + try self.asmRegisterRegisterRegisterImmediate( + .{ .v_f128, .insert }, + mask_alias, + mask_alias, + mask_reg.to128(), + Immediate.u(1), + ); + break :broadcast; + }, + 17...32 => break :broadcast, + else => unreachable, + }, .broadcast }, + mask_alias, + mask_reg.to128(), + ); + } else try self.asmRegisterMemory( + .{ switch (vec_len) { + 1...8 => .vp_b, + 9...16 => .vp_w, + 17...32 => .vp_d, + else => unreachable, + }, .broadcast }, + mask_alias, + if (pred_mcv.isMemory()) try pred_mcv.mem(self, .byte) else .{ + .base = .{ .reg = (try self.copyToTmpRegister( + Type.usize, + pred_mcv.address(), + )).to64() }, + .mod = .{ .rm = .{ .size = .byte } }, + }, + ); + } else if (abi_size <= 16) broadcast: { + try self.asmRegisterRegister( + .{ if (has_avx) .v_d else ._d, .mov }, + mask_alias, + (if (pred_mcv.isRegister()) + pred_mcv.getReg().? + else + try self.copyToTmpRegister(pred_ty, pred_mcv.address())).to32(), + ); + if (!pred_fits_in_elem or vec_len == 1) break :broadcast; + if (elem_abi_size <= 1) { + if (has_avx) try self.asmRegisterRegisterRegister( + .{ .vp_, .unpcklbw }, + mask_alias, + mask_alias, + mask_alias, + ) else try self.asmRegisterRegister( + .{ .p_, .unpcklbw }, + mask_alias, + mask_alias, + ); + if (abi_size <= 2) break :broadcast; + } + if (elem_abi_size <= 2) { + try self.asmRegisterRegisterImmediate( + .{ if (has_avx) .vp_w else .p_w, .shufl }, + mask_alias, + mask_alias, + Immediate.u(0b00_00_00_00), + ); + if (abi_size <= 8) break :broadcast; + } + try self.asmRegisterRegisterImmediate( + .{ if (has_avx) .vp_d else .p_d, .shuf }, + mask_alias, + mask_alias, + Immediate.u(switch (elem_abi_size) { + 1...2, 5...8 => 0b01_00_01_00, + 3...4 => 0b00_00_00_00, + else => unreachable, + }), + ); + } else return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)}); + const elem_bits: u16 = @intCast(elem_abi_size * 8); + const mask_elem_ty = try mod.intType(.unsigned, elem_bits); + const mask_ty = try mod.vectorType(.{ .len = vec_len, .child = mask_elem_ty.toIntern() }); + if (!pred_fits_in_elem) if (self.hasFeature(.ssse3)) { + var mask_elems: [32]InternPool.Index = undefined; + for (mask_elems[0..vec_len], 0..) 
|*elem, bit| elem.* = try mod.intern(.{ .int = .{ + .ty = mask_elem_ty.toIntern(), + .storage = .{ .u64 = bit / elem_bits }, + } }); + const mask_mcv = try self.genTypedValue(.{ + .ty = mask_ty, + .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{ + .ty = mask_ty.toIntern(), + .storage = .{ .elems = mask_elems[0..vec_len] }, + } })), + }); + const mask_mem: Memory = .{ + .base = .{ .reg = try self.copyToTmpRegister(Type.usize, mask_mcv.address()) }, + .mod = .{ .rm = .{ .size = self.memSize(ty) } }, + }; + if (has_avx) try self.asmRegisterRegisterMemory( + .{ .vp_b, .shuf }, + mask_alias, + mask_alias, + mask_mem, + ) else try self.asmRegisterMemory( + .{ .p_b, .shuf }, + mask_alias, + mask_mem, + ); + } else return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)}); + { + var mask_elems: [32]InternPool.Index = undefined; + for (mask_elems[0..vec_len], 0..) |*elem, bit| elem.* = try mod.intern(.{ .int = .{ + .ty = mask_elem_ty.toIntern(), + .storage = .{ .u64 = @as(u32, 1) << @intCast(bit & (elem_bits - 1)) }, + } }); + const mask_mcv = try self.genTypedValue(.{ + .ty = mask_ty, + .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{ + .ty = mask_ty.toIntern(), + .storage = .{ .elems = mask_elems[0..vec_len] }, + } })), + }); + const mask_mem: Memory = .{ + .base = .{ .reg = try self.copyToTmpRegister(Type.usize, mask_mcv.address()) }, + .mod = .{ .rm = .{ .size = self.memSize(ty) } }, + }; + if (has_avx) { + try self.asmRegisterRegisterMemory( + .{ .vp_, .@"and" }, + mask_alias, + mask_alias, + mask_mem, + ); + try self.asmRegisterRegisterMemory( + .{ .vp_d, .cmpeq }, + mask_alias, + mask_alias, + mask_mem, + ); + } else { + try self.asmRegisterMemory( + .{ .p_, .@"and" }, + mask_alias, + mask_mem, + ); + try self.asmRegisterMemory( + .{ .p_d, .cmpeq }, + mask_alias, + mask_mem, + ); + } + } + break :mask mask_reg; + }; + const mask_alias = registerAlias(mask_reg, abi_size); + const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg); + defer self.register_manager.unlockReg(mask_lock); + + const lhs_mcv = try self.resolveInst(extra.lhs); + const lhs_lock = switch (lhs_mcv) { + .register => |lhs_reg| self.register_manager.lockRegAssumeUnused(lhs_reg), + else => null, + }; + defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); + + const rhs_mcv = try self.resolveInst(extra.rhs); + const rhs_lock = switch (rhs_mcv) { + .register => |rhs_reg| self.register_manager.lockReg(rhs_reg), + else => null, + }; + defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); + + const reuse_mcv = if (has_blend) rhs_mcv else lhs_mcv; + const dst_mcv: MCValue = if (reuse_mcv.isRegister() and self.reuseOperand( + inst, + if (has_blend) extra.rhs else extra.lhs, + @intFromBool(has_blend), + reuse_mcv, + )) reuse_mcv else if (has_avx) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, ty, reuse_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_alias = registerAlias(dst_reg, abi_size); + const dst_lock = self.register_manager.lockReg(dst_reg); + defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); + + const mir_tag = @as(?Mir.Inst.FixedTag, switch (ty.childType(mod).zigTypeTag(mod)) { + else => null, + .Int => switch (abi_size) { + 0 => unreachable, + 1...16 => if (has_avx) + .{ .vp_b, .blendv } + else if (has_blend) + .{ .p_b, .blendv } + else + .{ .p_, undefined }, + 17...32 => if (self.hasFeature(.avx2)) + .{ .vp_b, .blendv } + else + null, + else 
=> null, + }, + .Float => switch (ty.childType(mod).floatBits(self.target.*)) { + else => unreachable, + 16, 80, 128 => null, + 32 => switch (vec_len) { + 0 => unreachable, + 1...4 => if (has_avx) .{ .v_ps, .blendv } else .{ ._ps, .blendv }, + 5...8 => if (has_avx) .{ .v_ps, .blendv } else null, + else => null, + }, + 64 => switch (vec_len) { + 0 => unreachable, + 1...2 => if (has_avx) .{ .v_pd, .blendv } else .{ ._pd, .blendv }, + 3...4 => if (has_avx) .{ .v_pd, .blendv } else null, + else => null, + }, + }, + }) orelse return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)}); + if (has_avx) { + const rhs_alias = if (rhs_mcv.isRegister()) + registerAlias(rhs_mcv.getReg().?, abi_size) + else rhs: { + try self.genSetReg(dst_reg, ty, rhs_mcv, .{}); + break :rhs dst_alias; + }; + if (lhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryRegister( + mir_tag, + dst_alias, + rhs_alias, + try lhs_mcv.mem(self, self.memSize(ty)), + mask_alias, + ) else try self.asmRegisterRegisterRegisterRegister( + mir_tag, + dst_alias, + rhs_alias, + registerAlias(if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + try self.copyToTmpRegister(ty, lhs_mcv), abi_size), + mask_alias, + ); + } else if (has_blend) if (lhs_mcv.isMemory()) try self.asmRegisterMemoryRegister( + mir_tag, + dst_alias, + try lhs_mcv.mem(self, self.memSize(ty)), + mask_alias, + ) else try self.asmRegisterRegisterRegister( + mir_tag, + dst_alias, + registerAlias(if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + try self.copyToTmpRegister(ty, lhs_mcv), abi_size), + mask_alias, + ) else { + const mir_fixes = @as(?Mir.Inst.Fixes, switch (elem_ty.zigTypeTag(mod)) { + else => null, + .Int => .p_, + .Float => switch (elem_ty.floatBits(self.target.*)) { + 32 => ._ps, + 64 => ._pd, + 16, 80, 128 => null, + else => unreachable, + }, + }) orelse return self.fail("TODO implement airSelect for {}", .{ty.fmt(mod)}); + try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_alias, mask_alias); + if (rhs_mcv.isMemory()) try self.asmRegisterMemory( + .{ mir_fixes, .andn }, + mask_alias, + try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)), + ) else try self.asmRegisterRegister( + .{ mir_fixes, .andn }, + mask_alias, + if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? 
+ else + try self.copyToTmpRegister(ty, rhs_mcv), + ); + try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_alias, mask_alias); + } + break :result dst_mcv; + }; + return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } fn airShuffle(self: *Self, inst: Air.Inst.Index) !void { + const mod = self.bin_file.comp.module.?; const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - _ = ty_pl; - return self.fail("TODO implement airShuffle for x86_64", .{}); - //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); + const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; + + const dst_ty = self.typeOfIndex(inst); + const elem_ty = dst_ty.childType(mod); + const elem_abi_size: u16 = @intCast(elem_ty.abiSize(mod)); + const dst_abi_size: u32 = @intCast(dst_ty.abiSize(mod)); + const lhs_ty = self.typeOf(extra.a); + const lhs_abi_size: u32 = @intCast(lhs_ty.abiSize(mod)); + const rhs_ty = self.typeOf(extra.b); + const rhs_abi_size: u32 = @intCast(rhs_ty.abiSize(mod)); + const max_abi_size = @max(dst_abi_size, lhs_abi_size, rhs_abi_size); + + const ExpectedContents = [32]?i32; + var stack align(@max(@alignOf(ExpectedContents), @alignOf(std.heap.StackFallbackAllocator(0)))) = + std.heap.stackFallback(@sizeOf(ExpectedContents), self.gpa); + const allocator = stack.get(); + + const mask_elems = try allocator.alloc(?i32, extra.mask_len); + defer allocator.free(mask_elems); + for (mask_elems, 0..) |*mask_elem, elem_index| { + const mask_elem_val = + Value.fromInterned(extra.mask).elemValue(mod, elem_index) catch unreachable; + mask_elem.* = if (mask_elem_val.isUndef(mod)) + null + else + @intCast(mask_elem_val.toSignedInt(mod)); + } + + const has_avx = self.hasFeature(.avx); + const result = @as(?MCValue, result: { + for (mask_elems) |mask_elem| { + if (mask_elem) |_| break; + } else break :result try self.allocRegOrMem(inst, true); + + for (mask_elems, 0..) |mask_elem, elem_index| { + if (mask_elem orelse continue != elem_index) break; + } else { + const lhs_mcv = try self.resolveInst(extra.a); + if (self.reuseOperand(inst, extra.a, 0, lhs_mcv)) break :result lhs_mcv; + const dst_mcv = try self.allocRegOrMem(inst, true); + try self.genCopy(dst_ty, dst_mcv, lhs_mcv, .{}); + break :result dst_mcv; + } + + for (mask_elems, 0..) |mask_elem, elem_index| { + if (~(mask_elem orelse continue) != elem_index) break; + } else { + const rhs_mcv = try self.resolveInst(extra.b); + if (self.reuseOperand(inst, extra.b, 1, rhs_mcv)) break :result rhs_mcv; + const dst_mcv = try self.allocRegOrMem(inst, true); + try self.genCopy(dst_ty, dst_mcv, rhs_mcv, .{}); + break :result dst_mcv; + } + + for ([_]Mir.Inst.Tag{ .unpckl, .unpckh }) |variant| unpck: { + if (elem_abi_size > 8) break :unpck; + if (dst_abi_size > @as(u32, if (if (elem_abi_size >= 4) + has_avx + else + self.hasFeature(.avx2)) 32 else 16)) break :unpck; + + var sources = [1]?u1{null} ** 2; + for (mask_elems, 0..) 
|maybe_mask_elem, elem_index| { + const mask_elem = maybe_mask_elem orelse continue; + const mask_elem_index = + math.cast(u5, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :unpck; + const elem_byte = (elem_index >> 1) * elem_abi_size; + if (mask_elem_index * elem_abi_size != (elem_byte & 0b0111) | @as(u4, switch (variant) { + .unpckl => 0b0000, + .unpckh => 0b1000, + else => unreachable, + }) | (elem_byte << 1 & 0b10000)) break :unpck; + + const source = @intFromBool(mask_elem < 0); + if (sources[elem_index & 0b00001]) |prev_source| { + if (source != prev_source) break :unpck; + } else sources[elem_index & 0b00001] = source; + } + if (sources[0] orelse break :unpck == sources[1] orelse break :unpck) break :unpck; + + const operands = [2]Air.Inst.Ref{ extra.a, extra.b }; + const operand_tys = [2]Type{ lhs_ty, rhs_ty }; + const lhs_mcv = try self.resolveInst(operands[sources[0].?]); + const rhs_mcv = try self.resolveInst(operands[sources[1].?]); + + const dst_mcv: MCValue = if (lhs_mcv.isRegister() and + self.reuseOperand(inst, operands[sources[0].?], sources[0].?, lhs_mcv)) + lhs_mcv + else if (has_avx and lhs_mcv.isRegister()) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, operand_tys[sources[0].?], lhs_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_alias = registerAlias(dst_reg, max_abi_size); + + const mir_tag: Mir.Inst.FixedTag = if ((elem_abi_size >= 4 and elem_ty.isRuntimeFloat()) or + (dst_abi_size > 16 and !self.hasFeature(.avx2))) .{ switch (elem_abi_size) { + 4 => if (has_avx) .v_ps else ._ps, + 8 => if (has_avx) .v_pd else ._pd, + else => unreachable, + }, variant } else .{ if (has_avx) .vp_ else .p_, switch (variant) { + .unpckl => switch (elem_abi_size) { + 1 => .unpcklbw, + 2 => .unpcklwd, + 4 => .unpckldq, + 8 => .unpcklqdq, + else => unreachable, + }, + .unpckh => switch (elem_abi_size) { + 1 => .unpckhbw, + 2 => .unpckhwd, + 4 => .unpckhdq, + 8 => .unpckhqdq, + else => unreachable, + }, + else => unreachable, + } }; + if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemory( + mir_tag, + dst_alias, + registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + ) else try self.asmRegisterRegisterRegister( + mir_tag, + dst_alias, + registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size), + ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemory( + mir_tag, + dst_alias, + try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + ) else try self.asmRegisterRegister( + mir_tag, + dst_alias, + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size), + ); + break :result dst_mcv; + } + + pshufd: { + if (elem_abi_size != 4) break :pshufd; + if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :pshufd; + + var control: u8 = 0b00_00_00_00; + var sources = [1]?u1{null} ** 1; + for (mask_elems, 0..) 
|maybe_mask_elem, elem_index| { + const mask_elem = maybe_mask_elem orelse continue; + const mask_elem_index: u3 = @intCast(if (mask_elem < 0) ~mask_elem else mask_elem); + if (mask_elem_index & 0b100 != elem_index & 0b100) break :pshufd; + + const source = @intFromBool(mask_elem < 0); + if (sources[0]) |prev_source| { + if (source != prev_source) break :pshufd; + } else sources[(elem_index & 0b010) >> 1] = source; + + const select_bit: u3 = @intCast((elem_index & 0b011) << 1); + const select = @as(u8, @intCast(mask_elem_index & 0b011)) << select_bit; + if (elem_index & 0b100 == 0) + control |= select + else if (control & @as(u8, 0b11) << select_bit != select) break :pshufd; + } + + const operands = [2]Air.Inst.Ref{ extra.a, extra.b }; + const operand_tys = [2]Type{ lhs_ty, rhs_ty }; + const src_mcv = try self.resolveInst(operands[sources[0] orelse break :pshufd]); + + const dst_reg = if (src_mcv.isRegister() and + self.reuseOperand(inst, operands[sources[0].?], sources[0].?, src_mcv)) + src_mcv.getReg().? + else + try self.register_manager.allocReg(inst, abi.RegisterClass.sse); + const dst_alias = registerAlias(dst_reg, max_abi_size); + + if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate( + .{ if (has_avx) .vp_d else .p_d, .shuf }, + dst_alias, + try src_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + Immediate.u(control), + ) else try self.asmRegisterRegisterImmediate( + .{ if (has_avx) .vp_d else .p_d, .shuf }, + dst_alias, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[0].?], src_mcv), max_abi_size), + Immediate.u(control), + ); + break :result .{ .register = dst_reg }; + } + + shufps: { + if (elem_abi_size != 4) break :shufps; + if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :shufps; + + var control: u8 = 0b00_00_00_00; + var sources = [1]?u1{null} ** 2; + for (mask_elems, 0..) 
|maybe_mask_elem, elem_index| { + const mask_elem = maybe_mask_elem orelse continue; + const mask_elem_index: u3 = @intCast(if (mask_elem < 0) ~mask_elem else mask_elem); + if (mask_elem_index & 0b100 != elem_index & 0b100) break :shufps; + + const source = @intFromBool(mask_elem < 0); + if (sources[(elem_index & 0b010) >> 1]) |prev_source| { + if (source != prev_source) break :shufps; + } else sources[(elem_index & 0b010) >> 1] = source; + + const select_bit: u3 = @intCast((elem_index & 0b011) << 1); + const select = @as(u8, @intCast(mask_elem_index & 0b011)) << select_bit; + if (elem_index & 0b100 == 0) + control |= select + else if (control & @as(u8, 0b11) << select_bit != select) break :shufps; + } + if (sources[0] orelse break :shufps == sources[1] orelse break :shufps) break :shufps; + + const operands = [2]Air.Inst.Ref{ extra.a, extra.b }; + const operand_tys = [2]Type{ lhs_ty, rhs_ty }; + const lhs_mcv = try self.resolveInst(operands[sources[0].?]); + const rhs_mcv = try self.resolveInst(operands[sources[1].?]); + + const dst_mcv: MCValue = if (lhs_mcv.isRegister() and + self.reuseOperand(inst, operands[sources[0].?], sources[0].?, lhs_mcv)) + lhs_mcv + else if (has_avx and lhs_mcv.isRegister()) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, operand_tys[sources[0].?], lhs_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_alias = registerAlias(dst_reg, max_abi_size); + + if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .{ .v_ps, .shuf }, + dst_alias, + registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + Immediate.u(control), + ) else try self.asmRegisterRegisterRegisterImmediate( + .{ .v_ps, .shuf }, + dst_alias, + registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size), + Immediate.u(control), + ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate( + .{ ._ps, .shuf }, + dst_alias, + try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + Immediate.u(control), + ) else try self.asmRegisterRegisterImmediate( + .{ ._ps, .shuf }, + dst_alias, + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size), + Immediate.u(control), + ); + break :result dst_mcv; + } + + shufpd: { + if (elem_abi_size != 8) break :shufpd; + if (max_abi_size > @as(u32, if (has_avx) 32 else 16)) break :shufpd; + + var control: u4 = 0b0_0_0_0; + var sources = [1]?u1{null} ** 2; + for (mask_elems, 0..) 
|maybe_mask_elem, elem_index| { + const mask_elem = maybe_mask_elem orelse continue; + const mask_elem_index: u2 = @intCast(if (mask_elem < 0) ~mask_elem else mask_elem); + if (mask_elem_index & 0b10 != elem_index & 0b10) break :shufpd; + + const source = @intFromBool(mask_elem < 0); + if (sources[elem_index & 0b01]) |prev_source| { + if (source != prev_source) break :shufpd; + } else sources[elem_index & 0b01] = source; + + control |= @as(u4, @intCast(mask_elem_index & 0b01)) << @intCast(elem_index); + } + if (sources[0] orelse break :shufpd == sources[1] orelse break :shufpd) break :shufpd; + + const operands: [2]Air.Inst.Ref = .{ extra.a, extra.b }; + const operand_tys: [2]Type = .{ lhs_ty, rhs_ty }; + const lhs_mcv = try self.resolveInst(operands[sources[0].?]); + const rhs_mcv = try self.resolveInst(operands[sources[1].?]); + + const dst_mcv: MCValue = if (lhs_mcv.isRegister() and + self.reuseOperand(inst, operands[sources[0].?], sources[0].?, lhs_mcv)) + lhs_mcv + else if (has_avx and lhs_mcv.isRegister()) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, operand_tys[sources[0].?], lhs_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_alias = registerAlias(dst_reg, max_abi_size); + + if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .{ .v_pd, .shuf }, + dst_alias, + registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + Immediate.u(control), + ) else try self.asmRegisterRegisterRegisterImmediate( + .{ .v_pd, .shuf }, + dst_alias, + registerAlias(lhs_mcv.getReg() orelse dst_reg, max_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size), + Immediate.u(control), + ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate( + .{ ._pd, .shuf }, + dst_alias, + try rhs_mcv.mem(self, Memory.Size.fromSize(max_abi_size)), + Immediate.u(control), + ) else try self.asmRegisterRegisterImmediate( + .{ ._pd, .shuf }, + dst_alias, + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(operand_tys[sources[1].?], rhs_mcv), max_abi_size), + Immediate.u(control), + ); + break :result dst_mcv; + } + + blend: { + if (elem_abi_size < 2) break :blend; + if (dst_abi_size > @as(u32, if (has_avx) 32 else 16)) break :blend; + if (!self.hasFeature(.sse4_1)) break :blend; + + var control: u8 = 0b0_0_0_0_0_0_0_0; + for (mask_elems, 0..) 
|maybe_mask_elem, elem_index| { + const mask_elem = maybe_mask_elem orelse continue; + const mask_elem_index = + math.cast(u4, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :blend; + if (mask_elem_index != elem_index) break :blend; + + const select = @as(u8, @intFromBool(mask_elem < 0)) << @truncate(elem_index); + if (elem_index & 0b1000 == 0) + control |= select + else if (control & @as(u8, 0b1) << @truncate(elem_index) != select) break :blend; + } + + if (!elem_ty.isRuntimeFloat() and self.hasFeature(.avx2)) vpblendd: { + const expanded_control = switch (elem_abi_size) { + 4 => control, + 8 => @as(u8, if (control & 0b0001 != 0) 0b00_00_00_11 else 0b00_00_00_00) | + @as(u8, if (control & 0b0010 != 0) 0b00_00_11_00 else 0b00_00_00_00) | + @as(u8, if (control & 0b0100 != 0) 0b00_11_00_00 else 0b00_00_00_00) | + @as(u8, if (control & 0b1000 != 0) 0b11_00_00_00 else 0b00_00_00_00), + else => break :vpblendd, + }; + + const lhs_mcv = try self.resolveInst(extra.a); + const lhs_reg = if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, lhs_mcv); + const lhs_lock = self.register_manager.lockReg(lhs_reg); + defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); + + const rhs_mcv = try self.resolveInst(extra.b); + const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.sse); + if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .{ .vp_d, .blend }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(lhs_reg, dst_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + Immediate.u(expanded_control), + ) else try self.asmRegisterRegisterRegisterImmediate( + .{ .vp_d, .blend }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(lhs_reg, dst_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + Immediate.u(expanded_control), + ); + break :result .{ .register = dst_reg }; + } + + if (!elem_ty.isRuntimeFloat() or elem_abi_size == 2) pblendw: { + const expanded_control = switch (elem_abi_size) { + 2 => control, + 4 => if (dst_abi_size <= 16 or + @as(u4, @intCast(control >> 4)) == @as(u4, @truncate(control >> 0))) + @as(u8, if (control & 0b0001 != 0) 0b00_00_00_11 else 0b00_00_00_00) | + @as(u8, if (control & 0b0010 != 0) 0b00_00_11_00 else 0b00_00_00_00) | + @as(u8, if (control & 0b0100 != 0) 0b00_11_00_00 else 0b00_00_00_00) | + @as(u8, if (control & 0b1000 != 0) 0b11_00_00_00 else 0b00_00_00_00) + else + break :pblendw, + 8 => if (dst_abi_size <= 16 or + @as(u2, @intCast(control >> 2)) == @as(u2, @truncate(control >> 0))) + @as(u8, if (control & 0b01 != 0) 0b0000_1111 else 0b0000_0000) | + @as(u8, if (control & 0b10 != 0) 0b1111_0000 else 0b0000_0000) + else + break :pblendw, + 16 => break :pblendw, + else => unreachable, + }; + + const lhs_mcv = try self.resolveInst(extra.a); + const rhs_mcv = try self.resolveInst(extra.b); + + const dst_mcv: MCValue = if (lhs_mcv.isRegister() and + self.reuseOperand(inst, extra.a, 0, lhs_mcv)) + lhs_mcv + else if (has_avx and lhs_mcv.isRegister()) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv); + const dst_reg = dst_mcv.getReg().?; + + if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .{ .vp_w, .blend }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? 
+ else + dst_reg, dst_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + Immediate.u(expanded_control), + ) else try self.asmRegisterRegisterRegisterImmediate( + .{ .vp_w, .blend }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + dst_reg, dst_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + Immediate.u(expanded_control), + ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate( + .{ .p_w, .blend }, + registerAlias(dst_reg, dst_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + Immediate.u(expanded_control), + ) else try self.asmRegisterRegisterImmediate( + .{ .p_w, .blend }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + Immediate.u(expanded_control), + ); + break :result .{ .register = dst_reg }; + } + + const expanded_control = switch (elem_abi_size) { + 4, 8 => control, + 16 => @as(u4, if (control & 0b01 != 0) 0b00_11 else 0b00_00) | + @as(u4, if (control & 0b10 != 0) 0b11_00 else 0b00_00), + else => unreachable, + }; + + const lhs_mcv = try self.resolveInst(extra.a); + const rhs_mcv = try self.resolveInst(extra.b); + + const dst_mcv: MCValue = if (lhs_mcv.isRegister() and + self.reuseOperand(inst, extra.a, 0, lhs_mcv)) + lhs_mcv + else if (has_avx and lhs_mcv.isRegister()) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv); + const dst_reg = dst_mcv.getReg().?; + + if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + switch (elem_abi_size) { + 4 => .{ .v_ps, .blend }, + 8, 16 => .{ .v_pd, .blend }, + else => unreachable, + }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + dst_reg, dst_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + Immediate.u(expanded_control), + ) else try self.asmRegisterRegisterRegisterImmediate( + switch (elem_abi_size) { + 4 => .{ .v_ps, .blend }, + 8, 16 => .{ .v_pd, .blend }, + else => unreachable, + }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + dst_reg, dst_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + Immediate.u(expanded_control), + ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryImmediate( + switch (elem_abi_size) { + 4 => .{ ._ps, .blend }, + 8, 16 => .{ ._pd, .blend }, + else => unreachable, + }, + registerAlias(dst_reg, dst_abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + Immediate.u(expanded_control), + ) else try self.asmRegisterRegisterImmediate( + switch (elem_abi_size) { + 4 => .{ ._ps, .blend }, + 8, 16 => .{ ._pd, .blend }, + else => unreachable, + }, + registerAlias(dst_reg, dst_abi_size), + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? 
+ else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + Immediate.u(expanded_control), + ); + break :result .{ .register = dst_reg }; + } + + blendv: { + if (dst_abi_size > @as(u32, if (if (elem_abi_size >= 4) + has_avx + else + self.hasFeature(.avx2)) 32 else 16)) break :blendv; + + const select_mask_elem_ty = try mod.intType(.unsigned, elem_abi_size * 8); + const select_mask_ty = try mod.vectorType(.{ + .len = @intCast(mask_elems.len), + .child = select_mask_elem_ty.toIntern(), + }); + var select_mask_elems: [32]InternPool.Index = undefined; + for ( + select_mask_elems[0..mask_elems.len], + mask_elems, + 0.., + ) |*select_mask_elem, maybe_mask_elem, elem_index| { + const mask_elem = maybe_mask_elem orelse continue; + const mask_elem_index = + math.cast(u5, if (mask_elem < 0) ~mask_elem else mask_elem) orelse break :blendv; + if (mask_elem_index != elem_index) break :blendv; + + select_mask_elem.* = (if (mask_elem < 0) + try select_mask_elem_ty.maxIntScalar(mod, select_mask_elem_ty) + else + try select_mask_elem_ty.minIntScalar(mod, select_mask_elem_ty)).toIntern(); + } + const select_mask_mcv = try self.genTypedValue(.{ + .ty = select_mask_ty, + .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{ + .ty = select_mask_ty.toIntern(), + .storage = .{ .elems = select_mask_elems[0..mask_elems.len] }, + } })), + }); + + if (self.hasFeature(.sse4_1)) { + const mir_tag: Mir.Inst.FixedTag = .{ + if ((elem_abi_size >= 4 and elem_ty.isRuntimeFloat()) or + (dst_abi_size > 16 and !self.hasFeature(.avx2))) switch (elem_abi_size) { + 4 => if (has_avx) .v_ps else ._ps, + 8 => if (has_avx) .v_pd else ._pd, + else => unreachable, + } else if (has_avx) .vp_b else .p_b, + .blendv, + }; + + const select_mask_reg = if (!has_avx) reg: { + try self.register_manager.getKnownReg(.xmm0, null); + try self.genSetReg(.xmm0, select_mask_elem_ty, select_mask_mcv, .{}); + break :reg .xmm0; + } else try self.copyToTmpRegister(select_mask_ty, select_mask_mcv); + const select_mask_alias = registerAlias(select_mask_reg, dst_abi_size); + const select_mask_lock = self.register_manager.lockRegAssumeUnused(select_mask_reg); + defer self.register_manager.unlockReg(select_mask_lock); + + const lhs_mcv = try self.resolveInst(extra.a); + const rhs_mcv = try self.resolveInst(extra.b); + + const dst_mcv: MCValue = if (lhs_mcv.isRegister() and + self.reuseOperand(inst, extra.a, 0, lhs_mcv)) + lhs_mcv + else if (has_avx and lhs_mcv.isRegister()) + .{ .register = try self.register_manager.allocReg(inst, abi.RegisterClass.sse) } + else + try self.copyToRegisterWithInstTracking(inst, dst_ty, lhs_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_alias = registerAlias(dst_reg, dst_abi_size); + + if (has_avx) if (rhs_mcv.isMemory()) try self.asmRegisterRegisterMemoryRegister( + mir_tag, + dst_alias, + if (lhs_mcv.isRegister()) + registerAlias(lhs_mcv.getReg().?, dst_abi_size) + else + dst_alias, + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + select_mask_alias, + ) else try self.asmRegisterRegisterRegisterRegister( + mir_tag, + dst_alias, + if (lhs_mcv.isRegister()) + registerAlias(lhs_mcv.getReg().?, dst_abi_size) + else + dst_alias, + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? 
+ else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + select_mask_alias, + ) else if (rhs_mcv.isMemory()) try self.asmRegisterMemoryRegister( + mir_tag, + dst_alias, + try rhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + select_mask_alias, + ) else try self.asmRegisterRegisterRegister( + mir_tag, + dst_alias, + registerAlias(if (rhs_mcv.isRegister()) + rhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, rhs_mcv), dst_abi_size), + select_mask_alias, + ); + break :result dst_mcv; + } + + const lhs_mcv = try self.resolveInst(extra.a); + const rhs_mcv = try self.resolveInst(extra.b); + + const dst_mcv: MCValue = if (rhs_mcv.isRegister() and + self.reuseOperand(inst, extra.b, 1, rhs_mcv)) + rhs_mcv + else + try self.copyToRegisterWithInstTracking(inst, dst_ty, rhs_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_alias = registerAlias(dst_reg, dst_abi_size); + + const mask_reg = try self.copyToTmpRegister(select_mask_ty, select_mask_mcv); + const mask_alias = registerAlias(mask_reg, dst_abi_size); + const mask_lock = self.register_manager.lockRegAssumeUnused(mask_reg); + defer self.register_manager.unlockReg(mask_lock); + + const mir_fixes: Mir.Inst.Fixes = if (elem_ty.isRuntimeFloat()) + switch (elem_ty.floatBits(self.target.*)) { + 16, 80, 128 => .p_, + 32 => ._ps, + 64 => ._pd, + else => unreachable, + } + else + .p_; + try self.asmRegisterRegister(.{ mir_fixes, .@"and" }, dst_alias, mask_alias); + if (lhs_mcv.isMemory()) try self.asmRegisterMemory( + .{ mir_fixes, .andn }, + mask_alias, + try lhs_mcv.mem(self, Memory.Size.fromSize(dst_abi_size)), + ) else try self.asmRegisterRegister( + .{ mir_fixes, .andn }, + mask_alias, + if (lhs_mcv.isRegister()) + lhs_mcv.getReg().? + else + try self.copyToTmpRegister(dst_ty, lhs_mcv), + ); + try self.asmRegisterRegister(.{ mir_fixes, .@"or" }, dst_alias, mask_alias); + break :result dst_mcv; + } + + pshufb: { + if (max_abi_size > 16) break :pshufb; + if (!self.hasFeature(.ssse3)) break :pshufb; + + const temp_regs = + try self.register_manager.allocRegs(2, .{ inst, null }, abi.RegisterClass.sse); + const temp_locks = self.register_manager.lockRegsAssumeUnused(2, temp_regs); + defer for (temp_locks) |lock| self.register_manager.unlockReg(lock); + + const lhs_temp_alias = registerAlias(temp_regs[0], max_abi_size); + try self.genSetReg(temp_regs[0], lhs_ty, .{ .air_ref = extra.a }, .{}); + + const rhs_temp_alias = registerAlias(temp_regs[1], max_abi_size); + try self.genSetReg(temp_regs[1], rhs_ty, .{ .air_ref = extra.b }, .{}); + + var lhs_mask_elems: [16]InternPool.Index = undefined; + for (lhs_mask_elems[0..max_abi_size], 0..) 
|*lhs_mask_elem, byte_index| { + const elem_index = byte_index / elem_abi_size; + lhs_mask_elem.* = try mod.intern(.{ .int = .{ + .ty = .u8_type, + .storage = .{ .u64 = if (elem_index >= mask_elems.len) 0b1_00_00000 else elem: { + const mask_elem = mask_elems[elem_index] orelse break :elem 0b1_00_00000; + if (mask_elem < 0) break :elem 0b1_00_00000; + const mask_elem_index: u31 = @intCast(mask_elem); + const byte_off: u32 = @intCast(byte_index % elem_abi_size); + break :elem @intCast(mask_elem_index * elem_abi_size + byte_off); + } }, + } }); + } + const lhs_mask_ty = try mod.vectorType(.{ .len = max_abi_size, .child = .u8_type }); + const lhs_mask_mcv = try self.genTypedValue(.{ + .ty = lhs_mask_ty, + .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{ + .ty = lhs_mask_ty.toIntern(), + .storage = .{ .elems = lhs_mask_elems[0..max_abi_size] }, + } })), + }); + const lhs_mask_mem: Memory = .{ + .base = .{ .reg = try self.copyToTmpRegister(Type.usize, lhs_mask_mcv.address()) }, + .mod = .{ .rm = .{ .size = Memory.Size.fromSize(@max(max_abi_size, 16)) } }, + }; + if (has_avx) try self.asmRegisterRegisterMemory( + .{ .vp_b, .shuf }, + lhs_temp_alias, + lhs_temp_alias, + lhs_mask_mem, + ) else try self.asmRegisterMemory( + .{ .p_b, .shuf }, + lhs_temp_alias, + lhs_mask_mem, + ); + + var rhs_mask_elems: [16]InternPool.Index = undefined; + for (rhs_mask_elems[0..max_abi_size], 0..) |*rhs_mask_elem, byte_index| { + const elem_index = byte_index / elem_abi_size; + rhs_mask_elem.* = try mod.intern(.{ .int = .{ + .ty = .u8_type, + .storage = .{ .u64 = if (elem_index >= mask_elems.len) 0b1_00_00000 else elem: { + const mask_elem = mask_elems[elem_index] orelse break :elem 0b1_00_00000; + if (mask_elem >= 0) break :elem 0b1_00_00000; + const mask_elem_index: u31 = @intCast(~mask_elem); + const byte_off: u32 = @intCast(byte_index % elem_abi_size); + break :elem @intCast(mask_elem_index * elem_abi_size + byte_off); + } }, + } }); + } + const rhs_mask_ty = try mod.vectorType(.{ .len = max_abi_size, .child = .u8_type }); + const rhs_mask_mcv = try self.genTypedValue(.{ + .ty = rhs_mask_ty, + .val = Value.fromInterned(try mod.intern(.{ .aggregate = .{ + .ty = rhs_mask_ty.toIntern(), + .storage = .{ .elems = rhs_mask_elems[0..max_abi_size] }, + } })), + }); + const rhs_mask_mem: Memory = .{ + .base = .{ .reg = try self.copyToTmpRegister(Type.usize, rhs_mask_mcv.address()) }, + .mod = .{ .rm = .{ .size = Memory.Size.fromSize(@max(max_abi_size, 16)) } }, + }; + if (has_avx) try self.asmRegisterRegisterMemory( + .{ .vp_b, .shuf }, + rhs_temp_alias, + rhs_temp_alias, + rhs_mask_mem, + ) else try self.asmRegisterMemory( + .{ .p_b, .shuf }, + rhs_temp_alias, + rhs_mask_mem, + ); + + if (has_avx) try self.asmRegisterRegisterRegister( + .{ switch (elem_ty.zigTypeTag(mod)) { + else => break :result null, + .Int => .vp_, + .Float => switch (elem_ty.floatBits(self.target.*)) { + 32 => .v_ps, + 64 => .v_pd, + 16, 80, 128 => break :result null, + else => unreachable, + }, + }, .@"or" }, + lhs_temp_alias, + lhs_temp_alias, + rhs_temp_alias, + ) else try self.asmRegisterRegister( + .{ switch (elem_ty.zigTypeTag(mod)) { + else => break :result null, + .Int => .p_, + .Float => switch (elem_ty.floatBits(self.target.*)) { + 32 => ._ps, + 64 => ._pd, + 16, 80, 128 => break :result null, + else => unreachable, + }, + }, .@"or" }, + lhs_temp_alias, + rhs_temp_alias, + ); + break :result .{ .register = temp_regs[0] }; + } + + break :result null; + }) orelse return self.fail("TODO implement airShuffle from {} and {} to {} 
with {}", .{ + lhs_ty.fmt(mod), rhs_ty.fmt(mod), dst_ty.fmt(mod), + Value.fromInterned(extra.mask).fmtValue( + Type.fromInterned(mod.intern_pool.typeOf(extra.mask)), + mod, + ), + }); + return self.finishAir(inst, result, .{ extra.a, extra.b, .none }); } fn airReduce(self: *Self, inst: Air.Inst.Index) !void { @@ -16751,7 +18083,7 @@ fn airAggregateInit(self: *Self, inst: Air.Inst.Index) !void { }, .Array, .Vector => { const elem_ty = result_ty.childType(mod); - if (result_ty.isVector(mod) and elem_ty.bitSize(mod) == 1) { + if (result_ty.isVector(mod) and elem_ty.toIntern() == .bool_type) { const result_size: u32 = @intCast(result_ty.abiSize(mod)); const dst_reg = try self.register_manager.allocReg(inst, abi.RegisterClass.gp); try self.asmRegisterRegister( @@ -17801,7 +19133,7 @@ fn splitType(self: *Self, ty: Type) ![2]Type { else => unreachable, }, .float => Type.f32, - .float_combine => try mod.vectorType(.{ .len = 2, .child = .f32_type }), + .float_combine => try mod.arrayType(.{ .len = 2, .child = .f32_type }), .sse => Type.f64, else => break, }; diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 8b91a20a4f..c4bf71e233 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -324,16 +324,19 @@ pub const Mnemonic = enum { // SSE3 movddup, movshdup, movsldup, // SSSE3 - pabsb, pabsd, pabsw, palignr, + pabsb, pabsd, pabsw, palignr, pshufb, // SSE4.1 blendpd, blendps, blendvpd, blendvps, extractps, insertps, packusdw, + pblendvb, pblendw, pcmpeqq, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw, + pmovsxbd, pmovsxbq, pmovsxbw, pmovsxdq, pmovsxwd, pmovsxwq, + pmovzxbd, pmovzxbq, pmovzxbw, pmovzxdq, pmovzxwd, pmovzxwq, pmulld, roundpd, roundps, roundsd, roundss, // SSE4.2 @@ -377,7 +380,8 @@ pub const Mnemonic = enum { vpabsb, vpabsd, vpabsw, vpackssdw, vpacksswb, vpackusdw, vpackuswb, vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw, - vpalignr, vpand, vpandn, vpclmulqdq, + vpalignr, vpand, vpandn, + vpblendvb, vpblendw, vpclmulqdq, vpcmpeqb, vpcmpeqd, vpcmpeqq, vpcmpeqw, vpcmpgtb, vpcmpgtd, vpcmpgtq, vpcmpgtw, vpextrb, vpextrd, vpextrq, vpextrw, @@ -385,9 +389,11 @@ pub const Mnemonic = enum { vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw, vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw, vpmovmskb, + vpmovsxbd, vpmovsxbq, vpmovsxbw, vpmovsxdq, vpmovsxwd, vpmovsxwq, + vpmovzxbd, vpmovzxbq, vpmovzxbw, vpmovzxdq, vpmovzxwd, vpmovzxwq, vpmulhw, vpmulld, vpmullw, vpor, - vpshufd, vpshufhw, vpshuflw, + vpshufb, vpshufd, vpshufhw, vpshuflw, vpslld, vpslldq, vpsllq, vpsllw, vpsrad, vpsraq, vpsraw, vpsrld, vpsrldq, vpsrlq, vpsrlw, @@ -409,7 +415,8 @@ pub const Mnemonic = enum { vfmadd132sd, vfmadd213sd, vfmadd231sd, vfmadd132ss, vfmadd213ss, vfmadd231ss, // AVX2 - vpbroadcastb, vpbroadcastd, vpbroadcasti128, vpbroadcastq, vpbroadcastw, + vbroadcasti128, vpbroadcastb, vpbroadcastd, vpbroadcastq, vpbroadcastw, + vextracti128, vinserti128, vpblendd, // zig fmt: on }; diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 4e9c37e5aa..13b97b551a 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -477,8 +477,9 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void { .rri_s, .rri_u => inst.data.rri.fixes, .ri_s, .ri_u => inst.data.ri.fixes, .ri64, .rm, .rmi_s, .mr => inst.data.rx.fixes, - .mrr, .rrm => inst.data.rrx.fixes, + .mrr, .rrm, .rmr => inst.data.rrx.fixes, .rmi, .mri => inst.data.rix.fixes, + .rrmr => 
inst.data.rrrx.fixes, .rrmi => inst.data.rrix.fixes, .mi_u, .mi_s => inst.data.x.fixes, .m => inst.data.x.fixes, @@ -565,6 +566,11 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rx.r1 }, .{ .mem = lower.mem(inst.data.rx.payload) }, }, + .rmr => &.{ + .{ .reg = inst.data.rrx.r1 }, + .{ .mem = lower.mem(inst.data.rrx.payload) }, + .{ .reg = inst.data.rrx.r2 }, + }, .rmi => &.{ .{ .reg = inst.data.rix.r1 }, .{ .mem = lower.mem(inst.data.rix.payload) }, @@ -597,6 +603,12 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rrx.r2 }, .{ .mem = lower.mem(inst.data.rrx.payload) }, }, + .rrmr => &.{ + .{ .reg = inst.data.rrrx.r1 }, + .{ .reg = inst.data.rrrx.r2 }, + .{ .mem = lower.mem(inst.data.rrrx.payload) }, + .{ .reg = inst.data.rrrx.r3 }, + }, .rrmi => &.{ .{ .reg = inst.data.rrix.r1 }, .{ .reg = inst.data.rrix.r2 }, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index dea9bb50cb..6cccb34b3e 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -230,6 +230,8 @@ pub const Inst = struct { v_d, /// VEX-Encoded ___ QuadWord v_q, + /// VEX-Encoded ___ Integer Data + v_i128, /// VEX-Encoded Packed ___ vp_, /// VEX-Encoded Packed ___ Byte @@ -242,8 +244,6 @@ pub const Inst = struct { vp_q, /// VEX-Encoded Packed ___ Double Quadword vp_dq, - /// VEX-Encoded Packed ___ Integer Data - vp_i128, /// VEX-Encoded ___ Scalar Single-Precision Values v_ss, /// VEX-Encoded ___ Packed Single-Precision Values @@ -654,10 +654,19 @@ pub const Inst = struct { /// Variable blend scalar double-precision floating-point values blendv, /// Extract packed floating-point values + /// Extract packed integer values extract, /// Insert scalar single-precision floating-point value /// Insert packed floating-point values insert, + /// Packed move with sign extend + movsxb, + movsxd, + movsxw, + /// Packed move with zero extend + movzxb, + movzxd, + movzxw, /// Round packed single-precision floating-point values /// Round scalar single-precision floating-point value /// Round packed double-precision floating-point values @@ -688,6 +697,7 @@ pub const Inst = struct { sha256rnds2, /// Load with broadcast floating-point data + /// Load integer and broadcast broadcast, /// Convert 16-bit floating-point values to single-precision floating-point values @@ -762,8 +772,11 @@ pub const Inst = struct { /// Uses `imm` payload. rel, /// Register, memory operands. - /// Uses `rx` payload. + /// Uses `rx` payload with extra data of type `Memory`. rm, + /// Register, memory, register operands. + /// Uses `rrx` payload with extra data of type `Memory`. + rmr, /// Register, memory, immediate (word) operands. /// Uses `rix` payload with extra data of type `Memory`. rmi, @@ -776,6 +789,9 @@ pub const Inst = struct { /// Register, register, memory. /// Uses `rrix` payload with extra data of type `Memory`. rrm, + /// Register, register, memory, register. + /// Uses `rrrx` payload with extra data of type `Memory`. + rrmr, /// Register, register, memory, immediate (byte) operands. /// Uses `rrix` payload with extra data of type `Memory`. rrmi, @@ -953,6 +969,14 @@ pub const Inst = struct { r2: Register, payload: u32, }, + /// Register, register, register, followed by Custom payload found in extra. + rrrx: struct { + fixes: Fixes = ._, + r1: Register, + r2: Register, + r3: Register, + payload: u32, + }, /// Register, byte immediate, followed by Custom payload found in extra. 
rix: struct { fixes: Fixes = ._, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 545e6b23ce..d4a7dcafe7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -1185,6 +1185,8 @@ pub const table = [_]Entry{ .{ .palignr, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0f }, 0, .none, .ssse3 }, + .{ .pshufb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .none, .ssse3 }, + // SSE4.1 .{ .blendpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 }, @@ -1202,6 +1204,11 @@ pub const table = [_]Entry{ .{ .packusdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .none, .sse4_1 }, + .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 }, + .{ .pblendvb, .rm, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x10 }, 0, .none, .sse4_1 }, + + .{ .pblendw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .none, .sse4_1 }, + .{ .pcmpeqq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x29 }, 0, .none, .sse4_1 }, .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 }, @@ -1228,6 +1235,20 @@ pub const table = [_]Entry{ .{ .pminud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .none, .sse4_1 }, + .{ .pmovsxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .none, .sse4_1 }, + .{ .pmovsxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .none, .sse4_1 }, + .{ .pmovsxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .none, .sse4_1 }, + .{ .pmovsxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .none, .sse4_1 }, + .{ .pmovsxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .none, .sse4_1 }, + .{ .pmovsxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .none, .sse4_1 }, + + .{ .pmovzxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .none, .sse4_1 }, + .{ .pmovzxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .none, .sse4_1 }, + .{ .pmovzxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .none, .sse4_1 }, + .{ .pmovzxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .none, .sse4_1 }, + .{ .pmovzxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .none, .sse4_1 }, + .{ .pmovzxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .none, .sse4_1 }, + .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 }, .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, @@ -1528,6 +1549,10 @@ pub const table = [_]Entry{ .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx }, + .{ .vpblendvb, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_128_w0, .avx }, + + .{ .vpblendw, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_128_wig, .avx }, + .{ .vpclmulqdq, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x44 }, 0, .vex_128_wig, .@"pclmul avx" }, .{ .vpcmpeqb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_128_wig, .avx }, @@ -1576,6 +1601,20 @@ pub const table = [_]Entry{ .{ .vpmovmskb, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_128_wig, .avx }, .{ .vpmovmskb, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_128_wig, .avx }, + .{ .vpmovsxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 
0x38, 0x20 }, 0, .vex_128_wig, .avx }, + .{ .vpmovsxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .vex_128_wig, .avx }, + .{ .vpmovsxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .vex_128_wig, .avx }, + .{ .vpmovsxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .vex_128_wig, .avx }, + .{ .vpmovsxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .vex_128_wig, .avx }, + .{ .vpmovsxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .vex_128_wig, .avx }, + + .{ .vpmovzxbw, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .vex_128_wig, .avx }, + .{ .vpmovzxbd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .vex_128_wig, .avx }, + .{ .vpmovzxbq, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .vex_128_wig, .avx }, + .{ .vpmovzxwd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .vex_128_wig, .avx }, + .{ .vpmovzxwq, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_128_wig, .avx }, + .{ .vpmovzxdq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_128_wig, .avx }, + .{ .vpmulhw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx }, .{ .vpmulld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx }, @@ -1584,6 +1623,8 @@ pub const table = [_]Entry{ .{ .vpor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx }, + .{ .vpshufb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_128_wig, .avx }, + .{ .vpshufd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_128_wig, .avx }, .{ .vpshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_128_wig, .avx }, @@ -1728,6 +1769,10 @@ pub const table = [_]Entry{ .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vextracti128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x39 }, 0, .vex_256_w0, .avx2 }, + + .{ .vinserti128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x38 }, 0, .vex_256_w0, .avx2 }, + .{ .vpabsb, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1c }, 0, .vex_256_wig, .avx2 }, .{ .vpabsd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1e }, 0, .vex_256_wig, .avx2 }, .{ .vpabsw, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x1d }, 0, .vex_256_wig, .avx2 }, @@ -1756,6 +1801,13 @@ pub const table = [_]Entry{ .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, + .{ .vpblendd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_128_w0, .avx2 }, + .{ .vpblendd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x02 }, 0, .vex_256_w0, .avx2 }, + + .{ .vpblendvb, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4c }, 0, .vex_256_w0, .avx2 }, + + .{ .vpblendw, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0e }, 0, .vex_256_wig, .avx2 }, + .{ .vpbroadcastb, .rm, &.{ .xmm, .xmm_m8 }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_128_w0, .avx2 }, .{ .vpbroadcastb, .rm, &.{ .ymm, .xmm_m8 }, &.{ 0x66, 0x0f, 0x38, 0x78 }, 0, .vex_256_w0, .avx2 }, .{ .vpbroadcastw, .rm, &.{ .xmm, .xmm_m16 }, &.{ 0x66, 0x0f, 0x38, 0x79 }, 0, .vex_128_w0, .avx2 }, @@ -1764,7 +1816,7 @@ pub const table = [_]Entry{ .{ .vpbroadcastd, .rm, &.{ .ymm, .xmm_m32 }, &.{ 
0x66, 0x0f, 0x38, 0x58 }, 0, .vex_256_w0, .avx2 }, .{ .vpbroadcastq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_128_w0, .avx2 }, .{ .vpbroadcastq, .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x59 }, 0, .vex_256_w0, .avx2 }, - .{ .vpbroadcasti128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 }, + .{ .vbroadcasti128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x5a }, 0, .vex_256_w0, .avx2 }, .{ .vpcmpeqb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x74 }, 0, .vex_256_wig, .avx2 }, .{ .vpcmpeqw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x75 }, 0, .vex_256_wig, .avx2 }, @@ -1799,6 +1851,20 @@ pub const table = [_]Entry{ .{ .vpmovmskb, .rm, &.{ .r32, .ymm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_256_wig, .avx2 }, .{ .vpmovmskb, .rm, &.{ .r64, .ymm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovsxbw, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x20 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovsxbd, .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x21 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovsxbq, .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x22 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovsxwd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x23 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovsxwq, .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x24 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovsxdq, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x25 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmovzxbw, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x30 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovzxbd, .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x31 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovzxbq, .rm, &.{ .ymm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x32 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovzxwd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x33 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovzxwq, .rm, &.{ .ymm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x34 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmovzxdq, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x35 }, 0, .vex_256_wig, .avx2 }, + .{ .vpmulhw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 }, .{ .vpmulld, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 }, @@ -1807,6 +1873,7 @@ pub const table = [_]Entry{ .{ .vpor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 }, + .{ .vpshufb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x00 }, 0, .vex_256_wig, .avx2 }, .{ .vpshufd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 }, .{ .vpshufhw, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .vex_256_wig, .avx2 }, diff --git a/src/codegen.zig b/src/codegen.zig index 118bab4be9..7bcba80065 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -405,7 +405,7 @@ pub fn generateSymbol( .vector_type => |vector_type| { const abi_size = math.cast(usize, typed_value.ty.abiSize(mod)) orelse return error.Overflow; - if (Type.fromInterned(vector_type.child).bitSize(mod) == 1) { + if (vector_type.child == .bool_type) { const bytes = try code.addManyAsSlice(abi_size); @memset(bytes, 0xaa); var index: usize = 0; @@ -443,37 +443,34 @@ pub fn generateSymbol( }, }) byte.* |= mask else byte.* &= ~mask; } - } else switch (aggregate.storage) { - .bytes => |bytes| try code.appendSlice(bytes), - .elems, .repeated_elem => { - var index: u64 = 0; - while (index < vector_type.len) : (index += 1) { - switch (try generateSymbol(bin_file, 
src_loc, .{ - .ty = Type.fromInterned(vector_type.child), - .val = Value.fromInterned(switch (aggregate.storage) { - .bytes => unreachable, - .elems => |elems| elems[ - math.cast(usize, index) orelse return error.Overflow - ], - .repeated_elem => |elem| elem, - }), - }, code, debug_output, reloc_info)) { - .ok => {}, - .fail => |em| return .{ .fail = em }, + } else { + switch (aggregate.storage) { + .bytes => |bytes| try code.appendSlice(bytes), + .elems, .repeated_elem => { + var index: u64 = 0; + while (index < vector_type.len) : (index += 1) { + switch (try generateSymbol(bin_file, src_loc, .{ + .ty = Type.fromInterned(vector_type.child), + .val = Value.fromInterned(switch (aggregate.storage) { + .bytes => unreachable, + .elems => |elems| elems[ + math.cast(usize, index) orelse return error.Overflow + ], + .repeated_elem => |elem| elem, + }), + }, code, debug_output, reloc_info)) { + .ok => {}, + .fail => |em| return .{ .fail = em }, + } } - } - }, - } + }, + } - const padding = abi_size - (math.cast(usize, math.divCeil( - u64, - Type.fromInterned(vector_type.child).bitSize(mod) * vector_type.len, - 8, - ) catch |err| switch (err) { - error.DivisionByZero => unreachable, - else => |e| return e, - }) orelse return error.Overflow); - if (padding > 0) try code.appendNTimes(0, padding); + const padding = abi_size - + (math.cast(usize, Type.fromInterned(vector_type.child).abiSize(mod) * vector_type.len) orelse + return error.Overflow); + if (padding > 0) try code.appendNTimes(0, padding); + } }, .anon_struct_type => |tuple| { const struct_begin = code.items.len; diff --git a/src/codegen/c.zig b/src/codegen/c.zig index cf372ff5ef..0977acf7fe 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -4140,9 +4140,7 @@ fn airCmpOp( if (need_cast) try writer.writeAll("(void*)"); try f.writeCValue(writer, lhs, .Other); try v.elem(f, writer); - try writer.writeByte(' '); try writer.writeAll(compareOperatorC(operator)); - try writer.writeByte(' '); if (need_cast) try writer.writeAll("(void*)"); try f.writeCValue(writer, rhs, .Other); try v.elem(f, writer); @@ -4181,41 +4179,28 @@ fn airEquality( const writer = f.object.writer(); const inst_ty = f.typeOfIndex(inst); const local = try f.allocLocal(inst, inst_ty); + const a = try Assignment.start(f, writer, inst_ty); try f.writeCValue(writer, local, .Other); - try writer.writeAll(" = "); + try a.assign(f, writer); if (operand_ty.zigTypeTag(mod) == .Optional and !operand_ty.optionalReprIsPayload(mod)) { - // (A && B) || (C && (A == B)) - // A = lhs.is_null ; B = rhs.is_null ; C = rhs.payload == lhs.payload - - switch (operator) { - .eq => {}, - .neq => try writer.writeByte('!'), - else => unreachable, - } - try writer.writeAll("(("); + try f.writeCValueMember(writer, lhs, .{ .identifier = "is_null" }); + try writer.writeAll(" || "); + try f.writeCValueMember(writer, rhs, .{ .identifier = "is_null" }); + try writer.writeAll(" ? 
"); + try f.writeCValueMember(writer, lhs, .{ .identifier = "is_null" }); + try writer.writeAll(compareOperatorC(operator)); + try f.writeCValueMember(writer, rhs, .{ .identifier = "is_null" }); + try writer.writeAll(" : "); + try f.writeCValueMember(writer, lhs, .{ .identifier = "payload" }); + try writer.writeAll(compareOperatorC(operator)); + try f.writeCValueMember(writer, rhs, .{ .identifier = "payload" }); + } else { try f.writeCValue(writer, lhs, .Other); - try writer.writeAll(".is_null && "); + try writer.writeAll(compareOperatorC(operator)); try f.writeCValue(writer, rhs, .Other); - try writer.writeAll(".is_null) || ("); - try f.writeCValue(writer, lhs, .Other); - try writer.writeAll(".payload == "); - try f.writeCValue(writer, rhs, .Other); - try writer.writeAll(".payload && "); - try f.writeCValue(writer, lhs, .Other); - try writer.writeAll(".is_null == "); - try f.writeCValue(writer, rhs, .Other); - try writer.writeAll(".is_null));\n"); - - return local; } - - try f.writeCValue(writer, lhs, .Other); - try writer.writeByte(' '); - try writer.writeAll(compareOperatorC(operator)); - try writer.writeByte(' '); - try f.writeCValue(writer, rhs, .Other); - try writer.writeAll(";\n"); + try a.end(f, writer); return local; } @@ -6109,41 +6094,48 @@ fn airFloatCast(f: *Function, inst: Air.Inst.Index) !CValue { const ty_op = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_op; const inst_ty = f.typeOfIndex(inst); + const inst_scalar_ty = inst_ty.scalarType(mod); const operand = try f.resolveInst(ty_op.operand); try reap(f, inst, &.{ty_op.operand}); const operand_ty = f.typeOf(ty_op.operand); + const scalar_ty = operand_ty.scalarType(mod); const target = f.object.dg.module.getTarget(); - const operation = if (inst_ty.isRuntimeFloat() and operand_ty.isRuntimeFloat()) - if (inst_ty.floatBits(target) < operand_ty.floatBits(target)) "trunc" else "extend" - else if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) - if (inst_ty.isSignedInt(mod)) "fix" else "fixuns" - else if (inst_ty.isRuntimeFloat() and operand_ty.isInt(mod)) - if (operand_ty.isSignedInt(mod)) "float" else "floatun" + const operation = if (inst_scalar_ty.isRuntimeFloat() and scalar_ty.isRuntimeFloat()) + if (inst_scalar_ty.floatBits(target) < scalar_ty.floatBits(target)) "trunc" else "extend" + else if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) + if (inst_scalar_ty.isSignedInt(mod)) "fix" else "fixuns" + else if (inst_scalar_ty.isRuntimeFloat() and scalar_ty.isInt(mod)) + if (scalar_ty.isSignedInt(mod)) "float" else "floatun" else unreachable; const writer = f.object.writer(); const local = try f.allocLocal(inst, inst_ty); + const v = try Vectorize.start(f, inst, writer, operand_ty); + const a = try Assignment.start(f, writer, scalar_ty); try f.writeCValue(writer, local, .Other); - - try writer.writeAll(" = "); - if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) { + try v.elem(f, writer); + try a.assign(f, writer); + if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) { try writer.writeAll("zig_wrap_"); - try f.object.dg.renderTypeForBuiltinFnName(writer, inst_ty); + try f.object.dg.renderTypeForBuiltinFnName(writer, inst_scalar_ty); try writer.writeByte('('); } try writer.writeAll("zig_"); try writer.writeAll(operation); - try writer.writeAll(compilerRtAbbrev(operand_ty, mod)); - try writer.writeAll(compilerRtAbbrev(inst_ty, mod)); + try writer.writeAll(compilerRtAbbrev(scalar_ty, mod)); + try writer.writeAll(compilerRtAbbrev(inst_scalar_ty, mod)); try writer.writeByte('('); try 
f.writeCValue(writer, operand, .FunctionArgument); + try v.elem(f, writer); try writer.writeByte(')'); - if (inst_ty.isInt(mod) and operand_ty.isRuntimeFloat()) { - try f.object.dg.renderBuiltinInfo(writer, inst_ty, .bits); + if (inst_scalar_ty.isInt(mod) and scalar_ty.isRuntimeFloat()) { + try f.object.dg.renderBuiltinInfo(writer, inst_scalar_ty, .bits); try writer.writeByte(')'); } - try writer.writeAll(";\n"); + try a.end(f, writer); + try v.end(f, inst, writer); + return local; } @@ -6315,7 +6307,7 @@ fn airCmpBuiltinCall( try v.elem(f, writer); try f.object.dg.renderBuiltinInfo(writer, scalar_ty, info); try writer.writeByte(')'); - if (!ref_ret) try writer.print(" {s} {}", .{ + if (!ref_ret) try writer.print("{s}{}", .{ compareOperatorC(operator), try f.fmtIntLiteral(Type.i32, try mod.intValue(Type.i32, 0)), }); @@ -7661,12 +7653,12 @@ fn compareOperatorAbbrev(operator: std.math.CompareOperator) []const u8 { fn compareOperatorC(operator: std.math.CompareOperator) []const u8 { return switch (operator) { - .lt => "<", - .lte => "<=", - .eq => "==", - .gte => ">=", - .gt => ">", - .neq => "!=", + .lt => " < ", + .lte => " <= ", + .eq => " == ", + .gte => " >= ", + .gt => " > ", + .neq => " != ", }; } diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 4d444c686b..6608ea90eb 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -8646,8 +8646,6 @@ pub const FuncGen = struct { const operand_ty = self.typeOf(ty_op.operand); const dest_ty = self.typeOfIndex(inst); const target = mod.getTarget(); - const dest_bits = dest_ty.floatBits(target); - const src_bits = operand_ty.floatBits(target); if (intrinsicsAllowed(dest_ty, target) and intrinsicsAllowed(operand_ty, target)) { return self.wip.cast(.fpext, operand, try o.lowerType(dest_ty), ""); @@ -8655,11 +8653,19 @@ pub const FuncGen = struct { const operand_llvm_ty = try o.lowerType(operand_ty); const dest_llvm_ty = try o.lowerType(dest_ty); + const dest_bits = dest_ty.scalarType(mod).floatBits(target); + const src_bits = operand_ty.scalarType(mod).floatBits(target); const fn_name = try o.builder.fmt("__extend{s}f{s}f2", .{ compilerRtFloatAbbrev(src_bits), compilerRtFloatAbbrev(dest_bits), }); const libc_fn = try self.getLibcFunction(fn_name, &.{operand_llvm_ty}, dest_llvm_ty); + if (dest_ty.isVector(mod)) return self.buildElementwiseCall( + libc_fn, + &.{operand}, + try o.builder.poisonValue(dest_llvm_ty), + dest_ty.vectorLen(mod), + ); return self.wip.call( .normal, .ccc, diff --git a/src/type.zig b/src/type.zig index a6265692c2..a9d1654ba7 100644 --- a/src/type.zig +++ b/src/type.zig @@ -905,11 +905,32 @@ pub const Type = struct { return Type.fromInterned(array_type.child).abiAlignmentAdvanced(mod, strat); }, .vector_type => |vector_type| { - const bits_u64 = try bitSizeAdvanced(Type.fromInterned(vector_type.child), mod, opt_sema); - const bits: u32 = @intCast(bits_u64); - const bytes = ((bits * vector_type.len) + 7) / 8; - const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes); - return .{ .scalar = Alignment.fromByteUnits(alignment) }; + if (vector_type.len == 0) return .{ .scalar = .@"1" }; + switch (mod.comp.getZigBackend()) { + else => { + const elem_bits: u32 = @intCast(try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema)); + if (elem_bits == 0) return .{ .scalar = .@"1" }; + const bytes = ((elem_bits * vector_type.len) + 7) / 8; + const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes); + return .{ .scalar = Alignment.fromByteUnits(alignment) }; + }, + .stage2_x86_64 => { + if 
(vector_type.child == .bool_type) { + if (vector_type.len > 256 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" }; + if (vector_type.len > 128 and std.Target.x86.featureSetHas(target.cpu.features, .avx2)) return .{ .scalar = .@"32" }; + if (vector_type.len > 64) return .{ .scalar = .@"16" }; + const bytes = std.math.divCeil(u32, vector_type.len, 8) catch unreachable; + const alignment = std.math.ceilPowerOfTwoAssert(u32, bytes); + return .{ .scalar = Alignment.fromByteUnits(alignment) }; + } + const elem_bytes: u32 = @intCast((try Type.fromInterned(vector_type.child).abiSizeAdvanced(mod, strat)).scalar); + if (elem_bytes == 0) return .{ .scalar = .@"1" }; + const bytes = elem_bytes * vector_type.len; + if (bytes > 32 and std.Target.x86.featureSetHas(target.cpu.features, .avx512f)) return .{ .scalar = .@"64" }; + if (bytes > 16 and std.Target.x86.featureSetHas(target.cpu.features, .avx)) return .{ .scalar = .@"32" }; + return .{ .scalar = .@"16" }; + }, + } }, .opt_type => return abiAlignmentAdvancedOptional(ty, mod, strat), @@ -1237,9 +1258,6 @@ pub const Type = struct { .storage = .{ .lazy_size = ty.toIntern() }, } }))) }, }; - const elem_bits = try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema); - const total_bits = elem_bits * vector_type.len; - const total_bytes = (total_bits + 7) / 8; const alignment = switch (try ty.abiAlignmentAdvanced(mod, strat)) { .scalar => |x| x, .val => return .{ .val = Value.fromInterned((try mod.intern(.{ .int = .{ @@ -1247,6 +1265,18 @@ pub const Type = struct { .storage = .{ .lazy_size = ty.toIntern() }, } }))) }, }; + const total_bytes = switch (mod.comp.getZigBackend()) { + else => total_bytes: { + const elem_bits = try Type.fromInterned(vector_type.child).bitSizeAdvanced(mod, opt_sema); + const total_bits = elem_bits * vector_type.len; + break :total_bytes (total_bits + 7) / 8; + }, + .stage2_x86_64 => total_bytes: { + if (vector_type.child == .bool_type) break :total_bytes std.math.divCeil(u32, vector_type.len, 8) catch unreachable; + const elem_bytes: u32 = @intCast((try Type.fromInterned(vector_type.child).abiSizeAdvanced(mod, strat)).scalar); + break :total_bytes elem_bytes * vector_type.len; + }, + }; return AbiSizeAdvanced{ .scalar = alignment.forward(total_bytes) }; }, @@ -2108,7 +2138,8 @@ pub const Type = struct { /// Returns true if and only if the type is a fixed-width integer. pub fn isInt(self: Type, mod: *const Module) bool { - return self.isSignedInt(mod) or self.isUnsignedInt(mod); + return self.toIntern() != .comptime_int_type and + mod.intern_pool.isIntegerType(self.toIntern()); } /// Returns true if and only if the type is a fixed-width, signed integer. 
diff --git a/test/behavior/bitcast.zig b/test/behavior/bitcast.zig index 001f8c34db..3ac6115216 100644 --- a/test/behavior/bitcast.zig +++ b/test/behavior/bitcast.zig @@ -336,7 +336,7 @@ test "comptime @bitCast packed struct to int and back" { if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest; - if (builtin.zig_backend == .stage2_x86_64 and builtin.target.ofmt != .elf and builtin.target.ofmt != .macho) return error.SkipZigTest; + if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; if (builtin.zig_backend == .stage2_llvm and native_endian == .big) { // https://github.com/ziglang/zig/issues/13782 diff --git a/test/behavior/cast.zig b/test/behavior/cast.zig index 2ed29eb92d..19e5ebb3c1 100644 --- a/test/behavior/cast.zig +++ b/test/behavior/cast.zig @@ -601,25 +601,25 @@ test "cast *[1][*]const u8 to [*]const ?[*]const u8" { test "@intCast on vector" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; const S = struct { fn doTheTest() !void { // Upcast (implicit, equivalent to @intCast) var up0: @Vector(2, u8) = [_]u8{ 0x55, 0xaa }; _ = &up0; - const up1 = @as(@Vector(2, u16), up0); - const up2 = @as(@Vector(2, u32), up0); - const up3 = @as(@Vector(2, u64), up0); + const up1: @Vector(2, u16) = up0; + const up2: @Vector(2, u32) = up0; + const up3: @Vector(2, u64) = up0; // Downcast (safety-checked) var down0 = up3; _ = &down0; - const down1 = @as(@Vector(2, u32), @intCast(down0)); - const down2 = @as(@Vector(2, u16), @intCast(down0)); - const down3 = @as(@Vector(2, u8), @intCast(down0)); + const down1: @Vector(2, u32) = @intCast(down0); + const down2: @Vector(2, u16) = @intCast(down0); + const down3: @Vector(2, u8) = @intCast(down0); try expect(mem.eql(u16, &@as([2]u16, up1), &[2]u16{ 0x55, 0xaa })); try expect(mem.eql(u32, &@as([2]u32, up2), &[2]u32{ 0x55, 0xaa })); @@ -629,20 +629,10 @@ test "@intCast on vector" { try expect(mem.eql(u16, &@as([2]u16, down2), &[2]u16{ 0x55, 0xaa })); try expect(mem.eql(u8, &@as([2]u8, down3), &[2]u8{ 0x55, 0xaa })); } - - fn doTheTestFloat() !void { - var vec: @Vector(2, f32) = @splat(1234.0); - _ = &vec; - const wider: @Vector(2, f64) = vec; - try expect(wider[0] == 1234.0); - try expect(wider[1] == 1234.0); - } }; try S.doTheTest(); try comptime S.doTheTest(); - try S.doTheTestFloat(); - try comptime S.doTheTestFloat(); } test "@floatCast cast down" { @@ -2340,10 +2330,31 @@ test "@floatCast on vector" { const S = struct { fn doTheTest() !void { - var a: @Vector(3, f64) = .{ 1.5, 2.5, 3.5 }; - _ = &a; - const b: @Vector(3, f32) = @floatCast(a); - try expectEqual(@Vector(3, f32){ 1.5, 2.5, 3.5 }, b); + { + var a: @Vector(2, f64) = .{ 1.5, 2.5 }; + _ = &a; + const b: @Vector(2, f32) = @floatCast(a); + try expectEqual(@Vector(2, f32){ 1.5, 2.5 }, b); + } + { + var a: @Vector(2, f32) = .{ 3.25, 4.25 }; + _ = &a; + const b: @Vector(2, f64) = @floatCast(a); + try expectEqual(@Vector(2, f64){ 3.25, 4.25 }, b); + } + { + var a: @Vector(2, f32) = .{ 5.75, 6.75 }; + _ = &a; + const b: 
@Vector(2, f64) = a; + try expectEqual(@Vector(2, f64){ 5.75, 6.75 }, b); + } + { + var vec: @Vector(2, f32) = @splat(1234.0); + _ = &vec; + const wider: @Vector(2, f64) = vec; + try expect(wider[0] == 1234.0); + try expect(wider[1] == 1234.0); + } } }; @@ -2441,6 +2452,7 @@ test "@intFromBool on vector" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; const S = struct { fn doTheTest() !void { diff --git a/test/behavior/optional.zig b/test/behavior/optional.zig index 2030645d3d..edf02d3e6e 100644 --- a/test/behavior/optional.zig +++ b/test/behavior/optional.zig @@ -110,44 +110,89 @@ test "nested optional field in struct" { try expect(s.x.?.y == 127); } -test "equality compare optional with non-optional" { +test "equality compare optionals and non-optionals" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO - try test_cmp_optional_non_optional(); - try comptime test_cmp_optional_non_optional(); + const S = struct { + fn doTheTest() !void { + var five: isize = 5; + var ten: isize = 10; + var opt_null: ?isize = null; + var opt_ten: ?isize = 10; + _ = .{ &five, &ten, &opt_null, &opt_ten }; + try expect(opt_null != five); + try expect(opt_null != ten); + try expect(opt_ten != five); + try expect(opt_ten == ten); + + var opt_int: ?isize = null; + try expect(opt_int != five); + try expect(opt_int != ten); + try expect(opt_int == opt_null); + try expect(opt_int != opt_ten); + + opt_int = 10; + try expect(opt_int != five); + try expect(opt_int == ten); + try expect(opt_int != opt_null); + try expect(opt_int == opt_ten); + + opt_int = five; + try expect(opt_int == five); + try expect(opt_int != ten); + try expect(opt_int != opt_null); + try expect(opt_int != opt_ten); + + // test evaluation is always lexical + // ensure that the optional isn't always computed before the non-optional + var mutable_state: i32 = 0; + _ = blk1: { + mutable_state += 1; + break :blk1 @as(?f64, 10.0); + } != blk2: { + try expect(mutable_state == 1); + break :blk2 @as(f64, 5.0); + }; + _ = blk1: { + mutable_state += 1; + break :blk1 @as(f64, 10.0); + } != blk2: { + try expect(mutable_state == 2); + break :blk2 @as(?f64, 5.0); + }; + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); } -fn test_cmp_optional_non_optional() !void { - var ten: i32 = 10; - var opt_ten: ?i32 = 10; - var five: i32 = 5; - var int_n: ?i32 = null; +test "compare optionals with modified payloads" { + if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; - _ = .{ &ten, &opt_ten, &five, &int_n }; + var lhs: ?bool = false; + const lhs_payload = &lhs.?; + var rhs: ?bool = true; + const rhs_payload = &rhs.?; + try expect(lhs != rhs and !(lhs == rhs)); - try expect(int_n != ten); - try expect(opt_ten == ten); - try expect(opt_ten != five); + lhs = null; + lhs_payload.* = false; + rhs = false; + try expect(lhs != rhs and !(lhs == rhs)); - // test evaluation is always lexical - // ensure that the optional isn't always computed before the non-optional - var mutable_state: i32 = 0; - _ = blk1: { - mutable_state += 1; - break :blk1 @as(?f64, 10.0); - } != blk2: { - try expect(mutable_state == 1); - break :blk2 @as(f64, 
5.0); - }; - _ = blk1: { - mutable_state += 1; - break :blk1 @as(f64, 10.0); - } != blk2: { - try expect(mutable_state == 2); - break :blk2 @as(?f64, 5.0); - }; + lhs = true; + rhs = null; + rhs_payload.* = true; + try expect(lhs != rhs and !(lhs == rhs)); + + lhs = null; + lhs_payload.* = false; + rhs = null; + rhs_payload.* = true; + try expect(lhs == rhs and !(lhs != rhs)); } test "unwrap function call with optional pointer return value" { diff --git a/test/behavior/select.zig b/test/behavior/select.zig index de717e5e5b..2396d8bb11 100644 --- a/test/behavior/select.zig +++ b/test/behavior/select.zig @@ -5,7 +5,6 @@ const expect = std.testing.expect; test "@select vectors" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -36,11 +35,12 @@ fn selectVectors() !void { test "@select arrays" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest; + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .avx2)) return error.SkipZigTest; try comptime selectArrays(); try selectArrays(); diff --git a/test/behavior/shuffle.zig b/test/behavior/shuffle.zig index 95913be3af..c3d760103d 100644 --- a/test/behavior/shuffle.zig +++ b/test/behavior/shuffle.zig @@ -4,10 +4,11 @@ const mem = std.mem; const expect = std.testing.expect; test "@shuffle int" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest; const S = struct { fn doTheTest() !void { diff --git a/test/behavior/vector.zig b/test/behavior/vector.zig index cb9cd4a87a..9d21f8fdb0 100644 --- a/test/behavior/vector.zig +++ b/test/behavior/vector.zig @@ -29,7 +29,7 @@ test "vector wrap operators" { if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_x86_64 and - !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; const S = struct { fn doTheTest() !void { @@ -906,22 +906,26 @@ test "vector @reduce comptime" { } test "mask parameter of @shuffle is comptime scope" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if 
(builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .ssse3)) return error.SkipZigTest; const __v4hi = @Vector(4, i16); - var v4_a = __v4hi{ 0, 0, 0, 0 }; - var v4_b = __v4hi{ 0, 0, 0, 0 }; + var v4_a = __v4hi{ 1, 2, 3, 4 }; + var v4_b = __v4hi{ 5, 6, 7, 8 }; _ = .{ &v4_a, &v4_b }; const shuffled: __v4hi = @shuffle(i16, v4_a, v4_b, @Vector(4, i32){ std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len), - std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len), - std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len), - std.zig.c_translation.shuffleVectorIndex(0, @typeInfo(@TypeOf(v4_a)).Vector.len), + std.zig.c_translation.shuffleVectorIndex(2, @typeInfo(@TypeOf(v4_a)).Vector.len), + std.zig.c_translation.shuffleVectorIndex(4, @typeInfo(@TypeOf(v4_a)).Vector.len), + std.zig.c_translation.shuffleVectorIndex(6, @typeInfo(@TypeOf(v4_a)).Vector.len), }); - _ = shuffled; + try expect(shuffled[0] == 1); + try expect(shuffled[1] == 3); + try expect(shuffled[2] == 5); + try expect(shuffled[3] == 7); } test "saturating add" { @@ -1177,10 +1181,22 @@ test "@shlWithOverflow" { } test "alignment of vectors" { - try expect(@alignOf(@Vector(2, u8)) == 2); - try expect(@alignOf(@Vector(2, u1)) == 1); - try expect(@alignOf(@Vector(1, u1)) == 1); - try expect(@alignOf(@Vector(2, u16)) == 4); + try expect(@alignOf(@Vector(2, u8)) == switch (builtin.zig_backend) { + else => 2, + .stage2_x86_64 => 16, + }); + try expect(@alignOf(@Vector(2, u1)) == switch (builtin.zig_backend) { + else => 1, + .stage2_x86_64 => 16, + }); + try expect(@alignOf(@Vector(1, u1)) == switch (builtin.zig_backend) { + else => 1, + .stage2_x86_64 => 16, + }); + try expect(@alignOf(@Vector(2, u16)) == switch (builtin.zig_backend) { + else => 4, + .stage2_x86_64 => 16, + }); } test "loading the second vector from a slice of vectors" { @@ -1316,10 +1332,10 @@ test "modRem with zero divisor" { test "array operands to shuffle are coerced to vectors" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; const mask = [5]i32{ -1, 0, 1, 2, 3 };