diff --git a/lib/std/zig/Zir.zig b/lib/std/zig/Zir.zig index 18099b047e..fb8a4ef696 100644 --- a/lib/std/zig/Zir.zig +++ b/lib/std/zig/Zir.zig @@ -2142,7 +2142,7 @@ pub const Inst = struct { ref_start_index = static_len, _, - pub const static_len = 105; + pub const static_len = 109; pub fn toRef(i: Index) Inst.Ref { return @enumFromInt(@intFromEnum(Index.ref_start_index) + @intFromEnum(i)); @@ -2255,11 +2255,15 @@ pub const Inst = struct { vector_1_u256_type, vector_4_f16_type, vector_8_f16_type, + vector_16_f16_type, + vector_32_f16_type, vector_2_f32_type, vector_4_f32_type, vector_8_f32_type, + vector_16_f32_type, vector_2_f64_type, vector_4_f64_type, + vector_8_f64_type, optional_noreturn_type, anyerror_void_error_union_type, adhoc_inferred_error_set_type, diff --git a/src/Air.zig b/src/Air.zig index 188fb2b0e7..e1ee37c134 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -1038,11 +1038,15 @@ pub const Inst = struct { vector_1_u256_type = @intFromEnum(InternPool.Index.vector_1_u256_type), vector_4_f16_type = @intFromEnum(InternPool.Index.vector_4_f16_type), vector_8_f16_type = @intFromEnum(InternPool.Index.vector_8_f16_type), + vector_16_f16_type = @intFromEnum(InternPool.Index.vector_16_f16_type), + vector_32_f16_type = @intFromEnum(InternPool.Index.vector_32_f16_type), vector_2_f32_type = @intFromEnum(InternPool.Index.vector_2_f32_type), vector_4_f32_type = @intFromEnum(InternPool.Index.vector_4_f32_type), vector_8_f32_type = @intFromEnum(InternPool.Index.vector_8_f32_type), + vector_16_f32_type = @intFromEnum(InternPool.Index.vector_16_f32_type), vector_2_f64_type = @intFromEnum(InternPool.Index.vector_2_f64_type), vector_4_f64_type = @intFromEnum(InternPool.Index.vector_4_f64_type), + vector_8_f64_type = @intFromEnum(InternPool.Index.vector_8_f64_type), optional_noreturn_type = @intFromEnum(InternPool.Index.optional_noreturn_type), anyerror_void_error_union_type = @intFromEnum(InternPool.Index.anyerror_void_error_union_type), adhoc_inferred_error_set_type = @intFromEnum(InternPool.Index.adhoc_inferred_error_set_type), diff --git a/src/InternPool.zig b/src/InternPool.zig index adcf2fb797..8967d23aaa 100644 --- a/src/InternPool.zig +++ b/src/InternPool.zig @@ -4615,11 +4615,15 @@ pub const Index = enum(u32) { vector_1_u256_type, vector_4_f16_type, vector_8_f16_type, + vector_16_f16_type, + vector_32_f16_type, vector_2_f32_type, vector_4_f32_type, vector_8_f32_type, + vector_16_f32_type, vector_2_f64_type, vector_4_f64_type, + vector_8_f64_type, optional_noreturn_type, anyerror_void_error_union_type, @@ -5174,16 +5178,24 @@ pub const static_keys = [_]Key{ .{ .vector_type = .{ .len = 4, .child = .f16_type } }, // @Vector(8, f16) .{ .vector_type = .{ .len = 8, .child = .f16_type } }, + // @Vector(16, f16) + .{ .vector_type = .{ .len = 16, .child = .f16_type } }, + // @Vector(32, f16) + .{ .vector_type = .{ .len = 32, .child = .f16_type } }, // @Vector(2, f32) .{ .vector_type = .{ .len = 2, .child = .f32_type } }, // @Vector(4, f32) .{ .vector_type = .{ .len = 4, .child = .f32_type } }, // @Vector(8, f32) .{ .vector_type = .{ .len = 8, .child = .f32_type } }, + // @Vector(16, f32) + .{ .vector_type = .{ .len = 16, .child = .f32_type } }, // @Vector(2, f64) .{ .vector_type = .{ .len = 2, .child = .f64_type } }, // @Vector(4, f64) .{ .vector_type = .{ .len = 4, .child = .f64_type } }, + // @Vector(8, f64) + .{ .vector_type = .{ .len = 8, .child = .f64_type } }, // ?noreturn .{ .opt_type = .noreturn_type }, @@ -11847,11 +11859,15 @@ pub fn typeOf(ip: *const InternPool, index: Index) Index { .vector_1_u256_type, .vector_4_f16_type, .vector_8_f16_type, + .vector_16_f16_type, + .vector_32_f16_type, .vector_2_f32_type, .vector_4_f32_type, .vector_8_f32_type, + .vector_16_f32_type, .vector_2_f64_type, .vector_4_f64_type, + .vector_8_f64_type, .optional_noreturn_type, .anyerror_void_error_union_type, .adhoc_inferred_error_set_type, @@ -12175,11 +12191,15 @@ pub fn zigTypeTag(ip: *const InternPool, index: Index) std.builtin.TypeId { .vector_1_u256_type, .vector_4_f16_type, .vector_8_f16_type, + .vector_16_f16_type, + .vector_32_f16_type, .vector_2_f32_type, .vector_4_f32_type, .vector_8_f32_type, + .vector_16_f32_type, .vector_2_f64_type, .vector_4_f64_type, + .vector_8_f64_type, => .vector, .optional_noreturn_type => .optional, diff --git a/src/Sema.zig b/src/Sema.zig index f64ce0754a..2b117aad82 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -36571,11 +36571,15 @@ pub fn typeHasOnePossibleValue(sema: *Sema, ty: Type) CompileError!?Value { .vector_1_u256_type, .vector_4_f16_type, .vector_8_f16_type, + .vector_16_f16_type, + .vector_32_f16_type, .vector_2_f32_type, .vector_4_f32_type, .vector_8_f32_type, + .vector_16_f32_type, .vector_2_f64_type, .vector_4_f64_type, + .vector_8_f64_type, .anyerror_void_error_union_type, => null, .void_type => Value.void, diff --git a/src/Type.zig b/src/Type.zig index e278384d43..17a58807eb 100644 --- a/src/Type.zig +++ b/src/Type.zig @@ -4136,11 +4136,15 @@ pub const vector_2_u128: Type = .{ .ip_index = .vector_2_u128_type }; pub const vector_1_u256: Type = .{ .ip_index = .vector_1_u256_type }; pub const vector_4_f16: Type = .{ .ip_index = .vector_4_f16_type }; pub const vector_8_f16: Type = .{ .ip_index = .vector_8_f16_type }; +pub const vector_16_f16: Type = .{ .ip_index = .vector_16_f16_type }; +pub const vector_32_f16: Type = .{ .ip_index = .vector_32_f16_type }; pub const vector_2_f32: Type = .{ .ip_index = .vector_2_f32_type }; pub const vector_4_f32: Type = .{ .ip_index = .vector_4_f32_type }; pub const vector_8_f32: Type = .{ .ip_index = .vector_8_f32_type }; +pub const vector_16_f32: Type = .{ .ip_index = .vector_16_f32_type }; pub const vector_2_f64: Type = .{ .ip_index = .vector_2_f64_type }; pub const vector_4_f64: Type = .{ .ip_index = .vector_4_f64_type }; +pub const vector_8_f64: Type = .{ .ip_index = .vector_8_f64_type }; pub const empty_tuple: Type = .{ .ip_index = .empty_tuple_type }; diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index f20d079035..88c411c462 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2389,7 +2389,7 @@ fn genBodyBlock(self: *CodeGen, body: []const Air.Inst.Index) InnerError!void { } fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { - @setEvalBranchQuota(23_600); + @setEvalBranchQuota(23_800); const pt = cg.pt; const zcu = pt.zcu; const ip = &zcu.intern_pool; @@ -68441,7 +68441,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -68465,7 +68465,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -68489,7 +68489,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -68517,7 +68517,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -68546,7 +68546,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -68575,7 +68575,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -68642,7 +68642,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -68668,7 +68668,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -68694,7 +68694,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -68724,7 +68724,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -68755,7 +68755,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -68786,7 +68786,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -68856,7 +68856,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -68882,7 +68882,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -68908,7 +68908,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -68938,7 +68938,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -68969,7 +68969,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -69000,7 +69000,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -69070,7 +69070,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69096,7 +69096,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69122,7 +69122,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -69149,7 +69149,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } }, - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -69178,7 +69178,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } }, - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -69247,7 +69247,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69274,7 +69274,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69301,7 +69301,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -69329,7 +69329,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } }, - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -69360,7 +69360,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } }, - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -69432,7 +69432,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69459,7 +69459,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69486,7 +69486,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -69515,7 +69515,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } }, - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, @@ -69546,7 +69546,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .word, .smear = 8 } } }, - .{ .type = .vector_16_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .unused, .unused, @@ -69618,7 +69618,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69644,7 +69644,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69670,7 +69670,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -69698,7 +69698,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword, .smear = 8 } } }, - .{ .type = .vector_32_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_32_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } }, .unused, @@ -69768,7 +69768,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69795,7 +69795,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69822,7 +69822,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -69851,7 +69851,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword, .smear = 8 } } }, - .{ .type = .vector_32_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_32_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } }, .unused, @@ -69924,7 +69924,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69951,7 +69951,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -69978,7 +69978,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, .unused, @@ -70007,7 +70007,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, .extra_temps = .{ .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .dword, .smear = 8 } } }, - .{ .type = .vector_32_u8, .kind = .reverse_bits_mem }, + .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .reverse } }, .{ .type = .vector_32_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } }, .unused, @@ -70080,7 +70080,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70106,7 +70106,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70132,7 +70132,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .unused, .unused, .unused, @@ -70203,7 +70203,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, .unused, .unused, @@ -70231,7 +70231,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .unused, .unused, .unused, @@ -70305,7 +70305,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70332,7 +70332,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .qword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70359,7 +70359,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .unused, .unused, .unused, @@ -70433,7 +70433,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70459,7 +70459,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70485,7 +70485,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, @@ -70518,7 +70518,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, @@ -70551,7 +70551,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .vector_16_u8, .kind = .{ .mut_rc = .{ .ref = .src0, .rc = .sse } } }, @@ -70584,7 +70584,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_mut_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, @@ -70617,7 +70617,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_32_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .repeat = 2, .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .unused, @@ -70644,7 +70644,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ .src = .{ .to_sse, .none, .none } }, }, .extra_temps = .{ - .{ .type = .vector_32_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, @@ -70676,7 +70676,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .extra_temps = .{ .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, - .{ .type = .vector_32_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_32_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_32_u8, .kind = .{ .pshufb_bswap_mem = .{ .repeat = 2, .size = .xword } } }, .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } }, .{ .type = .vector_32_u8, .kind = .{ .rc = .sse } }, @@ -70712,7 +70712,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .extra_temps = .{ .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, - .{ .type = .vector_16_u8, .kind = .forward_bits_mem }, + .{ .type = .vector_16_u8, .kind = .{ .bits_mem = .forward } }, .{ .type = .vector_16_u8, .kind = .{ .pshufb_bswap_mem = .{ .size = .xword } } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, .{ .type = .vector_16_u8, .kind = .{ .rc = .sse } }, @@ -129477,8 +129477,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { switch (reduce.operation) { .And, .Or, .Xor => unreachable, .Min, .Max => break :fallback try cg.airReduce(inst), - .Add => {}, - .Mul => break :fallback try cg.airReduce(inst), + .Add, .Mul => {}, } var ops = try cg.tempsFromOperands(inst, .{reduce.operand}); var res: [1]Temp = undefined; @@ -129894,7 +129893,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, } }, }, .{ - .required_features = .{ .avx512f, .f16c, null, null }, + .required_features = .{ .avx512f, null, null, null }, .dst_constraints = .{ .{ .float = .word }, .any }, .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, .patterns = &.{ @@ -129938,7 +129937,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, } }, }, .{ - .required_features = .{ .avx512f, .f16c, null, null }, + .required_features = .{ .avx512f, null, null, null }, .dst_constraints = .{ .{ .float = .word }, .any }, .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, .patterns = &.{ @@ -130051,7 +130050,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, } }, }, .{ - .required_features = .{ .avx512f, .f16c, null, null }, + .required_features = .{ .avx512f, null, null, null }, .dst_constraints = .{ .{ .float = .word }, .any }, .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, .patterns = &.{ @@ -130685,10 +130684,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .dst_temps = .{ .{ .rc = .sse }, .unused }, .each = .{ .once = &.{ .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, - .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ }, - .{ ._, .v_ps, .mova, .tmp2y, .memd(.src0y, 32), ._, ._ }, - .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .lea(.tmp0y), ._ }, - .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .lead(.tmp0y, 32), ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_ps, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ }, + .{ ._, .v_ps, .@"and", .tmp2y, .tmp2y, .memd(.src0y, 32), ._ }, .{ ._, .v_ps, .add, .dst0y, .dst0y, .tmp2y, ._ }, .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, .{ ._, .v_ps, .add, .dst0x, .dst0x, .tmp2x, ._ }, @@ -131248,10 +131247,10 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .dst_temps = .{ .{ .rc = .sse }, .unused }, .each = .{ .once = &.{ .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, - .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ }, - .{ ._, .v_pd, .mova, .tmp2y, .memd(.src0y, 32), ._, ._ }, - .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .lea(.tmp0y), ._ }, - .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .lead(.tmp0y, 32), ._ }, + .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_pd, .mova, .tmp2y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ }, + .{ ._, .v_pd, .@"and", .tmp2y, .tmp2y, .memd(.src0y, 32), ._ }, .{ ._, .v_pd, .add, .dst0y, .dst0y, .tmp2y, ._ }, .{ ._, .v_i128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, .{ ._, .v_pd, .add, .dst0x, .dst0x, .tmp2x, ._ }, @@ -131744,7 +131743,1549 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, } }, } }, - .Mul => unreachable, + .Mul => comptime &.{ .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .dword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .mem, .none, .none } }, + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .mem, .none, .none } }, + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .mem, .none, .none } }, + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .cvtph2, .dst0x, .src0q, ._, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_8_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0x, .src0x, .lea(.tmp0x), ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"or", .dst0x, .src0x, .lea(.tmp0x), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .mem, .none, .none } }, + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_16_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .src0x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp0y, .tmp0x, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp0y, ._ }, + .{ ._, .v_f128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_32_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memd(.src0y, 32), ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ }, + .{ ._, .v_f128, .extract, .tmp4x, .tmp3y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ }, + .{ ._, .v_ps, .mul, .tmp3y, .tmp3y, .tmp4y, ._ }, + .{ ._, .v_f128, .extract, .tmp4x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp4y, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .mova, .tmp1y, .memd(.src0y, 32), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ }, + .{ ._, .v_f128, .extract, .tmp2x, .tmp1y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .tmp1y, .tmp1x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, + .{ ._, .v_ps, .mul, .tmp1y, .tmp1y, .tmp2y, ._ }, + .{ ._, .v_f128, .extract, .tmp2x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp2y, .tmp2x, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp2y, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp1y, ._ }, + .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, + .{ .@"0:", .v_ps, .cvtph2, .tmp1y, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp1y, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_32_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memad(.src0y, .add_size, -32), ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-80, .src0, .add_size), ._, ._ }, + .{ ._, .v_f128, .extract, .tmp4x, .tmp3y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ }, + .{ ._, .v_ps, .mul, .tmp3y, .tmp3y, .tmp4y, ._ }, + .{ ._, .v_f128, .extract, .tmp4x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp4y, .tmp4x, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp4y, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ .@"0:", .v_ps, .cvtph2, .tmp3y, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .f16c, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_16_f16, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .cvtph2, .dst0y, .dst0x, ._, ._ }, + .{ ._, .v_ps, .cvtph2, .tmp3y, .tmp3x, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ .@"0:", .v_ps, .cvtph2, .tmp3y, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b11_10_11_10) }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_, .cvtps2ph, .dst0q, .dst0x, .rm(.{}), ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .call_frame = .{ .alignment = .@"16" }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f16, .kind = .{ .reg = .xmm1 } }, + .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__mulhf3" } } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .reg = .xmm0 }, .unused }, + .clobbers = .{ .eflags = true, .caller_preserved = .ccc }, + .each = .{ .once = &.{ + .{ ._, .vp_, .xor, .dst0x, .dst0x, .dst0x, ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .vp_w, .insr, .dst0x, .dst0x, .memad(.src0w, .add_unaligned_size, -2), .ui(0) }, + .{ .@"0:", .vp_, .xor, .tmp1x, .tmp1x, .tmp1x, ._ }, + .{ ._, .vp_w, .insr, .tmp1x, .tmp1x, .memi(.src0w, .tmp0), .ui(0) }, + .{ ._, ._, .call, .tmp2d, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .call_frame = .{ .alignment = .@"16" }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f16, .kind = .{ .reg = .xmm1 } }, + .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__mulhf3" } } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .reg = .xmm0 }, .unused }, + .clobbers = .{ .eflags = true, .caller_preserved = .ccc }, + .each = .{ .once = &.{ + .{ ._, .p_, .xor, .dst0x, .dst0x, ._, ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .p_w, .insr, .dst0x, .memad(.src0w, .add_unaligned_size, -2), .ui(0), ._ }, + .{ .@"0:", .p_, .xor, .tmp1x, .tmp1x, ._, ._ }, + .{ ._, .p_w, .insr, .tmp1x, .memi(.src0w, .tmp0), .ui(0), ._ }, + .{ ._, ._, .call, .tmp2d, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .word }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .word, .is = .word } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .call_frame = .{ .alignment = .@"16" }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f16, .kind = .{ .reg = .ax } }, + .{ .type = .f32, .kind = .mem }, + .{ .type = .f16, .kind = .{ .reg = .xmm1 } }, + .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__mulhf3" } } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .reg = .xmm0 }, .unused }, + .clobbers = .{ .eflags = true, .caller_preserved = .ccc }, + .each = .{ .once = &.{ + .{ ._, ._ps, .xor, .dst0x, .dst0x, ._, ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-4, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, ._, .movzx, .tmp1d, .memad(.src0w, .add_unaligned_size, -2), ._, ._ }, + .{ ._, ._, .mov, .mem(.tmp2d), .tmp1d, ._, ._ }, + .{ ._, ._ss, .mov, .dst0x, .mem(.tmp2d), ._, ._ }, + .{ .@"0:", ._ps, .xor, .tmp3x, .tmp3x, ._, ._ }, + .{ ._, ._ss, .mov, .tmp3x, .memi(.src0d, .tmp0), ._, ._ }, + .{ ._, ._, .call, .tmp4d, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(2), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .shuf, .tmp0x, .src0x, .src0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .src0x, .tmp0x, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .qword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mut_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .ref = .src0 }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._ps, .mova, .tmp0x, .src0x, ._, ._ }, + .{ ._, ._ps, .shuf, .tmp0x, .tmp0x, .ui(0b01_01_01_01), ._ }, + .{ ._, ._ss, .mul, .dst0x, .tmp0x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ }, + .{ ._, .v_ps, .mul, .tmp0x, .src0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .dst0x, .src0x, .src0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .tmp0x, .dst0x, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mut_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .ref = .src0 }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ }, + .{ ._, ._ss, .mul, .tmp0x, .src0x, ._, ._ }, + .{ ._, ._ps, .shuf, .dst0x, .src0x, .ui(0b01_01_01_01), ._ }, + .{ ._, ._ss, .mul, .dst0x, .tmp0x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .src0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .src0x, .src0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .src0x, .tmp0x, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mut_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .ref = .src0 }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ }, + .{ ._, ._ps, .mul, .dst0x, .tmp0x, ._, ._ }, + .{ ._, ._ps, .mova, .tmp0x, .dst0x, ._, ._ }, + .{ ._, ._ps, .shuf, .tmp0x, .tmp0x, .ui(0b01_01_01_01), ._ }, + .{ ._, ._ss, .mul, .dst0x, .tmp0x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_8_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .src0y, .lea(.tmp0y), ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .src0y, .lea(.tmp0y), ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_f128, .extract, .tmp1x, .src0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .src0x, .tmp1x, ._ }, + .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_16_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ }, + .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memd(.src0y, 32), ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, .v_i128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .mova, .dst0y, .mem(.src0y), ._, ._ }, + .{ ._, .v_ps, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp0y, ._ }, + .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .shuf, .tmp0x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ }, + .{ .@"0:", .v_ps, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, + .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .shuf, .tmp1x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, + .{ .@"0:", ._ps, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ }, + .{ ._, ._ps, .mul, .dst0x, .tmp1x, ._, ._ }, + .{ ._, ._ps, .mova, .tmp1x, .dst0x, ._, ._ }, + .{ ._, ._ps, .shuf, .tmp1x, .tmp1x, .ui(0b01_01_01_01), ._ }, + .{ ._, ._ss, .mul, .dst0x, .tmp1x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_16_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_ps, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"and", .tmp3y, .tmp3y, .memad(.src0y, .add_size, -32), ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_ps, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ }, + .{ .@"0:", .v_ps, .mul, .tmp3y, .tmp3y, .memid(.src0y, .tmp0, 32), ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, + .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_ps, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_8_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_ps, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_ps, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, + .{ ._, .v_ps, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ .@"0:", .v_ps, .mul, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_ps, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .shuf, .tmp3x, .dst0x, .dst0x, .ui(0b01_01_01_01) }, + .{ ._, .v_ss, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .dword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .dword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_4_f32, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, ._ps, .mova, .dst0x, .lea(.tmp0x), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, ._ps, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ }, + .{ ._, ._ps, .@"or", .dst0x, .lea(.tmp0x), ._, ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ }, + .{ .@"0:", ._ps, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, ._ps, .xor, .tmp3x, .tmp3x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp3x, .dst0x, ._, ._ }, + .{ ._, ._ps, .mul, .dst0x, .tmp3x, ._, ._ }, + .{ ._, ._ps, .mova, .tmp3x, .dst0x, ._, ._ }, + .{ ._, ._ps, .shuf, .tmp3x, .tmp3x, .ui(0b01_01_01_01), ._ }, + .{ ._, ._ss, .mul, .dst0x, .tmp3x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_ps, .movhl, .tmp0x, .src0x, .src0x, ._ }, + .{ ._, .v_sd, .mul, .dst0x, .src0x, .tmp0x, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mut_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .ref = .src0 }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._ps, .xor, .tmp0x, .tmp0x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp0x, .src0x, ._, ._ }, + .{ ._, ._sd, .mul, .dst0x, .tmp0x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .tmp0x, .src0x, .tmp0x, ._ }, + .{ ._, .v_ps, .movhl, .dst0x, .src0x, .src0x, ._ }, + .{ ._, .v_sd, .mul, .dst0x, .tmp0x, .dst0x, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_sse, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .mut_rc = .{ .ref = .src0, .rc = .sse } }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_f128, .extract, .tmp0x, .src0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .dst0x, .src0x, .tmp0x, ._ }, + .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ }, + .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .exclusive_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .usize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_8_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_8_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_pd, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .mem(.src0y), ._ }, + .{ ._, .v_pd, .@"and", .tmp3y, .tmp3y, .memd(.src0y, 32), ._ }, + .{ ._, .v_pd, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_pd, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ }, + .{ ._, .v_pd, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, .v_i128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .exact_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .each = .{ .once = &.{ + .{ ._, .v_pd, .mova, .dst0y, .mem(.src0y), ._, ._ }, + .{ ._, .v_pd, .mova, .tmp0y, .memd(.src0y, 32), ._, ._ }, + .{ ._, .v_pd, .mul, .dst0y, .dst0y, .tmp0y, ._ }, + .{ ._, .v_i128, .extract, .tmp0x, .dst0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + .{ ._, .v_ps, .movhl, .tmp0x, .dst0x, .dst0x, ._ }, + .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp0x, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_4_f32, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-64, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .v_pd, .mova, .dst0y, .memad(.src0y, .add_unaligned_size, -32), ._, ._ }, + .{ .@"0:", .v_pd, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, + .{ ._, ._, .sub, .tmp0d, .si(32), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_f128, .extract, .tmp1x, .dst0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + .{ ._, .v_ps, .movhl, .tmp1x, .dst0x, .dst0x, ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp1x, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .unaligned_multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, ._pd, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, + .{ .@"0:", ._pd, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, ._ps, .xor, .tmp1x, .tmp1x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp1x, .dst0x, ._, ._ }, + .{ ._, ._sd, .mul, .dst0x, .tmp1x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx512f, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .zword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_64_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_8_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_4_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, .v_pd, .mova, .tmp3y, .lead(.tmp0y, 32), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_pd, .@"and", .tmp3y, .tmp3y, .memad(.src0y, .add_size, -32), ._ }, + .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -64), ._ }, + .{ ._, .v_pd, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, .v_pd, .@"or", .tmp3y, .tmp3y, .lead(.tmp0y, 32), ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-128, .src0, .add_size), ._, ._ }, + .{ .@"0:", .v_pd, .mul, .tmp3y, .tmp3y, .memid(.src0y, .tmp0, 32), ._ }, + .{ ._, .v_pd, .mul, .dst0y, .dst0y, .memi(.src0y, .tmp0), ._ }, + .{ ._, ._, .sub, .tmp0d, .si(64), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_pd, .mul, .dst0y, .dst0y, .tmp3y, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_sd, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .yword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_32_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_4_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, .v_pd, .mova, .dst0y, .lea(.tmp0y), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, .v_pd, .@"and", .dst0y, .dst0y, .memad(.src0y, .add_size, -32), ._ }, + .{ ._, .v_pd, .@"or", .dst0y, .dst0y, .lea(.tmp0y), ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-48, .src0, .add_size), ._, ._ }, + .{ ._, .v_f128, .extract, .tmp3x, .dst0y, .ui(1), ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + .{ .@"0:", .v_pd, .mul, .dst0x, .dst0x, .memi(.src0x, .tmp0), ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .v_ps, .movhl, .tmp3x, .dst0x, .dst0x, ._ }, + .{ ._, .v_pd, .mul, .dst0x, .dst0x, .tmp3x, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .vector_16_u8, .kind = .{ .pand_mask_mem = .{ .ref = .src0 } } }, + .{ .type = .vector_2_f64, .kind = .{ .splat_float_mem = .{ .ref = .src0, .val = 1.0, .fill = .outside } } }, + .{ .type = .vector_2_f64, .kind = .{ .rc = .sse } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .rc = .sse }, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .lea, .tmp0p, .mem(.tmp1), ._, ._ }, + .{ ._, ._pd, .mova, .dst0x, .lea(.tmp0x), ._, ._ }, + .{ ._, ._, .lea, .tmp0p, .mem(.tmp2), ._, ._ }, + .{ ._, ._pd, .@"and", .dst0x, .memad(.src0x, .add_size, -16), ._, ._ }, + .{ ._, ._pd, .@"or", .dst0x, .lea(.tmp0x), ._, ._ }, + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_size), ._, ._ }, + .{ .@"0:", ._pd, .mul, .dst0x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, ._ps, .xor, .tmp3x, .tmp3x, ._, ._ }, + .{ ._, ._ps, .movhl, .tmp3x, .dst0x, ._, ._ }, + .{ ._, ._pd, .mul, .dst0x, .tmp3x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .x87, null, null, null }, + .dst_constraints = .{ .{ .float = .qword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .qword, .is = .qword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f64, .kind = .{ .reg = .st7 } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .mem, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0p, .sia(-16, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .f_, .ld, .memad(.src0q, .add_unaligned_size, -8), ._, ._, ._ }, + .{ .@"0:", .f_, .mul, .memi(.src0q, .tmp0), ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0p, .si(8), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .f_p, .st, .dst0q, ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .x87, null, null, null }, + .dst_constraints = .{ .{ .float = .tbyte }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .qword, .is = .tbyte } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .extra_temps = .{ + .{ .type = .isize, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f80, .kind = .{ .reg = .st6 } }, + .{ .type = .f80, .kind = .{ .reg = .st7 } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .mem, .unused }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0p, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .f_, .ld, .memad(.src0t, .add_unaligned_size, -16), ._, ._, ._ }, + .{ .@"0:", .f_, .ld, .memi(.src0t, .tmp0), ._, ._, ._ }, + .{ ._, .f_p, .mul, ._, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0p, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + .{ ._, .f_p, .st, .dst0t, ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .dst_constraints = .{ .{ .float = .xword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .call_frame = .{ .alignment = .@"16" }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f128, .kind = .{ .reg = .xmm1 } }, + .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__multf3" } } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .reg = .xmm0 }, .unused }, + .clobbers = .{ .eflags = true, .caller_preserved = .ccc }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, .v_dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, + .{ .@"0:", .v_dqa, .mov, .tmp1x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .call, .tmp2d, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .dst_constraints = .{ .{ .float = .xword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .call_frame = .{ .alignment = .@"16" }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f128, .kind = .{ .reg = .xmm1 } }, + .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__multf3" } } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .reg = .xmm0 }, .unused }, + .clobbers = .{ .eflags = true, .caller_preserved = .ccc }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, ._dqa, .mov, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, + .{ .@"0:", ._dqa, .mov, .tmp1x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .call, .tmp2d, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .dst_constraints = .{ .{ .float = .xword }, .any }, + .src_constraints = .{ .{ .multiple_scalar_float = .{ .of = .xword, .is = .xword } }, .any, .any }, + .patterns = &.{ + .{ .src = .{ .to_mem, .none, .none } }, + }, + .call_frame = .{ .alignment = .@"16" }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .f128, .kind = .{ .reg = .xmm1 } }, + .{ .type = .usize, .kind = .{ .symbol = &.{ .name = "__multf3" } } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .dst_temps = .{ .{ .reg = .xmm0 }, .unused }, + .clobbers = .{ .eflags = true, .caller_preserved = .ccc }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .sia(-32, .src0, .add_unaligned_size), ._, ._ }, + .{ ._, ._ps, .mova, .dst0x, .memad(.src0x, .add_unaligned_size, -16), ._, ._ }, + .{ .@"0:", ._ps, .mova, .tmp1x, .memi(.src0x, .tmp0), ._, ._ }, + .{ ._, ._, .call, .tmp2d, ._, ._, ._ }, + .{ ._, ._, .sub, .tmp0d, .si(16), ._, ._ }, + .{ ._, ._nb, .j, .@"0b", ._, ._, ._ }, + } }, + } }, }) catch |err| switch (err) { error.SelectFailed => return cg.fail("failed to select {s} {} {}", .{ @tagName(air_tag), @@ -149031,7 +150572,7 @@ fn genCopy(self: *CodeGen, ty: Type, dst_mcv: MCValue, src_mcv: MCValue, opts: C } else if (self.hasFeature(.sse4_1)) { try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[0].to64(), src_reg.to128()); try self.asmRegisterRegisterImmediate(.{ .p_q, .extr }, dst_regs[1].to64(), src_reg.to128(), .u(1)); - } else { + } else if (self.hasFeature(.sse2)) { const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse); const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); defer self.register_manager.unlockReg(tmp_lock); @@ -149039,6 +150580,19 @@ fn genCopy(self: *CodeGen, ty: Type, dst_mcv: MCValue, src_mcv: MCValue, opts: C try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[0].to64(), src_reg.to128()); try self.asmRegisterRegister(.{ ._ps, .movhl }, tmp_reg.to128(), src_reg.to128()); try self.asmRegisterRegister(.{ ._q, .mov }, dst_regs[1].to64(), tmp_reg.to128()); + } else { + const frame_index = try self.allocFrameIndex(.init(.{ + .size = 16, + .alignment = .@"16", + })); + try self.asmMemoryRegister(.{ ._ps, .mova }, .{ + .base = .{ .frame = frame_index }, + .mod = .{ .rm = .{ .size = .xword } }, + }, src_reg.to128()); + for (dst_regs, 0..) |dst_reg, dst_index| try self.asmRegisterMemory(.{ ._, .mov }, dst_reg.to64(), .{ + .base = .{ .frame = frame_index }, + .mod = .{ .rm = .{ .size = .qword, .disp = @intCast(8 * dst_index) } }, + }); } return; } else unreachable, @@ -149282,7 +150836,7 @@ fn genSetReg( }, ), .x87 => switch (src_reg.class()) { - .general_purpose, .gphi, .segment => unreachable, + .general_purpose, .gphi, .segment, .mmx, .ip, .cr, .dr => unreachable, .x87 => switch (src_reg) { .st0 => try self.asmRegister(.{ .f_, .st }, dst_reg), .st1, .st2, .st3, .st4, .st5, .st6 => switch (dst_reg) { @@ -149307,7 +150861,25 @@ fn genSetReg( }, else => unreachable, }, - .mmx, .sse, .ip, .cr, .dr => unreachable, + .sse => if (abi_size <= 16) { + const frame_index = try self.allocFrameIndex(.init(.{ + .size = 16, + .alignment = .@"16", + })); + try self.asmMemoryRegister(if (self.hasFeature(.avx)) + .{ .v_dqa, .mov } + else if (self.hasFeature(.sse2)) + .{ ._dqa, .mov } + else + .{ ._ps, .mova }, .{ + .base = .{ .frame = frame_index }, + .mod = .{ .rm = .{ .size = .xword } }, + }, src_reg.to128()); + try MoveStrategy.read(.load_store_x87, self, dst_reg, .{ + .base = .{ .frame = frame_index }, + .mod = .{ .rm = .{ .size = .tbyte } }, + }); + } else unreachable, }, .mmx => unreachable, .sse => switch (src_reg.class()) { @@ -149349,7 +150921,7 @@ fn genSetReg( .{ .register = try self.copyToTmpRegister(ty, src_mcv) }, opts, ), - .x87 => { + .x87 => if (abi_size <= 16) { const frame_index = try self.allocFrameIndex(.init(.{ .size = 16, .alignment = .@"16", @@ -149367,7 +150939,7 @@ fn genSetReg( .base = .{ .frame = frame_index }, .mod = .{ .rm = .{ .size = .xword } }, }); - }, + } else unreachable, .mmx, .ip, .cr, .dr => unreachable, .sse => try self.asmRegisterRegister( @as(?Mir.Inst.FixedTag, switch (ty.scalarType(zcu).zigTypeTag(zcu)) { @@ -149431,7 +151003,7 @@ fn genSetReg( } else if (self.hasFeature(.sse4_1)) { try self.asmRegisterRegister(.{ ._q, .mov }, dst_reg.to128(), src_regs[0].to64()); try self.asmRegisterRegisterImmediate(.{ .p_q, .insr }, dst_reg.to128(), src_regs[1].to64(), .u(1)); - } else { + } else if (self.hasFeature(.sse2)) { const tmp_reg = try self.register_manager.allocReg(null, abi.RegisterClass.sse); const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); defer self.register_manager.unlockReg(tmp_lock); @@ -149439,6 +151011,19 @@ fn genSetReg( try self.asmRegisterRegister(.{ ._q, .mov }, dst_reg.to128(), src_regs[0].to64()); try self.asmRegisterRegister(.{ ._q, .mov }, tmp_reg.to128(), src_regs[1].to64()); try self.asmRegisterRegister(.{ ._ps, .movlh }, dst_reg.to128(), tmp_reg.to128()); + } else { + const frame_index = try self.allocFrameIndex(.init(.{ + .size = 16, + .alignment = .@"16", + })); + for (src_regs, 0..) |src_reg, src_index| try self.asmMemoryRegister(.{ ._, .mov }, .{ + .base = .{ .frame = frame_index }, + .mod = .{ .rm = .{ .size = .qword, .disp = @intCast(8 * src_index) } }, + }, src_reg.to64()); + try self.asmRegisterMemory(.{ ._ps, .mova }, dst_reg.to128(), .{ + .base = .{ .frame = frame_index }, + .mod = .{ .rm = .{ .size = .xword } }, + }); } } else unreachable, else => unreachable, @@ -149746,11 +151331,18 @@ fn genSetMem( }, else => abi_size, }; - const src_alias = registerAlias(src_reg, abi_size); - const src_size: u32 = @intCast(switch (src_alias.class()) { - .general_purpose, .gphi, .segment, .x87, .ip, .cr, .dr => @divExact(src_alias.bitSize(), 8), + const src_alias = registerAlias(src_reg, @intCast(self.unalignedSize(ty))); + const src_class = src_alias.class(); + const src_size: u32 = switch (src_class) { + .general_purpose, .gphi, .segment, .ip, .cr, .dr => @intCast(@divExact(src_alias.bitSize(), 8)), .mmx, .sse => abi_size, - }); + .x87 => switch (abi.classifySystemV(ty, zcu, self.target, .other)[0]) { + else => unreachable, + .float, .float_combine => 4, + .sse => 8, + .x87 => 10, + }, + }; const src_align: InternPool.Alignment = .fromNonzeroByteUnits( std.math.ceilPowerOfTwoAssert(u32, src_size), ); @@ -149760,7 +151352,7 @@ fn genSetMem( .alignment = src_align, })); const frame_mcv: MCValue = .{ .load_frame = .{ .index = frame_index } }; - try (try self.moveStrategy(ty, src_alias.class(), true)).write( + try (try self.moveStrategy(ty, src_class, true)).write( self, .{ .base = .{ .frame = frame_index }, .mod = .{ .rm = .{ .size = .fromSize(src_size), @@ -149769,7 +151361,7 @@ fn genSetMem( ); try self.genSetMem(base, disp, ty, frame_mcv, opts); try self.freeValue(frame_mcv); - } else try (try self.moveStrategy(ty, src_alias.class(), switch (base) { + } else try (try self.moveStrategy(ty, src_class, switch (base) { .none => src_align.check(@as(u32, @bitCast(disp))), .reg => |reg| switch (reg) { .es, .cs, .ss, .ds => src_align.check(@as(u32, @bitCast(disp))), @@ -154146,7 +155738,7 @@ fn registerAlias(reg: Register, size_bytes: u32) Register { reg else unreachable, - .x87 => if (size_bytes >= 10 and size_bytes <= 16) + .x87 => if (size_bytes >= 4 and size_bytes <= 16) reg else unreachable, @@ -154433,7 +156025,10 @@ fn promoteVarArg(self: *CodeGen, ty: Type) Type { fn unalignedSize(cg: *CodeGen, ty: Type) u64 { const zcu = cg.pt.zcu; return switch (zcu.intern_pool.indexToKey(ty.toIntern())) { - .vector_type => |vector_type| Type.fromInterned(vector_type.child).abiSize(zcu) * vector_type.len, + .vector_type => |vector_type| switch (vector_type.child) { + .bool_type => ty.abiSize(zcu), + else => Type.fromInterned(vector_type.child).abiSize(zcu) * vector_type.len, + }, else => ty.abiSize(zcu), }; } @@ -155222,7 +156817,7 @@ const Temp = struct { else => |mcv| std.debug.panic("{s}: {}\n", .{ @src().fn_name, mcv }), .register => |val_reg| try src.readReg(opts.disp, val_ty, registerAlias( val_reg, - @intCast(val_ty.abiSize(cg.pt.zcu)), + @intCast(cg.unalignedSize(val_ty)), ), cg), inline .register_pair, .register_triple, .register_quadruple => |val_regs| { var disp = opts.disp; @@ -160731,8 +162326,8 @@ const Select = struct { pand_mask_mem: struct { ref: Select.Operand.Ref, invert: bool = false }, ptest_mask_mem: Select.Operand.Ref, pshufb_bswap_mem: struct { repeat: u4 = 1, size: Memory.Size, smear: u4 = 1 }, - forward_bits_mem, - reverse_bits_mem, + bits_mem: enum { forward, reverse }, + splat_float_mem: struct { ref: Select.Operand.Ref, val: f16, fill: enum { inside, outside } = .inside }, frame: FrameIndex, lazy_symbol: struct { kind: link.File.LazySymbol.Kind, ref: Select.Operand.Ref = .none }, symbol: *const struct { lib: ?[]const u8 = null, name: []const u8 }, @@ -161051,11 +162646,11 @@ const Select = struct { .pand_mask_mem => |mask_spec| { const zcu = pt.zcu; assert(spec.type.isVector(zcu) and spec.type.childType(zcu).toIntern() == .u8_type); - const ty = mask_spec.ref.typeOf(s); - assert(ty.isVector(zcu)); + const ref_ty = mask_spec.ref.typeOf(s); + assert(ref_ty.isVector(zcu)); var elem_buf: [64]u8 = undefined; const elems = elem_buf[0..spec.type.vectorLen(zcu)]; - const mask_len: usize = @intCast(cg.unalignedSize(ty) % elems.len); + const mask_len: usize = @intCast((cg.unalignedSize(ref_ty) - 1) % elems.len + 1); const invert_mask: u8 = switch (mask_spec.invert) { false => std.math.minInt(u8), true => std.math.maxInt(u8), @@ -161070,14 +162665,14 @@ const Select = struct { .ptest_mask_mem => |mask_ref| { const zcu = pt.zcu; assert(spec.type.isVector(zcu) and spec.type.childType(zcu).toIntern() == .u8_type); - const ty = mask_ref.typeOf(s); - assert(ty.isVector(zcu) and ty.childType(zcu).toIntern() == .bool_type); + const ref_ty = mask_ref.typeOf(s); + assert(ref_ty.isVector(zcu) and ref_ty.childType(zcu).toIntern() == .bool_type); const mask_info = mask_ref.valueOf(s).register_mask.info; var elem_buf: [64]u8 = @splat(0); const elems = elem_buf[0..spec.type.vectorLen(zcu)]; const elem_bytes: u6 = @intCast(@divExact(mask_info.scalar.bitSize(cg.target), 8)); var index: u7 = 0; - for (0..@intCast(ty.vectorLen(zcu))) |_| { + for (0..@intCast(ref_ty.vectorLen(zcu))) |_| { switch (mask_info.kind) { .sign => { @memset(elems[index..][0 .. elem_bytes - 1], std.math.minInt(u8)); @@ -161108,21 +162703,42 @@ const Select = struct { .storage = .{ .bytes = try zcu.intern_pool.getOrPutString(zcu.gpa, pt.tid, elems, .maybe_embedded_nulls) }, } }))), true }; }, - .forward_bits_mem, .reverse_bits_mem => { + .bits_mem => |direction| { const zcu = pt.zcu; assert(spec.type.isVector(zcu) and spec.type.childType(zcu).toIntern() == .u8_type); var bytes: [32]u8 = undefined; const elems = bytes[0..spec.type.vectorLen(zcu)]; - for (elems, 0..) |*elem, index| elem.* = switch (spec.kind) { - else => unreachable, - .forward_bits_mem => @as(u8, 1 << 0) << @truncate(index), - .reverse_bits_mem => @as(u8, 1 << 7) >> @truncate(index), + for (elems, 0..) |*elem, index| elem.* = switch (direction) { + .forward => @as(u8, 1 << 0) << @truncate(index), + .reverse => @as(u8, 1 << 7) >> @truncate(index), }; return .{ try cg.tempMemFromValue(.fromInterned(try pt.intern(.{ .aggregate = .{ .ty = spec.type.toIntern(), .storage = .{ .bytes = try zcu.intern_pool.getOrPutString(zcu.gpa, pt.tid, elems, .maybe_embedded_nulls) }, } }))), true }; }, + .splat_float_mem => |splat_spec| { + const zcu = pt.zcu; + assert(spec.type.isVector(zcu)); + const elem_ty = spec.type.childType(zcu); + const ref_ty = splat_spec.ref.typeOf(s); + assert(ref_ty.isVector(zcu) and ref_ty.childType(zcu).toIntern() == elem_ty.toIntern()); + var elem_buf: [@divExact(64, 2)]InternPool.Index = undefined; + const elems = elem_buf[0..spec.type.vectorLen(zcu)]; + const inside_len = (ref_ty.vectorLen(zcu) - 1) % elems.len + 1; + @memset(elems[0..inside_len], (try pt.floatValue(elem_ty, switch (splat_spec.fill) { + .inside => splat_spec.val, + .outside => 0.0, + })).toIntern()); + @memset(elems[inside_len..], (try pt.floatValue(elem_ty, switch (splat_spec.fill) { + .inside => 0.0, + .outside => splat_spec.val, + })).toIntern()); + return .{ try cg.tempMemFromValue(.fromInterned(try pt.intern(.{ .aggregate = .{ + .ty = spec.type.toIntern(), + .storage = .{ .elems = elems }, + } }))), true }; + }, .frame => |frame_index| .{ try cg.tempInit(spec.type, .{ .load_frame = .{ .index = frame_index } }), true }, .lazy_symbol => |lazy_symbol_spec| { const ip = &pt.zcu.intern_pool; diff --git a/src/codegen/c/Type.zig b/src/codegen/c/Type.zig index 2fa3856cfc..93e52cd2ec 100644 --- a/src/codegen/c/Type.zig +++ b/src/codegen/c/Type.zig @@ -1885,6 +1885,36 @@ pub const Pool = struct { }; return pool.fromFields(allocator, .@"struct", &fields, kind); }, + .vector_16_f16_type => { + const vector_ctype = try pool.getVector(allocator, .{ + .elem_ctype = .f16, + .len = 16, + }); + if (!kind.isParameter()) return vector_ctype; + var fields = [_]Info.Field{ + .{ + .name = .{ .index = .array }, + .ctype = vector_ctype, + .alignas = AlignAs.fromAbiAlignment(Type.f16.abiAlignment(zcu)), + }, + }; + return pool.fromFields(allocator, .@"struct", &fields, kind); + }, + .vector_32_f16_type => { + const vector_ctype = try pool.getVector(allocator, .{ + .elem_ctype = .f16, + .len = 32, + }); + if (!kind.isParameter()) return vector_ctype; + var fields = [_]Info.Field{ + .{ + .name = .{ .index = .array }, + .ctype = vector_ctype, + .alignas = AlignAs.fromAbiAlignment(Type.f16.abiAlignment(zcu)), + }, + }; + return pool.fromFields(allocator, .@"struct", &fields, kind); + }, .vector_2_f32_type => { const vector_ctype = try pool.getVector(allocator, .{ .elem_ctype = .f32, @@ -1930,6 +1960,21 @@ pub const Pool = struct { }; return pool.fromFields(allocator, .@"struct", &fields, kind); }, + .vector_16_f32_type => { + const vector_ctype = try pool.getVector(allocator, .{ + .elem_ctype = .f32, + .len = 16, + }); + if (!kind.isParameter()) return vector_ctype; + var fields = [_]Info.Field{ + .{ + .name = .{ .index = .array }, + .ctype = vector_ctype, + .alignas = AlignAs.fromAbiAlignment(Type.f32.abiAlignment(zcu)), + }, + }; + return pool.fromFields(allocator, .@"struct", &fields, kind); + }, .vector_2_f64_type => { const vector_ctype = try pool.getVector(allocator, .{ .elem_ctype = .f64, @@ -1960,6 +2005,21 @@ pub const Pool = struct { }; return pool.fromFields(allocator, .@"struct", &fields, kind); }, + .vector_8_f64_type => { + const vector_ctype = try pool.getVector(allocator, .{ + .elem_ctype = .f64, + .len = 8, + }); + if (!kind.isParameter()) return vector_ctype; + var fields = [_]Info.Field{ + .{ + .name = .{ .index = .array }, + .ctype = vector_ctype, + .alignas = AlignAs.fromAbiAlignment(Type.f64.abiAlignment(zcu)), + }, + }; + return pool.fromFields(allocator, .@"struct", &fields, kind); + }, .undef, .zero, diff --git a/test/behavior/x86_64/math.zig b/test/behavior/x86_64/math.zig index 5b31876edc..bc2b417620 100644 --- a/test/behavior/x86_64/math.zig +++ b/test/behavior/x86_64/math.zig @@ -125,7 +125,7 @@ fn boolOr(lhs: anytype, rhs: @TypeOf(lhs)) @TypeOf(lhs) { @compileError("unsupported boolOr type: " ++ @typeName(@TypeOf(lhs))); } -pub const Compare = enum { strict, relaxed, approx, approx_int }; +pub const Compare = enum { strict, relaxed, approx, approx_int, approx_or_overflow }; // noinline for a more helpful stack trace pub noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected), comptime compare: Compare) !void { const Expected = @TypeOf(expected); @@ -137,20 +137,32 @@ pub noinline fn checkExpected(expected: anytype, actual: @TypeOf(expected), comp break :unexpected switch (compare) { .strict => boolOr(unequal, sign(expected) != sign(actual)), .relaxed => unequal, - .approx, .approx_int => comptime unreachable, + .approx, .approx_int, .approx_or_overflow => comptime unreachable, }; }, - .approx, .approx_int => { + .approx, .approx_int, .approx_or_overflow => { const epsilon = math.floatEps(Scalar(Expected)); - const tolerance = @sqrt(epsilon); - break :unexpected @abs(expected - actual) > @max( + const tolerance = switch (compare) { + .strict, .relaxed => comptime unreachable, + .approx, .approx_int => @sqrt(epsilon), + .approx_or_overflow => @exp2(@log2(epsilon) * 0.4), + }; + const approx_unequal = @abs(expected - actual) > @max( @abs(expected) * splat(Expected, tolerance), splat(Expected, switch (compare) { .strict, .relaxed => comptime unreachable, - .approx => tolerance, + .approx, .approx_or_overflow => tolerance, .approx_int => 1, }), ); + break :unexpected switch (compare) { + .strict, .relaxed => comptime unreachable, + .approx, .approx_int => approx_unequal, + .approx_or_overflow => boolAnd(approx_unequal, boolOr(boolAnd( + @abs(expected) != splat(Expected, inf(Expected)), + @abs(actual) != splat(Expected, inf(Expected)), + ), sign(expected) != sign(actual))), + }; }, }, .@"struct" => |@"struct"| inline for (@"struct".fields) |field| { diff --git a/test/behavior/x86_64/unary.zig b/test/behavior/x86_64/unary.zig index 839feb02de..53793e9635 100644 --- a/test/behavior/x86_64/unary.zig +++ b/test/behavior/x86_64/unary.zig @@ -5119,6 +5119,15 @@ test reduceAddOptimized { try test_reduce_add_optimized.testFloatVectors(); } +inline fn reduceMulOptimized(comptime Type: type, rhs: Type) @typeInfo(Type).vector.child { + @setFloatMode(.optimized); + return @reduce(.Mul, rhs); +} +test reduceMulOptimized { + const test_reduce_mul_optimized = unary(reduceMulOptimized, .{ .compare = .approx_or_overflow }); + try test_reduce_mul_optimized.testFloatVectors(); +} + inline fn splat(comptime Type: type, rhs: Type) Type { return @splat(rhs[0]); } diff --git a/test/cases/compile_errors/@import_zon_bad_type.zig b/test/cases/compile_errors/@import_zon_bad_type.zig index 08da4aaff5..15282956f2 100644 --- a/test/cases/compile_errors/@import_zon_bad_type.zig +++ b/test/cases/compile_errors/@import_zon_bad_type.zig @@ -117,9 +117,9 @@ export fn testMutablePointer() void { // tmp.zig:37:38: note: imported here // neg_inf.zon:1:1: error: expected type '?u8' // tmp.zig:57:28: note: imported here -// neg_inf.zon:1:1: error: expected type 'tmp.testNonExhaustiveEnum__enum_505' +// neg_inf.zon:1:1: error: expected type 'tmp.testNonExhaustiveEnum__enum_509' // tmp.zig:62:39: note: imported here -// neg_inf.zon:1:1: error: expected type 'tmp.testUntaggedUnion__union_507' +// neg_inf.zon:1:1: error: expected type 'tmp.testUntaggedUnion__union_511' // tmp.zig:67:44: note: imported here -// neg_inf.zon:1:1: error: expected type 'tmp.testTaggedUnionVoid__union_510' +// neg_inf.zon:1:1: error: expected type 'tmp.testTaggedUnionVoid__union_514' // tmp.zig:72:50: note: imported here diff --git a/test/cases/compile_errors/anytype_param_requires_comptime.zig b/test/cases/compile_errors/anytype_param_requires_comptime.zig index d96e54edb3..d85a5559f4 100644 --- a/test/cases/compile_errors/anytype_param_requires_comptime.zig +++ b/test/cases/compile_errors/anytype_param_requires_comptime.zig @@ -15,6 +15,6 @@ pub export fn entry() void { // error // // :7:25: error: unable to resolve comptime value -// :7:25: note: initializer of comptime-only struct 'tmp.S.foo__anon_479.C' must be comptime-known +// :7:25: note: initializer of comptime-only struct 'tmp.S.foo__anon_483.C' must be comptime-known // :4:16: note: struct requires comptime because of this field // :4:16: note: types are not available at runtime diff --git a/test/cases/compile_errors/bogus_method_call_on_slice.zig b/test/cases/compile_errors/bogus_method_call_on_slice.zig index 76f103b4a1..fe338c86fe 100644 --- a/test/cases/compile_errors/bogus_method_call_on_slice.zig +++ b/test/cases/compile_errors/bogus_method_call_on_slice.zig @@ -16,5 +16,5 @@ pub export fn entry2() void { // // :3:6: error: no field or member function named 'copy' in '[]const u8' // :9:8: error: no field or member function named 'bar' in '@TypeOf(.{})' -// :12:18: error: no field or member function named 'bar' in 'tmp.entry2__struct_483' +// :12:18: error: no field or member function named 'bar' in 'tmp.entry2__struct_487' // :12:6: note: struct declared here diff --git a/test/cases/compile_errors/coerce_anon_struct.zig b/test/cases/compile_errors/coerce_anon_struct.zig index b53d2750d2..2e2cab6ac7 100644 --- a/test/cases/compile_errors/coerce_anon_struct.zig +++ b/test/cases/compile_errors/coerce_anon_struct.zig @@ -6,6 +6,6 @@ export fn foo() void { // error // -// :4:16: error: expected type 'tmp.T', found 'tmp.foo__struct_472' +// :4:16: error: expected type 'tmp.T', found 'tmp.foo__struct_476' // :3:16: note: struct declared here // :1:11: note: struct declared here diff --git a/test/cases/compile_errors/redundant_try.zig b/test/cases/compile_errors/redundant_try.zig index 4da2a31dbc..b1acf7f7cc 100644 --- a/test/cases/compile_errors/redundant_try.zig +++ b/test/cases/compile_errors/redundant_try.zig @@ -44,9 +44,9 @@ comptime { // // :5:23: error: expected error union type, found 'comptime_int' // :10:23: error: expected error union type, found '@TypeOf(.{})' -// :15:23: error: expected error union type, found 'tmp.test2__struct_509' +// :15:23: error: expected error union type, found 'tmp.test2__struct_513' // :15:23: note: struct declared here -// :20:27: error: expected error union type, found 'tmp.test3__struct_511' +// :20:27: error: expected error union type, found 'tmp.test3__struct_515' // :20:27: note: struct declared here // :25:23: error: expected error union type, found 'struct { comptime *const [5:0]u8 = "hello" }' // :31:13: error: expected error union type, found 'u32'