Air.Legalize: revert to loops for scalarizations

I had tried unrolling the loops to avoid requiring the
`vector_store_elem` instruction, but it's arguably a problem to generate
O(N) code for an operation on `@Vector(N, T)`. In addition, that
lowering emitted a lot of `.aggregate_init` instructions, and
`.aggregate_init` is itself quite a difficult operation to codegen.
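
To make the trade-off concrete, here is a rough source-level analogy of the two strategies for a unary operation on `@Vector(4, f32)` (illustrative only: `op` is a stand-in scalar function, and the real transformation happens on AIR, not on Zig source):

    const std = @import("std");

    fn op(x: f32) f32 {
        return x + 1.0; // stand-in for whatever scalar operation is being scalarized
    }

    test "unrolled vs loop scalarization (source-level analogy)" {
        const v: @Vector(4, f32) = .{ 1, 2, 3, 4 };

        // Unrolled: one element read per lane plus a final aggregate init,
        // so the amount of emitted code grows with the vector length.
        const unrolled: @Vector(4, f32) = .{ op(v[0]), op(v[1]), op(v[2]), op(v[3]) };

        // Loop: fixed-size code for any vector length, but the body must
        // read and write an element at a runtime-known index.
        var tmp: [4]f32 = v;
        var i: usize = 0;
        while (i < 4) : (i += 1) tmp[i] = op(tmp[i]);
        const looped: @Vector(4, f32) = tmp;

        try std.testing.expect(@reduce(.And, unrolled == looped));
    }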

This requires reintroducing runtime vector indexing internally. However,
I've put it in a couple of instructions which are intended only for use
by `Air.Legalize`, named `legalize_vec_elem_val` (like `array_elem_val`,
but for indexing a vector with a runtime-known index) and
`legalize_vec_store_elem` (like the old `vector_store_elem`
instruction). These are explicitly documented as *not* being emitted by
Sema, so they need only be implemented by backends which actually use an
`Air.Legalize.Feature` that emits them (otherwise they can be marked as
`unreachable`).
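
To sketch the semantics (not the AIR encoding or any backend's actual lowering), the two instructions behave roughly like the following hypothetical Zig helpers, with the index known only at runtime:

    /// Rough equivalent of `legalize_vec_elem_val`: read one element of a
    /// vector *value* at a runtime-known index.
    fn vecElemVal(comptime N: comptime_int, comptime T: type, v: @Vector(N, T), i: usize) T {
        const arr: [N]T = v; // a backend is free to lower this however it likes
        return arr[i];
    }

    /// Rough equivalent of `legalize_vec_store_elem`: store one element through
    /// a pointer to a vector at a runtime-known index.
    fn vecStoreElem(comptime N: comptime_int, comptime T: type, ptr: *@Vector(N, T), i: usize, elem: T) void {
        var arr: [N]T = ptr.*;
        arr[i] = elem;
        ptr.* = arr;
    }

A scalarization loop emitted by `Legalize` reads each lane with the former and writes each result with the latter, so any backend that enables a "scalarize" feature must handle both; all other backends can keep them `unreachable`.
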
Matthew Lugg 2025-11-09 15:16:49 +00:00
parent 99a7884308
commit 69f39868b4
14 changed files with 1543 additions and 296 deletions


@@ -660,8 +660,8 @@ pub const Inst = struct {
/// Given a pointer to a slice, return a pointer to the pointer of the slice.
/// Uses the `ty_op` field.
ptr_slice_ptr_ptr,
/// Given an (array value or vector value) and element index,
/// return the element value at that index.
/// Given an (array value or vector value) and element index, return the element value at
/// that index. If the lhs is a vector value, the index is guaranteed to be comptime-known.
/// Result type is the element type of the array operand.
/// Uses the `bin_op` field.
array_elem_val,
@@ -915,6 +915,26 @@ pub const Inst = struct {
/// Operand is unused and set to Ref.none
work_group_id,
// The remaining instructions are not emitted by Sema. They are only emitted by `Legalize`,
// depending on the enabled features. As such, backends can consider them `unreachable` if
// they do not enable the relevant legalizations.
/// Given a pointer to a vector, a runtime-known index, and a scalar value, store the value
/// into the vector at the given index. Zig does not support this operation, but `Legalize`
/// may emit it when scalarizing vector operations.
///
/// Uses the `pl_op` field with payload `Bin`. `operand` is the vector pointer. `lhs` is the
/// element index of type `usize`. `rhs` is the element value. Result is always void.
legalize_vec_store_elem,
/// Given a vector value and a runtime-known index, return the element value at that index.
/// This instruction is similar to `array_elem_val`; the only difference is that the index
/// here is runtime-known, which is usually not allowed for vectors. `Legalize` may emit
/// this instruction when scalarizing vector operations.
///
/// Uses the `bin_op` field. `lhs` is the vector value. `rhs` is the element index. Result
/// type is the vector element type.
legalize_vec_elem_val,
pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag {
switch (op) {
.lt => return if (optimized) .cmp_lt_optimized else .cmp_lt,
@@ -1681,6 +1701,7 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool)
.prefetch,
.set_err_return_trace,
.c_va_end,
.legalize_vec_store_elem,
=> return .void,
.slice_len,
@@ -1699,7 +1720,7 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool)
return .fromInterned(ip.funcTypeReturnType(callee_ty.toIntern()));
},
.slice_elem_val, .ptr_elem_val, .array_elem_val => {
.slice_elem_val, .ptr_elem_val, .array_elem_val, .legalize_vec_elem_val => {
const ptr_ty = air.typeOf(datas[@intFromEnum(inst)].bin_op.lhs, ip);
return ptr_ty.childTypeIp(ip);
},
@@ -1857,6 +1878,7 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool {
.intcast_safe,
.int_from_float_safe,
.int_from_float_optimized_safe,
.legalize_vec_store_elem,
=> true,
.add,
@@ -2002,6 +2024,7 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool {
.work_item_id,
.work_group_size,
.work_group_id,
.legalize_vec_elem_val,
=> false,
.is_non_null_ptr, .is_null_ptr, .is_non_err_ptr, .is_err_ptr => air.typeOf(data.un_op, ip).isVolatilePtrIp(ip),

File diff suppressed because it is too large.


@@ -458,6 +458,7 @@ fn analyzeInst(
.memset_safe,
.memcpy,
.memmove,
.legalize_vec_elem_val,
=> {
const o = inst_datas[@intFromEnum(inst)].bin_op;
return analyzeOperands(a, pass, data, inst, .{ o.lhs, o.rhs, .none });
@@ -769,6 +770,12 @@ fn analyzeInst(
const pl_op = inst_datas[@intFromEnum(inst)].pl_op;
return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, .none, .none });
},
.legalize_vec_store_elem => {
const pl_op = inst_datas[@intFromEnum(inst)].pl_op;
const bin = a.air.extraData(Air.Bin, pl_op.payload).data;
return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, bin.lhs, bin.rhs });
},
}
}


@@ -272,6 +272,7 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void {
.memset_safe,
.memcpy,
.memmove,
.legalize_vec_elem_val,
=> {
const bin_op = data[@intFromEnum(inst)].bin_op;
try self.verifyInstOperands(inst, .{ bin_op.lhs, bin_op.rhs, .none });
@@ -577,6 +578,11 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void {
try self.verifyInst(inst);
},
.legalize_vec_store_elem => {
const pl_op = data[@intFromEnum(inst)].pl_op;
const bin = self.air.extraData(Air.Bin, pl_op.payload).data;
try self.verifyInstOperands(inst, .{ pl_op.operand, bin.lhs, bin.rhs });
},
}
}
}


@@ -171,6 +171,7 @@ const Writer = struct {
.memmove,
.memset,
.memset_safe,
.legalize_vec_elem_val,
=> try w.writeBinOp(s, inst),
.is_null,
@@ -331,6 +332,7 @@ const Writer = struct {
.reduce, .reduce_optimized => try w.writeReduce(s, inst),
.cmp_vector, .cmp_vector_optimized => try w.writeCmpVector(s, inst),
.runtime_nav_ptr => try w.writeRuntimeNavPtr(s, inst),
.legalize_vec_store_elem => try w.writeLegalizeVecStoreElem(s, inst),
.work_item_id,
.work_group_size,
@@ -508,6 +510,18 @@ const Writer = struct {
try w.writeOperand(s, inst, 2, pl_op.operand);
}
fn writeLegalizeVecStoreElem(w: *Writer, s: *std.Io.Writer, inst: Air.Inst.Index) Error!void {
const pl_op = w.air.instructions.items(.data)[@intFromEnum(inst)].pl_op;
const bin = w.air.extraData(Air.Bin, pl_op.payload).data;
try w.writeOperand(s, inst, 0, pl_op.operand);
try s.writeAll(", ");
try w.writeOperand(s, inst, 1, bin.lhs);
try s.writeAll(", ");
try w.writeOperand(s, inst, 2, bin.rhs);
}
fn writeShuffleOne(w: *Writer, s: *std.Io.Writer, inst: Air.Inst.Index) Error!void {
const unwrapped = w.air.unwrapShuffleOne(w.pt.zcu, inst);
try w.writeType(s, unwrapped.result_ty);


@@ -88,6 +88,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
.atomic_store_monotonic,
.atomic_store_release,
.atomic_store_seq_cst,
.legalize_vec_elem_val,
=> {
if (!checkRef(data.bin_op.lhs, zcu)) return false;
if (!checkRef(data.bin_op.rhs, zcu)) return false;
@@ -322,6 +323,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool {
.select,
.mul_add,
.legalize_vec_store_elem,
=> {
const bin = air.extraData(Air.Bin, data.pl_op.payload).data;
if (!checkRef(data.pl_op.operand, zcu)) return false;


@@ -15930,7 +15930,11 @@ fn zirOverflowArithmetic(
}
}
// If either of the arguments is one, the result is the other and no overflow occurred.
const scalar_one = try pt.intValue(dest_ty.scalarType(zcu), 1);
const dest_scalar_ty = dest_ty.scalarType(zcu);
const dest_scalar_int = dest_scalar_ty.intInfo(zcu);
// We could still be working with i1, where '1' is not a legal value!
if (!(dest_scalar_int.bits == 1 and dest_scalar_int.signedness == .signed)) {
const scalar_one = try pt.intValue(dest_scalar_ty, 1);
const vec_one = try sema.splat(dest_ty, scalar_one);
if (maybe_lhs_val) |lhs_val| {
if (!lhs_val.isUndef(zcu) and try sema.compareAll(lhs_val, .eq, vec_one, dest_ty)) {
@@ -15942,6 +15946,7 @@ fn zirOverflowArithmetic(
break :result .{ .overflow_bit = try sema.splat(overflow_ty, .zero_u1), .inst = lhs };
}
}
}
if (maybe_lhs_val) |lhs_val| {
if (maybe_rhs_val) |rhs_val| {
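
For context on the `i1` guard added in the hunk above: a 1-bit signed integer can only represent -1 and 0, so the "either operand is one" fast path has no way to build a splat of 1 for such a type. A minimal illustration:

    const std = @import("std");

    test "i1 value range" {
        // i1 holds a single (sign) bit, so its only values are -1 and 0.
        try std.testing.expect(std.math.maxInt(i1) == 0);
        try std.testing.expect(std.math.minInt(i1) == -1);
        // A constant 1 of type i1 cannot be formed:
        // const bad: i1 = 1; // compile error (1 is out of range for i1)
    }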


@@ -134,6 +134,10 @@ pub fn analyze(isel: *Select, air_body: []const Air.Inst.Index) !void {
var air_inst_index = air_body[air_body_index];
const initial_def_order_len = isel.def_order.count();
air_tag: switch (air_tags[@intFromEnum(air_inst_index)]) {
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.arg,
.ret_addr,
.frame_addr,
@@ -950,6 +954,11 @@ pub fn body(isel: *Select, air_body: []const Air.Inst.Index) error{ OutOfMemory,
};
air_tag: switch (air.next().?) {
else => |air_tag| return isel.fail("unimplemented {t}", .{air_tag}),
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.arg => {
const arg_vi = isel.live_values.fetchRemove(air.inst_index).?.value;
defer arg_vi.deref(isel);


@@ -3325,6 +3325,10 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) Error!void {
// zig fmt: off
.inferred_alloc, .inferred_alloc_comptime => unreachable,
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.arg => try airArg(f, inst),
.breakpoint => try airBreakpoint(f),


@@ -4886,6 +4886,11 @@ pub const FuncGen = struct {
const val: Builder.Value = switch (air_tags[@intFromEnum(inst)]) {
// zig fmt: off
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.add => try self.airAdd(inst, .normal),
.add_optimized => try self.airAdd(inst, .fast),
.add_wrap => try self.airAddWrap(inst),


@@ -1391,6 +1391,11 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void {
const tag = air_tags[@intFromEnum(inst)];
switch (tag) {
// zig fmt: off
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.add,
.add_wrap,
.sub,


@@ -479,6 +479,11 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
self.reused_operands = @TypeOf(self.reused_operands).initEmpty();
switch (air_tags[@intFromEnum(inst)]) {
// zig fmt: off
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.ptr_add => try self.airPtrArithmetic(inst, .ptr_add),
.ptr_sub => try self.airPtrArithmetic(inst, .ptr_sub),


@@ -1786,6 +1786,10 @@ fn buildPointerOffset(cg: *CodeGen, ptr_value: WValue, offset: u64, action: enum
fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void {
const air_tags = cg.air.instructions.items(.tag);
return switch (air_tags[@intFromEnum(inst)]) {
// No "scalarize" legalizations are enabled, so these instructions never appear.
.legalize_vec_elem_val => unreachable,
.legalize_vec_store_elem => unreachable,
.inferred_alloc, .inferred_alloc_comptime => unreachable,
.add => cg.airBinOp(inst, .add),


@@ -103926,7 +103926,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
try ops[0].toOffset(0, cg);
try ops[0].finish(inst, &.{ty_op.operand}, &ops, cg);
},
.array_elem_val => {
.array_elem_val, .legalize_vec_elem_val => {
const bin_op = air_datas[@intFromEnum(inst)].bin_op;
const array_ty = cg.typeOf(bin_op.lhs);
const res_ty = array_ty.elemType2(zcu);
@@ -173061,6 +173061,634 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void {
.c_va_copy => try cg.airVaCopy(inst),
.c_va_end => try cg.airVaEnd(inst),
.c_va_start => try cg.airVaStart(inst),
.legalize_vec_store_elem => {
const pl_op = air_datas[@intFromEnum(inst)].pl_op;
const bin = cg.air.extraData(Air.Bin, pl_op.payload).data;
// vector_ptr, index, elem_val
var ops = try cg.tempsFromOperands(inst, .{ pl_op.operand, bin.lhs, bin.rhs });
cg.select(&.{}, &.{}, &ops, comptime &.{ .{
.src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } },
},
.extra_temps = .{
.{ .type = .u8, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
.{ ._, ._r, .bt, .tmp0d, .src1d, ._, ._ },
.{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
} },
}, .{
.src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } },
},
.extra_temps = .{
.{ .type = .u8, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
.{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
.{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
} },
}, .{
.required_features = .{ .cmov, null, null, null },
.src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.extra_temps = .{
.{ .type = .u8, .kind = .{ .rc = .general_purpose } },
.{ .type = .u8, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
.{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
.{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ },
.{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
.{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
} },
}, .{
.src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.extra_temps = .{
.{ .type = .u8, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ },
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._nz, .j, .@"0f", ._, ._, ._ },
.{ ._, ._r, .bt, .tmp0d, .src1d, ._, ._ },
.{ ._, ._mp, .j, .@"1f", ._, ._, ._ },
.{ .@"0:", ._s, .bt, .tmp0d, .src1d, ._, ._ },
.{ .@"1:", ._, .mov, .lea(.src0b), .tmp0b, ._, ._ },
} },
}, .{
.src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } },
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._r, .bt, .lea(.src0w), .src1w, ._, ._ },
} },
}, .{
.src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } },
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._s, .bt, .lea(.src0d), .src1d, ._, ._ },
} },
}, .{
.required_features = .{ .cmov, null, null, null },
.src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.extra_temps = .{
.{ .type = .u16, .kind = .{ .rc = .general_purpose } },
.{ .type = .u16, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .movzx, .tmp0d, .lea(.src0w), ._, ._ },
.{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
.{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ },
.{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
.{ ._, ._, .mov, .lea(.src0w), .tmp0w, ._, ._ },
} },
}, .{
.src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
.{ ._, ._r, .bt, .lea(.src0w), .src1w, ._, ._ },
.{ ._, ._mp, .j, .@"0f", ._, ._, ._ },
.{ .@"1:", ._s, .bt, .lea(.src0w), .src1w, ._, ._ },
} },
}, .{
.src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } },
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._r, .bt, .lea(.src0d), .src1d, ._, ._ },
} },
}, .{
.src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } },
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._s, .bt, .lea(.src0d), .src1d, ._, ._ },
} },
}, .{
.required_features = .{ .cmov, null, null, null },
.src_constraints = .{ .{ .ptr_bool_vec = .dword }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.extra_temps = .{
.{ .type = .u32, .kind = .{ .rc = .general_purpose } },
.{ .type = .u32, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .mov, .tmp0d, .lea(.src0d), ._, ._ },
.{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ },
.{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ },
.{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ },
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ },
.{ ._, ._, .mov, .lea(.src0d), .tmp0d, ._, ._ },
} },
}, .{
.required_features = .{ .@"64bit", .cmov, null, null },
.src_constraints = .{ .{ .ptr_bool_vec = .qword }, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.extra_temps = .{
.{ .type = .u64, .kind = .{ .rc = .general_purpose } },
.{ .type = .u64, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .mov, .tmp0q, .lea(.src0q), ._, ._ },
.{ ._, ._, .mov, .tmp1q, .tmp0q, ._, ._ },
.{ ._, ._r, .bt, .tmp1q, .src1q, ._, ._ },
.{ ._, ._s, .bt, .tmp0q, .src1q, ._, ._ },
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._z, .cmov, .tmp0q, .tmp1q, ._, ._ },
.{ ._, ._, .mov, .lea(.src0q), .tmp0q, ._, ._ },
} },
}, .{
.required_features = .{ .cmov, null, null, null },
.src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.extra_temps = .{
.{ .type = .u32, .kind = .{ .rc = .general_purpose } },
.{ .type = .u32, .kind = .{ .rc = .general_purpose } },
.{ .type = .u32, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .mov, .tmp0d, .src1d, ._, ._ },
.{ ._, ._r, .sh, .tmp0d, .ui(5), ._, ._ },
.{ ._, ._, .mov, .tmp1d, .leasi(.src0d, .@"4", .tmp0), ._, ._ },
.{ ._, ._, .mov, .tmp2d, .tmp1d, ._, ._ },
.{ ._, ._r, .bt, .tmp2d, .src1d, ._, ._ },
.{ ._, ._s, .bt, .tmp1d, .src1d, ._, ._ },
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._z, .cmov, .tmp1d, .tmp2d, ._, ._ },
.{ ._, ._, .mov, .leasi(.src0d, .@"4", .tmp0), .tmp1d, ._, ._ },
} },
}, .{
.src_constraints = .{ .ptr_any_bool_vec, .any, .bool },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.clobbers = .{ .eflags = true },
.each = .{ .once = &.{
.{ ._, ._, .@"test", .src2b, .si(1), ._, ._ },
.{ ._, ._nz, .j, .@"1f", ._, ._, ._ },
.{ ._, ._r, .bt, .lea(.src0d), .src1d, ._, ._ },
.{ ._, ._mp, .j, .@"0f", ._, ._, ._ },
.{ .@"1:", ._s, .bt, .lea(.src0d), .src1d, ._, ._ },
} },
}, .{
.src_constraints = .{ .any, .any, .{ .int = .byte } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .imm8 } },
.{ .src = .{ .to_gpr, .simm32, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leaa(.src0b, .add_src0_elem_size_mul_src1), .src2b, ._, ._ },
} },
}, .{
.src_constraints = .{ .any, .any, .{ .int = .byte } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .imm8 } },
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leai(.src0b, .src1), .src2b, ._, ._ },
} },
}, .{
.src_constraints = .{ .any, .any, .{ .int = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .imm16 } },
.{ .src = .{ .to_gpr, .simm32, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2w, ._, ._ },
} },
}, .{
.src_constraints = .{ .any, .any, .{ .int = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .imm16 } },
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .src2w, ._, ._ },
} },
}, .{
.required_features = .{ .avx, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .vp_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ },
} },
}, .{
.required_features = .{ .sse4_1, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .p_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ },
} },
}, .{
.required_features = .{ .sse2, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.extra_temps = .{
.{ .type = .f16, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.each = .{ .once = &.{
.{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ },
.{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp0w, ._, ._ },
} },
}, .{
.required_features = .{ .sse, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.extra_temps = .{
.{ .type = .f32, .kind = .mem },
.{ .type = .f16, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.each = .{ .once = &.{
.{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ },
.{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ },
.{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp1w, ._, ._ },
} },
}, .{
.required_features = .{ .avx, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .vp_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ },
} },
}, .{
.required_features = .{ .sse4_1, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .p_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ },
} },
}, .{
.required_features = .{ .sse2, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.extra_temps = .{
.{ .type = .f16, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.each = .{ .once = &.{
.{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ },
.{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp0w, ._, ._ },
} },
}, .{
.required_features = .{ .sse, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .word } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.extra_temps = .{
.{ .type = .f32, .kind = .mem },
.{ .type = .f16, .kind = .{ .rc = .general_purpose } },
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
.unused,
},
.each = .{ .once = &.{
.{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ },
.{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ },
.{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp1w, ._, ._ },
} },
}, .{
.src_constraints = .{ .any, .any, .{ .int = .dword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .imm32 } },
.{ .src = .{ .to_gpr, .simm32, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2d, ._, ._ },
} },
}, .{
.src_constraints = .{ .any, .any, .{ .int = .dword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .imm32 } },
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leasi(.src0d, .@"4", .src1), .src2d, ._, ._ },
} },
}, .{
.required_features = .{ .avx, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .dword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .v_ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .sse, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .dword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, ._ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .avx, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .dword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .v_ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .sse, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .dword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, ._ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .@"64bit", null, null, null },
.src_constraints = .{ .any, .any, .{ .int = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .simm32 } },
.{ .src = .{ .to_gpr, .simm32, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2q, ._, ._ },
} },
}, .{
.required_features = .{ .@"64bit", null, null, null },
.src_constraints = .{ .any, .any, .{ .int = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .simm32 } },
.{ .src = .{ .to_gpr, .to_gpr, .to_gpr } },
},
.each = .{ .once = &.{
.{ ._, ._, .mov, .leasi(.src0q, .@"8", .src1), .src2q, ._, ._ },
} },
}, .{
.required_features = .{ .avx, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .v_sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .sse2, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, ._sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .sse, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .simm32, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, ._ps, .movl, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .avx, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, .v_sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .sse2, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, ._sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
} },
}, .{
.required_features = .{ .sse, null, null, null },
.src_constraints = .{ .any, .any, .{ .float = .qword } },
.patterns = &.{
.{ .src = .{ .to_gpr, .to_gpr, .to_sse } },
},
.each = .{ .once = &.{
.{ ._, ._ps, .movl, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ },
} },
} }) catch |err| switch (err) {
error.SelectFailed => {
const elem_size = cg.typeOf(bin.rhs).abiSize(zcu);
while (try ops[0].toRegClass(true, .general_purpose, cg) or
try ops[1].toRegClass(true, .general_purpose, cg))
{}
const base_reg = ops[0].tracking(cg).short.register.to64();
const rhs_reg = ops[1].tracking(cg).short.register.to64();
if (!std.math.isPowerOfTwo(elem_size)) {
try cg.spillEflagsIfOccupied();
try cg.asmRegisterRegisterImmediate(
.{ .i_, .mul },
rhs_reg,
rhs_reg,
.u(elem_size),
);
try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
.base = .{ .reg = base_reg },
.mod = .{ .rm = .{ .index = rhs_reg } },
});
} else if (elem_size > 8) {
try cg.spillEflagsIfOccupied();
try cg.asmRegisterImmediate(
.{ ._l, .sh },
rhs_reg,
.u(std.math.log2_int(u64, elem_size)),
);
try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
.base = .{ .reg = base_reg },
.mod = .{ .rm = .{ .index = rhs_reg } },
});
} else try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{
.base = .{ .reg = base_reg },
.mod = .{ .rm = .{
.index = rhs_reg,
.scale = .fromFactor(@intCast(elem_size)),
} },
});
try ops[0].store(&ops[2], .{}, cg);
},
else => |e| return e,
};
for (ops) |op| try op.die(cg);
},
.work_item_id, .work_group_size, .work_group_id => unreachable,
}
try cg.resetTemps(@enumFromInt(0));