From 69f39868b4125e79e4070a88bbdfcd3643dbc90d Mon Sep 17 00:00:00 2001 From: Matthew Lugg Date: Sun, 9 Nov 2025 15:16:49 +0000 Subject: [PATCH] Air.Legalize: revert to loops for scalarizations I had tried unrolling the loops to avoid requiring the `vector_store_elem` instruction, but it's arguably a problem to generate O(N) code for an operation on `@Vector(N, T)`. In addition, that lowering emitted a lot of `.aggregate_init` instructions, which is itself a quite difficult operation to codegen. This requires reintroducing runtime vector indexing internally. However, I've put it in a couple of instructions which are intended only for use by `Air.Legalize`, named `legalize_vec_elem_val` (like `array_elem_val`, but for indexing a vector with a runtime-known index) and `legalize_vec_store_elem` (like the old `vector_store_elem` instruction). These are explicitly documented as *not* being emitted by Sema, so need only be implemented by backends if they actually use an `Air.Legalize.Feature` which emits them (otherwise they can be marked as `unreachable`). --- src/Air.zig | 29 +- src/Air/Legalize.zig | 1096 +++++++++++++++++++++++-------- src/Air/Liveness.zig | 7 + src/Air/Liveness/Verify.zig | 6 + src/Air/print.zig | 14 + src/Air/types_resolved.zig | 2 + src/Sema.zig | 23 +- src/codegen/aarch64/Select.zig | 9 + src/codegen/c.zig | 4 + src/codegen/llvm.zig | 5 + src/codegen/riscv64/CodeGen.zig | 5 + src/codegen/sparc64/CodeGen.zig | 5 + src/codegen/wasm/CodeGen.zig | 4 + src/codegen/x86_64/CodeGen.zig | 630 +++++++++++++++++- 14 files changed, 1543 insertions(+), 296 deletions(-) diff --git a/src/Air.zig b/src/Air.zig index db5307f459..722ea28305 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -660,8 +660,8 @@ pub const Inst = struct { /// Given a pointer to a slice, return a pointer to the pointer of the slice. /// Uses the `ty_op` field. ptr_slice_ptr_ptr, - /// Given an (array value or vector value) and element index, - /// return the element value at that index. 
+ /// Given an (array value or vector value) and element index, return the element value at + /// that index. If the lhs is a vector value, the index is guaranteed to be comptime-known. /// Result type is the element type of the array operand. /// Uses the `bin_op` field. array_elem_val, @@ -915,6 +915,26 @@ pub const Inst = struct { /// Operand is unused and set to Ref.none work_group_id, + // The remaining instructions are not emitted by Sema. They are only emitted by `Legalize`, + // depending on the enabled features. As such, backends can consider them `unreachable` if + // they do not enable the relevant legalizations. + + /// Given a pointer to a vector, a runtime-known index, and a scalar value, store the value + /// into the vector at the given index. Zig does not support this operation, but `Legalize` + /// may emit it when scalarizing vector operations. + /// + /// Uses the `pl_op` field with payload `Bin`. `operand` is the vector pointer. `lhs` is the + /// element index of type `usize`. `rhs` is the element value. Result is always void. + legalize_vec_store_elem, + /// Given a vector value and a runtime-known index, return the element value at that index. + /// This instruction is similar to `array_elem_val`; the only difference is that the index + /// here is runtime-known, which is usually not allowed for vectors. `Legalize` may emit + /// this instruction when scalarizing vector operations. + /// + /// Uses the `bin_op` field. `lhs` is the vector value. `rhs` is the element index. Result + /// type is the vector element type. 
+ legalize_vec_elem_val, + pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag { switch (op) { .lt => return if (optimized) .cmp_lt_optimized else .cmp_lt, @@ -1681,6 +1701,7 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .prefetch, .set_err_return_trace, .c_va_end, + .legalize_vec_store_elem, => return .void, .slice_len, @@ -1699,7 +1720,7 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) return .fromInterned(ip.funcTypeReturnType(callee_ty.toIntern())); }, - .slice_elem_val, .ptr_elem_val, .array_elem_val => { + .slice_elem_val, .ptr_elem_val, .array_elem_val, .legalize_vec_elem_val => { const ptr_ty = air.typeOf(datas[@intFromEnum(inst)].bin_op.lhs, ip); return ptr_ty.childTypeIp(ip); }, @@ -1857,6 +1878,7 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .intcast_safe, .int_from_float_safe, .int_from_float_optimized_safe, + .legalize_vec_store_elem, => true, .add, @@ -2002,6 +2024,7 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .work_item_id, .work_group_size, .work_group_id, + .legalize_vec_elem_val, => false, .is_non_null_ptr, .is_null_ptr, .is_non_err_ptr, .is_err_ptr => air.typeOf(data.un_op, ip).isVolatilePtrIp(ip), diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index a26ed89964..1d935bd360 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -320,28 +320,36 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .xor, => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) { const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; - if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + if (l.typeOf(bin_op.lhs).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)); + } }, .add_safe => if (l.features.has(.expand_add_safe)) { 
assert(!l.features.has(.scalarize_add_safe)); // it doesn't make sense to do both continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .add_with_overflow)); } else if (l.features.has(.scalarize_add_safe)) { const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; - if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + if (l.typeOf(bin_op.lhs).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)); + } }, .sub_safe => if (l.features.has(.expand_sub_safe)) { assert(!l.features.has(.scalarize_sub_safe)); // it doesn't make sense to do both continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .sub_with_overflow)); } else if (l.features.has(.scalarize_sub_safe)) { const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; - if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + if (l.typeOf(bin_op.lhs).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)); + } }, .mul_safe => if (l.features.has(.expand_mul_safe)) { assert(!l.features.has(.scalarize_mul_safe)); // it doesn't make sense to do both continue :inst l.replaceInst(inst, .block, try l.safeArithmeticBlockPayload(inst, .mul_with_overflow)); } else if (l.features.has(.scalarize_mul_safe)) { const bin_op = l.air_instructions.items(.data)[@intFromEnum(inst)].bin_op; - if (l.typeOf(bin_op.lhs).isVector(zcu)) continue :inst try l.scalarize(inst, .bin_op); + if (l.typeOf(bin_op.lhs).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)); + } }, .ptr_add, .ptr_sub => {}, inline .add_with_overflow, @@ -350,7 +358,9 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .shl_with_overflow, => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) { const ty_pl = 
l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl; - if (ty_pl.ty.toType().fieldType(0, zcu).isVector(zcu)) continue :inst l.replaceInst(inst, .block, try l.scalarizeOverflowBlockPayload(inst)); + if (ty_pl.ty.toType().fieldType(0, zcu).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeOverflowBlockPayload(inst)); + } }, .alloc => {}, .inferred_alloc, .inferred_alloc_comptime => unreachable, @@ -387,7 +397,9 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { } } } - if (l.features.has(comptime .scalarize(air_tag))) continue :inst try l.scalarize(inst, .bin_op); + if (l.features.has(comptime .scalarize(air_tag))) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .bin_op)); + } } }, inline .not, @@ -406,7 +418,9 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .float_from_int, => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) { const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; - if (ty_op.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .ty_op); + if (ty_op.ty.toType().isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)); + } }, .bitcast => if (l.features.has(.scalarize_bitcast)) { if (try l.scalarizeBitcastBlockPayload(inst)) |payload| { @@ -418,21 +432,27 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { continue :inst l.replaceInst(inst, .block, try l.safeIntcastBlockPayload(inst)); } else if (l.features.has(.scalarize_intcast_safe)) { const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; - if (ty_op.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .ty_op); + if (ty_op.ty.toType().isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)); + } }, .int_from_float_safe => if (l.features.has(.expand_int_from_float_safe)) { 
assert(!l.features.has(.scalarize_int_from_float_safe)); continue :inst l.replaceInst(inst, .block, try l.safeIntFromFloatBlockPayload(inst, false)); } else if (l.features.has(.scalarize_int_from_float_safe)) { const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; - if (ty_op.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .ty_op); + if (ty_op.ty.toType().isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)); + } }, .int_from_float_optimized_safe => if (l.features.has(.expand_int_from_float_optimized_safe)) { assert(!l.features.has(.scalarize_int_from_float_optimized_safe)); continue :inst l.replaceInst(inst, .block, try l.safeIntFromFloatBlockPayload(inst, true)); } else if (l.features.has(.scalarize_int_from_float_optimized_safe)) { const ty_op = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_op; - if (ty_op.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .ty_op); + if (ty_op.ty.toType().isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .ty_op)); + } }, .block, .loop => { const ty_pl = l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl; @@ -467,7 +487,9 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .neg_optimized, => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) { const un_op = l.air_instructions.items(.data)[@intFromEnum(inst)].un_op; - if (l.typeOf(un_op).isVector(zcu)) continue :inst try l.scalarize(inst, .un_op); + if (l.typeOf(un_op).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .un_op)); + } }, .cmp_lt, .cmp_lt_optimized, @@ -484,7 +506,9 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { => {}, inline .cmp_vector, .cmp_vector_optimized => |air_tag| if (l.features.has(comptime .scalarize(air_tag))) { const ty_pl = 
l.air_instructions.items(.data)[@intFromEnum(inst)].ty_pl; - if (ty_pl.ty.toType().isVector(zcu)) continue :inst try l.scalarize(inst, .cmp_vector); + if (ty_pl.ty.toType().isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .cmp_vector)); + } }, .cond_br => { const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op; @@ -614,9 +638,15 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { else => {}, } }, - .shuffle_one => if (l.features.has(.scalarize_shuffle_one)) continue :inst try l.scalarize(inst, .shuffle_one), - .shuffle_two => if (l.features.has(.scalarize_shuffle_two)) continue :inst try l.scalarize(inst, .shuffle_two), - .select => if (l.features.has(.scalarize_select)) continue :inst try l.scalarize(inst, .select), + .shuffle_one => if (l.features.has(.scalarize_shuffle_one)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeShuffleOneBlockPayload(inst)); + }, + .shuffle_two => if (l.features.has(.scalarize_shuffle_two)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeShuffleTwoBlockPayload(inst)); + }, + .select => if (l.features.has(.scalarize_select)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .select)); + }, .memset, .memset_safe, .memcpy, @@ -657,7 +687,9 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .union_init, .prefetch => {}, .mul_add => if (l.features.has(.scalarize_mul_add)) { const pl_op = l.air_instructions.items(.data)[@intFromEnum(inst)].pl_op; - if (l.typeOf(pl_op.operand).isVector(zcu)) continue :inst try l.scalarize(inst, .pl_op_bin); + if (l.typeOf(pl_op.operand).isVector(zcu)) { + continue :inst l.replaceInst(inst, .block, try l.scalarizeBlockPayload(inst, .pl_op_bin)); + } }, .field_parent_ptr, .wasm_memory_size, @@ -675,96 +707,123 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { .work_item_id, .work_group_size, 
.work_group_id, + .legalize_vec_elem_val, + .legalize_vec_store_elem, => {}, } } } -const ScalarizeForm = enum { un_op, ty_op, bin_op, pl_op_bin, cmp_vector, shuffle_one, shuffle_two, select }; -/// inline to propagate comptime-known `replaceInst` result. -inline fn scalarize(l: *Legalize, orig_inst: Air.Inst.Index, comptime form: ScalarizeForm) Error!Air.Inst.Tag { - return l.replaceInst(orig_inst, .block, try l.scalarizeBlockPayload(orig_inst, form)); -} -fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, comptime form: ScalarizeForm) Error!Air.Inst.Data { +const ScalarizeForm = enum { un_op, ty_op, bin_op, pl_op_bin, cmp_vector, select }; +fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, form: ScalarizeForm) Error!Air.Inst.Data { const pt = l.pt; const zcu = pt.zcu; - const gpa = zcu.gpa; const orig = l.air_instructions.get(@intFromEnum(orig_inst)); const res_ty = l.typeOfIndex(orig_inst); - const res_len = res_ty.vectorLen(zcu); + const result_is_array = switch (res_ty.zigTypeTag(zcu)) { + .vector => false, + .array => true, + else => unreachable, + }; + const res_len = res_ty.arrayLen(zcu); + const res_elem_ty = res_ty.childType(zcu); - const inst_per_elem = switch (form) { + if (result_is_array) { + // This is only allowed when legalizing an elementwise bitcast. 
+ assert(orig.tag == .bitcast); + assert(form == .ty_op); + } + + // Our output will be a loop doing elementwise stores: + // + // %1 = block(@Vector(N, Scalar), { + // %2 = alloc(*usize) + // %3 = alloc(*@Vector(N, Scalar)) + // %4 = store(%2, @zero_usize) + // %5 = loop({ + // %6 = load(%2) + // %7 = + // %8 = legalize_vec_store_elem(%3, %5, %6) + // %9 = cmp_eq(%6, ) + // %10 = cond_br(%9, { + // %11 = load(%3) + // %12 = br(%1, %11) + // }, { + // %13 = add(%6, @one_usize) + // %14 = store(%2, %13) + // %15 = repeat(%5) + // }) + // }) + // }) + // + // If scalarizing an elementwise bitcast, the result might be an array, in which case + // `legalize_vec_store_elem` becomes two instructions (`ptr_elem_ptr` and `store`). + // Therefore, there are 13 or 14 instructions in the block, plus however many are + // needed to compute each result element for `form`. + const inst_per_form: usize = switch (form) { .un_op, .ty_op => 2, .bin_op, .cmp_vector => 3, .pl_op_bin => 4, - .shuffle_one, .shuffle_two => 1, .select => 7, }; + const max_inst_per_form = 7; // maximum value in the above switch + var inst_buf: [14 + max_inst_per_form]Air.Inst.Index = undefined; - var sfba_state = std.heap.stackFallback(@sizeOf([inst_per_elem * 32 + 2]Air.Inst.Index) + @sizeOf([32]Air.Inst.Ref), gpa); - const sfba = sfba_state.get(); + var main_block: Block = .init(&inst_buf); + try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len); - // Plus 2 extra instructions for `aggregate_init` and `br`. 
- const inst_buf = try sfba.alloc(Air.Inst.Index, inst_per_elem * res_len + 2); - defer sfba.free(inst_buf); + const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef(); + const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(res_ty)).toRef(); - var main_block: Block = .init(inst_buf); - try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len); + _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize); - const elem_buf = try sfba.alloc(Air.Inst.Ref, res_len); - defer sfba.free(elem_buf); + var loop: Loop = .init(l, &main_block); + loop.block = .init(main_block.stealRemainingCapacity()); - switch (form) { - .un_op => { + const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef(); + const elem_val: Air.Inst.Ref = switch (form) { + .un_op => elem: { const orig_operand = orig.data.un_op; - const un_op_tag = orig.tag; - for (elem_buf, 0..) |*elem, elem_idx| { - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const operand = main_block.addBinOp(l, .array_elem_val, orig_operand, elem_idx_ref).toRef(); - elem.* = main_block.addUnOp(l, un_op_tag, operand).toRef(); - } + const operand = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operand, index_val).toRef(); + break :elem loop.block.addUnOp(l, orig.tag, operand).toRef(); }, - .ty_op => { + .ty_op => elem: { const orig_operand = orig.data.ty_op.operand; - const orig_ty: Type = .fromInterned(orig.data.ty_op.ty.toInterned().?); - const scalar_ty = orig_ty.childType(zcu); - const ty_op_tag = orig.tag; - for (elem_buf, 0..) 
|*elem, elem_idx| { - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const operand = main_block.addBinOp(l, .array_elem_val, orig_operand, elem_idx_ref).toRef(); - elem.* = main_block.addTyOp(l, ty_op_tag, scalar_ty, operand).toRef(); - } + const operand_is_array = switch (l.typeOf(orig_operand).zigTypeTag(zcu)) { + .vector => false, + .array => true, + else => unreachable, + }; + const operand = loop.block.addBinOp( + l, + if (operand_is_array) .array_elem_val else .legalize_vec_elem_val, + orig_operand, + index_val, + ).toRef(); + break :elem loop.block.addTyOp(l, orig.tag, res_elem_ty, operand).toRef(); }, - .bin_op => { - const orig_operands = orig.data.bin_op; - const bin_op_tag = orig.tag; - for (elem_buf, 0..) |*elem, elem_idx| { - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const lhs = main_block.addBinOp(l, .array_elem_val, orig_operands.lhs, elem_idx_ref).toRef(); - const rhs = main_block.addBinOp(l, .array_elem_val, orig_operands.rhs, elem_idx_ref).toRef(); - elem.* = main_block.addBinOp(l, bin_op_tag, lhs, rhs).toRef(); - } + .bin_op => elem: { + const orig_bin = orig.data.bin_op; + const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.lhs, index_val).toRef(); + const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.rhs, index_val).toRef(); + break :elem loop.block.addBinOp(l, orig.tag, lhs, rhs).toRef(); }, - .pl_op_bin => { + .pl_op_bin => elem: { const orig_operand = orig.data.pl_op.operand; - const orig_payload = l.extraData(Air.Bin, orig.data.pl_op.payload).data; - const pl_op_tag = orig.tag; - for (elem_buf, 0..) 
|*elem, elem_idx| { - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const operand = main_block.addBinOp(l, .array_elem_val, orig_operand, elem_idx_ref).toRef(); - const lhs = main_block.addBinOp(l, .array_elem_val, orig_payload.lhs, elem_idx_ref).toRef(); - const rhs = main_block.addBinOp(l, .array_elem_val, orig_payload.rhs, elem_idx_ref).toRef(); - elem.* = main_block.add(l, .{ - .tag = pl_op_tag, - .data = .{ .pl_op = .{ - .payload = try l.addExtra(Air.Bin, .{ .lhs = lhs, .rhs = rhs }), - .operand = operand, - } }, - }).toRef(); - } + const orig_bin = l.extraData(Air.Bin, orig.data.pl_op.payload).data; + const operand = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operand, index_val).toRef(); + const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.lhs, index_val).toRef(); + const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_bin.rhs, index_val).toRef(); + break :elem loop.block.add(l, .{ + .tag = orig.tag, + .data = .{ .pl_op = .{ + .operand = operand, + .payload = try l.addExtra(Air.Bin, .{ .lhs = lhs, .rhs = rhs }), + } }, + }).toRef(); }, - .cmp_vector => { + .cmp_vector => elem: { const orig_payload = l.extraData(Air.VectorCmp, orig.data.ty_pl.payload).data; const cmp_op = orig_payload.compareOperator(); const optimized = switch (orig.tag) { @@ -772,116 +831,393 @@ fn scalarizeBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, comptime form: .cmp_vector_optimized => true, else => unreachable, }; - for (elem_buf, 0..) 
|*elem, elem_idx| { - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const lhs = main_block.addBinOp(l, .array_elem_val, orig_payload.lhs, elem_idx_ref).toRef(); - const rhs = main_block.addBinOp(l, .array_elem_val, orig_payload.rhs, elem_idx_ref).toRef(); - elem.* = main_block.addCmpScalar(l, cmp_op, lhs, rhs, optimized).toRef(); - } + const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_payload.lhs, index_val).toRef(); + const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_payload.rhs, index_val).toRef(); + break :elem loop.block.addCmpScalar(l, cmp_op, lhs, rhs, optimized).toRef(); }, - .shuffle_one => { - const shuffle = l.getTmpAir().unwrapShuffleOne(zcu, orig_inst); - for (elem_buf, shuffle.mask) |*elem, mask| elem.* = switch (mask.unwrap()) { - .value => |val| .fromIntern(val), - .elem => |src_idx| elem: { - const src_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, src_idx)); - break :elem main_block.addBinOp(l, .array_elem_val, shuffle.operand, src_idx_ref).toRef(); - }, - }; - }, - .shuffle_two => { - const shuffle = l.getTmpAir().unwrapShuffleTwo(zcu, orig_inst); - const scalar_ty = res_ty.childType(zcu); - for (elem_buf, shuffle.mask) |*elem, mask| elem.* = switch (mask.unwrap()) { - .undef => .fromValue(try pt.undefValue(scalar_ty)), - .a_elem => |src_idx| elem: { - const src_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, src_idx)); - break :elem main_block.addBinOp(l, .array_elem_val, shuffle.operand_a, src_idx_ref).toRef(); - }, - .b_elem => |src_idx| elem: { - const src_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, src_idx)); - break :elem main_block.addBinOp(l, .array_elem_val, shuffle.operand_b, src_idx_ref).toRef(); - }, - }; - }, - .select => { + .select => elem: { const orig_cond = orig.data.pl_op.operand; const orig_bin = l.extraData(Air.Bin, orig.data.pl_op.payload).data; - const res_scalar_ty = res_ty.childType(zcu); - for (elem_buf, 0..) 
|*elem, elem_idx| { - // Payload to be populated later; we need the index early for `br`s. - const elem_block_inst = main_block.add(l, .{ - .tag = .block, - .data = .{ .ty_pl = .{ - .ty = .fromType(res_scalar_ty), - .payload = undefined, - } }, - }); - var elem_block: Block = .init(main_block.stealCapacity(2)); - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const cond = elem_block.addBinOp(l, .array_elem_val, orig_cond, elem_idx_ref).toRef(); - var condbr: CondBr = .init(l, cond, &elem_block, .{}); + const elem_block_inst = loop.block.add(l, .{ + .tag = .block, + .data = .{ .ty_pl = .{ + .ty = .fromType(res_elem_ty), + .payload = undefined, + } }, + }); + var elem_block: Block = .init(loop.block.stealCapacity(2)); + const cond = elem_block.addBinOp(l, .legalize_vec_elem_val, orig_cond, index_val).toRef(); - condbr.then_block = .init(main_block.stealCapacity(2)); - const lhs = condbr.then_block.addBinOp(l, .array_elem_val, orig_bin.lhs, elem_idx_ref).toRef(); - condbr.then_block.addBr(l, elem_block_inst, lhs); + var condbr: CondBr = .init(l, cond, &elem_block, .{}); - condbr.else_block = .init(main_block.stealCapacity(2)); - const rhs = condbr.else_block.addBinOp(l, .array_elem_val, orig_bin.rhs, elem_idx_ref).toRef(); - condbr.else_block.addBr(l, elem_block_inst, rhs); + condbr.then_block = .init(loop.block.stealCapacity(2)); + const lhs = condbr.then_block.addBinOp(l, .legalize_vec_elem_val, orig_bin.lhs, index_val).toRef(); + condbr.then_block.addBr(l, elem_block_inst, lhs); - try condbr.finish(l); + condbr.else_block = .init(loop.block.stealCapacity(2)); + const rhs = condbr.else_block.addBinOp(l, .legalize_vec_elem_val, orig_bin.rhs, index_val).toRef(); + condbr.else_block.addBr(l, elem_block_inst, rhs); - const inst_data = l.air_instructions.items(.data); - inst_data[@intFromEnum(elem_block_inst)].ty_pl.payload = try l.addBlockBody(elem_block.body()); + try condbr.finish(l); - elem.* = elem_block_inst.toRef(); - } + const 
inst_data = l.air_instructions.items(.data); + inst_data[@intFromEnum(elem_block_inst)].ty_pl.payload = try l.addBlockBody(elem_block.body()); + + break :elem elem_block_inst.toRef(); }, + }; + _ = loop.block.stealCapacity(max_inst_per_form - inst_per_form); + if (result_is_array) { + const elem_ptr = loop.block.add(l, .{ + .tag = .ptr_elem_ptr, + .data = .{ .ty_pl = .{ + .ty = .fromType(try pt.singleMutPtrType(res_elem_ty)), + .payload = try l.addExtra(Air.Bin, .{ + .lhs = result_ptr, + .rhs = index_val, + }), + } }, + }).toRef(); + _ = loop.block.addBinOp(l, .store, elem_ptr, elem_val); + } else { + _ = loop.block.add(l, .{ + .tag = .legalize_vec_store_elem, + .data = .{ .pl_op = .{ + .operand = result_ptr, + .payload = try l.addExtra(Air.Bin, .{ + .lhs = index_val, + .rhs = elem_val, + }), + } }, + }); + _ = loop.block.stealCapacity(1); } + const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, res_len - 1))).toRef(); - const result = main_block.add(l, .{ - .tag = .aggregate_init, - .data = .{ .ty_pl = .{ - .ty = .fromType(res_ty), - .payload = payload: { - const idx = l.air_extra.items.len; - try l.air_extra.appendSlice(gpa, @ptrCast(elem_buf)); - break :payload @intCast(idx); - }, - } }, - }).toRef(); + var condbr: CondBr = .init(l, is_end_val, &loop.block, .{}); + condbr.then_block = .init(loop.block.stealRemainingCapacity()); + const result_val = condbr.then_block.addTyOp(l, .load, res_ty, result_ptr).toRef(); + condbr.then_block.addBr(l, orig_inst, result_val); - main_block.addBr(l, orig_inst, result); + condbr.else_block = .init(condbr.then_block.stealRemainingCapacity()); + const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef(); + _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val); + _ = condbr.else_block.add(l, .{ + .tag = .repeat, + .data = .{ .repeat = .{ .loop_inst = loop.inst } }, + }); - // Some `form` values may intentionally not use the full instruction 
buffer. - switch (form) { - .un_op, - .ty_op, - .bin_op, - .pl_op_bin, - .cmp_vector, - .select, - => {}, - .shuffle_one, - .shuffle_two, - => _ = main_block.stealRemainingCapacity(), - } + try condbr.finish(l); + + try loop.finish(l); return .{ .ty_pl = .{ .ty = .fromType(res_ty), .payload = try l.addBlockBody(main_block.body()), } }; } -fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?Air.Inst.Data { +fn scalarizeShuffleOneBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data { const pt = l.pt; const zcu = pt.zcu; const gpa = zcu.gpa; + const shuffle = l.getTmpAir().unwrapShuffleOne(zcu, orig_inst); + + // We're going to emit something like this: + // + // var x: @Vector(N, T) = all_comptime_known_elems; + // for (out_idxs, in_idxs) |i, j| x[i] = operand[j]; + // + // So we must first compute `out_idxs` and `in_idxs`. + var sfba_state = std.heap.stackFallback(512, gpa); const sfba = sfba_state.get(); + const out_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len); + defer sfba.free(out_idxs_buf); + + const in_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len); + defer sfba.free(in_idxs_buf); + + var n: usize = 0; + for (shuffle.mask, 0..) 
|mask, out_idx| switch (mask.unwrap()) { + .value => {}, + .elem => |in_idx| { + out_idxs_buf[n] = (try pt.intValue(.usize, out_idx)).toIntern(); + in_idxs_buf[n] = (try pt.intValue(.usize, in_idx)).toIntern(); + n += 1; + }, + }; + + const init_val: Value = init: { + const undef_val = try pt.undefValue(shuffle.result_ty.childType(zcu)); + const elems = try sfba.alloc(InternPool.Index, shuffle.mask.len); + defer sfba.free(elems); + for (shuffle.mask, elems) |mask, *elem| elem.* = switch (mask.unwrap()) { + .value => |ip_index| ip_index, + .elem => undef_val.toIntern(), + }; + break :init try pt.aggregateValue(shuffle.result_ty, elems); + }; + + // %1 = block(@Vector(N, T), { + // %2 = alloc(*@Vector(N, T)) + // %3 = alloc(*usize) + // %4 = store(%2, ) + // %5 = [addScalarizedShuffle] + // %6 = load(%2) + // %7 = br(%1, %6) + // }) + + var inst_buf: [6]Air.Inst.Index = undefined; + var main_block: Block = .init(&inst_buf); + try l.air_instructions.ensureUnusedCapacity(gpa, 19); + + const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(shuffle.result_ty)).toRef(); + const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef(); + + _ = main_block.addBinOp(l, .store, result_ptr, .fromValue(init_val)); + + try l.addScalarizedShuffle( + &main_block, + shuffle.operand, + result_ptr, + index_ptr, + out_idxs_buf[0..n], + in_idxs_buf[0..n], + ); + + const result_val = main_block.addTyOp(l, .load, shuffle.result_ty, result_ptr).toRef(); + main_block.addBr(l, orig_inst, result_val); + + return .{ .ty_pl = .{ + .ty = .fromType(shuffle.result_ty), + .payload = try l.addBlockBody(main_block.body()), + } }; +} +fn scalarizeShuffleTwoBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data { + const pt = l.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + + const shuffle = l.getTmpAir().unwrapShuffleTwo(zcu, orig_inst); + + // We're going to emit something like this: + // + // var x: @Vector(N, T) = undefined; + // for (out_idxs_a, in_idxs_a) 
|i, j| x[i] = operand_a[j]; + // for (out_idxs_b, in_idxs_b) |i, j| x[i] = operand_b[j]; + // + // The AIR will look like this: + // + // %1 = block(@Vector(N, T), { + // %2 = alloc(*@Vector(N, T)) + // %3 = alloc(*usize) + // %4 = store(%2, <@Vector(N, T), undefined>) + // %5 = [addScalarizedShuffle] + // %6 = [addScalarizedShuffle] + // %7 = load(%2) + // %8 = br(%1, %7) + // }) + + var sfba_state = std.heap.stackFallback(512, gpa); + const sfba = sfba_state.get(); + + const out_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len); + defer sfba.free(out_idxs_buf); + + const in_idxs_buf = try sfba.alloc(InternPool.Index, shuffle.mask.len); + defer sfba.free(in_idxs_buf); + + // Iterate `shuffle.mask` before doing anything, because modifying AIR invalidates it. + const out_idxs_a, const in_idxs_a, const out_idxs_b, const in_idxs_b = idxs: { + var n: usize = 0; + for (shuffle.mask, 0..) |mask, out_idx| switch (mask.unwrap()) { + .undef, .b_elem => {}, + .a_elem => |in_idx| { + out_idxs_buf[n] = (try pt.intValue(.usize, out_idx)).toIntern(); + in_idxs_buf[n] = (try pt.intValue(.usize, in_idx)).toIntern(); + n += 1; + }, + }; + const a_len = n; + for (shuffle.mask, 0..) 
|mask, out_idx| switch (mask.unwrap()) { + .undef, .a_elem => {}, + .b_elem => |in_idx| { + out_idxs_buf[n] = (try pt.intValue(.usize, out_idx)).toIntern(); + in_idxs_buf[n] = (try pt.intValue(.usize, in_idx)).toIntern(); + n += 1; + }, + }; + break :idxs .{ + out_idxs_buf[0..a_len], + in_idxs_buf[0..a_len], + out_idxs_buf[a_len..n], + in_idxs_buf[a_len..n], + }; + }; + + var inst_buf: [7]Air.Inst.Index = undefined; + var main_block: Block = .init(&inst_buf); + try l.air_instructions.ensureUnusedCapacity(gpa, 33); + + const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(shuffle.result_ty)).toRef(); + const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef(); + + _ = main_block.addBinOp(l, .store, result_ptr, .fromValue(try pt.undefValue(shuffle.result_ty))); + + if (out_idxs_a.len == 0) { + _ = main_block.stealCapacity(1); + } else { + try l.addScalarizedShuffle( + &main_block, + shuffle.operand_a, + result_ptr, + index_ptr, + out_idxs_a, + in_idxs_a, + ); + } + + if (out_idxs_b.len == 0) { + _ = main_block.stealCapacity(1); + } else { + try l.addScalarizedShuffle( + &main_block, + shuffle.operand_b, + result_ptr, + index_ptr, + out_idxs_b, + in_idxs_b, + ); + } + + const result_val = main_block.addTyOp(l, .load, shuffle.result_ty, result_ptr).toRef(); + main_block.addBr(l, orig_inst, result_val); + + return .{ .ty_pl = .{ + .ty = .fromType(shuffle.result_ty), + .payload = try l.addBlockBody(main_block.body()), + } }; +} +/// Adds code to `parent_block` which behaves like this loop: +/// +/// for (out_idxs, in_idxs) |i, j| result_vec_ptr[i] = operand_vec[j]; +/// +/// The actual AIR adds exactly one instruction to `parent_block` itself and 14 instructions +/// overall, and is as follows: +/// +/// %1 = block(void, { +/// %2 = store(index_ptr, @zero_usize) +/// %3 = loop({ +/// %4 = load(index_ptr) +/// %5 = ptr_elem_val(out_idxs_ptr, %4) +/// %6 = ptr_elem_val(in_idxs_ptr, %4) +/// %7 = legalize_vec_elem_val(operand_vec, %6) +/// %8 = 
legalize_vec_store_elem(result_vec_ptr, %4, %7) +/// %9 = cmp_eq(%4, ) +/// %10 = cond_br(%9, { +/// %11 = br(%1, @void_value) +/// }, { +/// %12 = add(%4, @one_usize) +/// %13 = store(index_ptr, %12) +/// %14 = repeat(%3) +/// }) +/// }) +/// }) +/// +/// The caller is responsible for reserving space in `l.air_instructions`. +fn addScalarizedShuffle( + l: *Legalize, + parent_block: *Block, + operand_vec: Air.Inst.Ref, + result_vec_ptr: Air.Inst.Ref, + index_ptr: Air.Inst.Ref, + out_idxs: []const InternPool.Index, + in_idxs: []const InternPool.Index, +) Error!void { + const pt = l.pt; + + assert(out_idxs.len == in_idxs.len); + const n = out_idxs.len; + + const idxs_ty = try pt.arrayType(.{ .len = n, .child = .usize_type }); + const idxs_ptr_ty = try pt.singleConstPtrType(idxs_ty); + const manyptr_usize_ty = try pt.manyConstPtrType(.usize); + + const out_idxs_ptr = try pt.intern(.{ .ptr = .{ + .ty = manyptr_usize_ty.toIntern(), + .base_addr = .{ .uav = .{ + .val = (try pt.aggregateValue(idxs_ty, out_idxs)).toIntern(), + .orig_ty = idxs_ptr_ty.toIntern(), + } }, + .byte_offset = 0, + } }); + const in_idxs_ptr = try pt.intern(.{ .ptr = .{ + .ty = manyptr_usize_ty.toIntern(), + .base_addr = .{ .uav = .{ + .val = (try pt.aggregateValue(idxs_ty, in_idxs)).toIntern(), + .orig_ty = idxs_ptr_ty.toIntern(), + } }, + .byte_offset = 0, + } }); + + const main_block_inst = parent_block.add(l, .{ + .tag = .block, + .data = .{ .ty_pl = .{ + .ty = .void_type, + .payload = undefined, + } }, + }); + + var inst_buf: [13]Air.Inst.Index = undefined; + var main_block: Block = .init(&inst_buf); + + _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize); + + var loop: Loop = .init(l, &main_block); + loop.block = .init(main_block.stealRemainingCapacity()); + + const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef(); + const in_idx_val = loop.block.addBinOp(l, .ptr_elem_val, .fromIntern(in_idxs_ptr), index_val).toRef(); + const out_idx_val = loop.block.addBinOp(l, 
.ptr_elem_val, .fromIntern(out_idxs_ptr), index_val).toRef(); + + const elem_val = loop.block.addBinOp(l, .legalize_vec_elem_val, operand_vec, in_idx_val).toRef(); + _ = loop.block.add(l, .{ + .tag = .legalize_vec_store_elem, + .data = .{ .pl_op = .{ + .operand = result_vec_ptr, + .payload = try l.addExtra(Air.Bin, .{ + .lhs = out_idx_val, + .rhs = elem_val, + }), + } }, + }); + + const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, n - 1))).toRef(); + var condbr: CondBr = .init(l, is_end_val, &loop.block, .{}); + condbr.then_block = .init(loop.block.stealRemainingCapacity()); + condbr.then_block.addBr(l, main_block_inst, .void_value); + + condbr.else_block = .init(condbr.then_block.stealRemainingCapacity()); + const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef(); + _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val); + _ = condbr.else_block.add(l, .{ + .tag = .repeat, + .data = .{ .repeat = .{ .loop_inst = loop.inst } }, + }); + + try condbr.finish(l); + try loop.finish(l); + + const inst_data = l.air_instructions.items(.data); + inst_data[@intFromEnum(main_block_inst)].ty_pl.payload = try l.addBlockBody(main_block.body()); +} +fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!?Air.Inst.Data { + const pt = l.pt; + const zcu = pt.zcu; + const ty_op = l.air_instructions.items(.data)[@intFromEnum(orig_inst)].ty_op; const dest_ty = ty_op.ty.toType(); @@ -920,72 +1256,204 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!? 
const uint_ty = try pt.intType(.unsigned, num_bits); const shift_ty = try pt.intType(.unsigned, std.math.log2_int_ceil(u16, num_bits)); - const inst_buf = try sfba.alloc(Air.Inst.Index, len: { - const operand_to_uint_len: u64 = if (operand_legal) 1 else (operand_ty.arrayLen(zcu) * 5); - const uint_to_dest_len: u64 = if (dest_legal) 1 else (dest_ty.arrayLen(zcu) * 3 + 1); - break :len @intCast(operand_to_uint_len + uint_to_dest_len + 1); - }); - defer sfba.free(inst_buf); - var main_block: Block = .init(inst_buf); - try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len); + var inst_buf: [39]Air.Inst.Index = undefined; + var main_block: Block = .init(&inst_buf); + try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len); // First, convert `operand_ty` to `uint_ty` (`uN`). const uint_val: Air.Inst.Ref = uint_val: { - if (operand_legal) break :uint_val main_block.addBitCast(l, uint_ty, ty_op.operand); - - const bits_per_elem: u16 = @intCast(operand_ty.childType(zcu).bitSize(zcu)); - const bits_per_elem_ref: Air.Inst.Ref = .fromValue(try pt.intValue(shift_ty, bits_per_elem)); - const elem_uint_ty = try pt.intType(.unsigned, bits_per_elem); - - var cur_uint: Air.Inst.Ref = .fromValue(try pt.intValue(uint_ty, 0)); - var elem_idx = operand_ty.arrayLen(zcu); - while (elem_idx > 0) { - elem_idx -= 1; - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const orig_elem = main_block.addBinOp(l, .array_elem_val, ty_op.operand, elem_idx_ref).toRef(); - const elem_as_uint = main_block.addBitCast(l, elem_uint_ty, orig_elem); - const elem_extended = main_block.addTyOp(l, .intcast, uint_ty, elem_as_uint).toRef(); - cur_uint = main_block.addBinOp(l, .shl_exact, cur_uint, bits_per_elem_ref).toRef(); - cur_uint = main_block.addBinOp(l, .bit_or, cur_uint, elem_extended).toRef(); + if (operand_legal) { + _ = main_block.stealCapacity(19); + break :uint_val main_block.addBitCast(l, uint_ty, ty_op.operand); } - break :uint_val cur_uint; + + 
// %1 = block({ + // %2 = alloc(*usize) + // %3 = alloc(*uN) + // %4 = store(%2, ) + // %5 = store(%3, ) + // %6 = loop({ + // %7 = load(%2) + // %8 = array_elem_val(orig_operand, %7) + // %9 = bitcast(uE, %8) + // %10 = intcast(uN, %9) + // %11 = load(%3) + // %12 = shl_exact(%11, ) + // %13 = bit_or(%12, %10) + // %14 = cmp_eq(%4, @zero_usize) + // %15 = cond_br(%14, { + // %16 = br(%1, %13) + // }, { + // %17 = store(%3, %13) + // %18 = sub(%7, @one_usize) + // %19 = store(%2, %18) + // %20 = repeat(%6) + // }) + // }) + // }) + + const elem_bits = operand_ty.childType(zcu).bitSize(zcu); + const elem_bits_val = try pt.intValue(shift_ty, elem_bits); + const elem_uint_ty = try pt.intType(.unsigned, @intCast(elem_bits)); + + const uint_block_inst = main_block.add(l, .{ + .tag = .block, + .data = .{ .ty_pl = .{ + .ty = .fromType(uint_ty), + .payload = undefined, + } }, + }); + var uint_block: Block = .init(main_block.stealCapacity(19)); + + const index_ptr = uint_block.addTy(l, .alloc, .ptr_usize).toRef(); + const result_ptr = uint_block.addTy(l, .alloc, try pt.singleMutPtrType(uint_ty)).toRef(); + _ = uint_block.addBinOp( + l, + .store, + index_ptr, + .fromValue(try pt.intValue(.usize, operand_ty.arrayLen(zcu))), + ); + _ = uint_block.addBinOp(l, .store, result_ptr, .fromValue(try pt.intValue(uint_ty, 0))); + + var loop: Loop = .init(l, &uint_block); + loop.block = .init(uint_block.stealRemainingCapacity()); + + const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef(); + const raw_elem = loop.block.addBinOp( + l, + if (operand_ty.zigTypeTag(zcu) == .vector) .legalize_vec_elem_val else .array_elem_val, + ty_op.operand, + index_val, + ).toRef(); + const elem_uint = loop.block.addBitCast(l, elem_uint_ty, raw_elem); + const elem_extended = loop.block.addTyOp(l, .intcast, uint_ty, elem_uint).toRef(); + const old_result = loop.block.addTyOp(l, .load, uint_ty, result_ptr).toRef(); + const shifted_result = loop.block.addBinOp(l, .shl_exact, old_result, 
.fromValue(elem_bits_val)).toRef(); + const new_result = loop.block.addBinOp(l, .bit_or, shifted_result, elem_extended).toRef(); + + const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .zero_usize).toRef(); + var condbr: CondBr = .init(l, is_end_val, &loop.block, .{}); + + condbr.then_block = .init(loop.block.stealRemainingCapacity()); + condbr.then_block.addBr(l, uint_block_inst, new_result); + + condbr.else_block = .init(condbr.then_block.stealRemainingCapacity()); + _ = condbr.else_block.addBinOp(l, .store, result_ptr, new_result); + const new_index_val = condbr.else_block.addBinOp(l, .sub, index_val, .one_usize).toRef(); + _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val); + _ = condbr.else_block.add(l, .{ + .tag = .repeat, + .data = .{ .repeat = .{ .loop_inst = loop.inst } }, + }); + + try condbr.finish(l); + try loop.finish(l); + + const inst_data = l.air_instructions.items(.data); + inst_data[@intFromEnum(uint_block_inst)].ty_pl.payload = try l.addBlockBody(uint_block.body()); + + break :uint_val uint_block_inst.toRef(); }; // Now convert `uint_ty` (`uN`) to `dest_ty`. 
- const result: Air.Inst.Ref = result: { - if (dest_legal) break :result main_block.addBitCast(l, dest_ty, uint_val); + if (dest_legal) { + _ = main_block.stealCapacity(17); + const result = main_block.addBitCast(l, dest_ty, uint_val); + main_block.addBr(l, orig_inst, result); + } else { + // %1 = alloc(*usize) + // %2 = alloc(*@Vector(N, Result)) + // %3 = store(%1, @zero_usize) + // %4 = loop({ + // %5 = load(%1) + // %6 = mul(%5, ) + // %7 = intcast(uS, %6) + // %8 = shr(uint_val, %7) + // %9 = trunc(uE, %8) + // %10 = bitcast(Result, %9) + // %11 = legalize_vec_store_elem(%2, %5, %10) + // %12 = cmp_eq(%5, ) + // %13 = cond_br(%12, { + // %14 = load(%2) + // %15 = br(%0, %14) + // }, { + // %16 = add(%5, @one_usize) + // %17 = store(%1, %16) + // %18 = repeat(%4) + // }) + // }) + // + // The result might be an array, in which case `legalize_vec_store_elem` + // becomes `ptr_elem_ptr` followed by `store`. const elem_ty = dest_ty.childType(zcu); - const bits_per_elem: u16 = @intCast(elem_ty.bitSize(zcu)); - const bits_per_elem_ref: Air.Inst.Ref = .fromValue(try pt.intValue(shift_ty, bits_per_elem)); - const elem_uint_ty = try pt.intType(.unsigned, bits_per_elem); + const elem_bits = elem_ty.bitSize(zcu); + const elem_uint_ty = try pt.intType(.unsigned, @intCast(elem_bits)); - const elem_buf = try sfba.alloc(Air.Inst.Ref, dest_ty.arrayLen(zcu)); - defer sfba.free(elem_buf); + const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef(); + const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(dest_ty)).toRef(); + _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize); - var cur_uint = uint_val; - for (elem_buf) |*elem| { - const elem_as_uint = main_block.addTyOp(l, .trunc, elem_uint_ty, cur_uint).toRef(); - elem.* = main_block.addBitCast(l, elem_ty, elem_as_uint); - cur_uint = main_block.addBinOp(l, .shr, cur_uint, bits_per_elem_ref).toRef(); + var loop: Loop = .init(l, &main_block); + loop.block = 
.init(main_block.stealRemainingCapacity()); + + const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef(); + const bit_offset = loop.block.addBinOp(l, .mul, index_val, .fromValue(try pt.intValue(.usize, elem_bits))).toRef(); + const casted_bit_offset = loop.block.addTyOp(l, .intcast, shift_ty, bit_offset).toRef(); + const shifted_uint = loop.block.addBinOp(l, .shr, index_val, casted_bit_offset).toRef(); + const elem_uint = loop.block.addTyOp(l, .trunc, elem_uint_ty, shifted_uint).toRef(); + const elem_val = loop.block.addBitCast(l, elem_ty, elem_uint); + switch (dest_ty.zigTypeTag(zcu)) { + .array => { + const elem_ptr = loop.block.add(l, .{ + .tag = .ptr_elem_ptr, + .data = .{ .ty_pl = .{ + .ty = .fromType(try pt.singleMutPtrType(elem_ty)), + .payload = try l.addExtra(Air.Bin, .{ + .lhs = result_ptr, + .rhs = index_val, + }), + } }, + }).toRef(); + _ = loop.block.addBinOp(l, .store, elem_ptr, elem_val); + }, + .vector => { + _ = loop.block.add(l, .{ + .tag = .legalize_vec_store_elem, + .data = .{ .pl_op = .{ + .operand = result_ptr, + .payload = try l.addExtra(Air.Bin, .{ + .lhs = index_val, + .rhs = elem_val, + }), + } }, + }); + _ = loop.block.stealCapacity(1); + }, + else => unreachable, } - break :result main_block.add(l, .{ - .tag = .aggregate_init, - .data = .{ .ty_pl = .{ - .ty = .fromType(dest_ty), - .payload = payload: { - const idx = l.air_extra.items.len; - try l.air_extra.appendSlice(gpa, @ptrCast(elem_buf)); - break :payload @intCast(idx); - }, - } }, - }).toRef(); - }; + const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, dest_ty.arrayLen(zcu) - 1))).toRef(); - main_block.addBr(l, orig_inst, result); + var condbr: CondBr = .init(l, is_end_val, &loop.block, .{}); + + condbr.then_block = .init(loop.block.stealRemainingCapacity()); + const result_val = condbr.then_block.addTyOp(l, .load, dest_ty, result_ptr).toRef(); + condbr.then_block.addBr(l, orig_inst, result_val); + + condbr.else_block = 
.init(condbr.then_block.stealRemainingCapacity()); + const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef(); + _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val); + _ = condbr.else_block.add(l, .{ + .tag = .repeat, + .data = .{ .repeat = .{ .loop_inst = loop.inst } }, + }); + + try condbr.finish(l); + try loop.finish(l); + } return .{ .ty_pl = .{ .ty = .fromType(dest_ty), @@ -995,10 +1463,6 @@ fn scalarizeBitcastBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!? fn scalarizeOverflowBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error!Air.Inst.Data { const pt = l.pt; const zcu = pt.zcu; - const gpa = zcu.gpa; - - var sfba_state = std.heap.stackFallback(512, gpa); - const sfba = sfba_state.get(); const orig = l.air_instructions.get(@intFromEnum(orig_inst)); const orig_operands = l.extraData(Air.Bin, orig.data.ty_pl.payload).data; @@ -1015,89 +1479,127 @@ fn scalarizeOverflowBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index) Error! 
const scalar_int_ty = vec_int_ty.childType(zcu); const scalar_tuple_ty = try pt.overflowArithmeticTupleType(scalar_int_ty); + // %1 = block(struct { @Vector(N, Int), @Vector(N, u1) }, { + // %2 = alloc(*usize) + // %3 = alloc(*struct { @Vector(N, Int), @Vector(N, u1) }) + // %4 = struct_field_ptr_index_0(*@Vector(N, Int), %3) + // %5 = struct_field_ptr_index_1(*@Vector(N, u1), %3) + // %6 = store(%2, @zero_usize) + // %7 = loop({ + // %8 = load(%2) + // %9 = legalize_vec_elem_val(orig_lhs, %8) + // %10 = legalize_vec_elem_val(orig_rhs, %8) + // %11 = ???_with_overflow(struct { Int, u1 }, %9, %10) + // %12 = struct_field_val(%11, 0) + // %13 = struct_field_val(%11, 1) + // %14 = legalize_vec_store_elem(%4, %8, %12) + // %15 = legalize_vec_store_elem(%4, %8, %13) + // %16 = cmp_eq(%8, ) + // %17 = cond_br(%16, { + // %18 = load(%3) + // %19 = br(%1, %18) + // }, { + // %20 = add(%8, @one_usize) + // %21 = store(%2, %20) + // %22 = repeat(%7) + // }) + // }) + // }) + const elems_len = vec_int_ty.vectorLen(zcu); - const inst_buf = try sfba.alloc(Air.Inst.Index, 5 * elems_len + 4); - defer sfba.free(inst_buf); + var inst_buf: [21]Air.Inst.Index = undefined; + var main_block: Block = .init(&inst_buf); + try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len); - var main_block: Block = .init(inst_buf); - try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len); + const index_ptr = main_block.addTy(l, .alloc, .ptr_usize).toRef(); + const result_ptr = main_block.addTy(l, .alloc, try pt.singleMutPtrType(vec_tuple_ty)).toRef(); + const result_int_ptr = main_block.addTyOp( + l, + .struct_field_ptr_index_0, + try pt.singleMutPtrType(vec_int_ty), + result_ptr, + ).toRef(); + const result_overflow_ptr = main_block.addTyOp( + l, + .struct_field_ptr_index_1, + try pt.singleMutPtrType(vec_overflow_ty), + result_ptr, + ).toRef(); - const int_elem_buf = try sfba.alloc(Air.Inst.Ref, elems_len); - defer sfba.free(int_elem_buf); - const overflow_elem_buf = try 
sfba.alloc(Air.Inst.Ref, elems_len); - defer sfba.free(overflow_elem_buf); + _ = main_block.addBinOp(l, .store, index_ptr, .zero_usize); - for (int_elem_buf, overflow_elem_buf, 0..) |*int_elem, *overflow_elem, elem_idx| { - const elem_idx_ref: Air.Inst.Ref = .fromValue(try pt.intValue(.usize, elem_idx)); - const lhs = main_block.addBinOp(l, .array_elem_val, orig_operands.lhs, elem_idx_ref).toRef(); - const rhs = main_block.addBinOp(l, .array_elem_val, orig_operands.rhs, elem_idx_ref).toRef(); - const elem_result = main_block.add(l, .{ - .tag = orig.tag, - .data = .{ .ty_pl = .{ - .ty = .fromType(scalar_tuple_ty), - .payload = try l.addExtra(Air.Bin, .{ .lhs = lhs, .rhs = rhs }), - } }, - }).toRef(); - int_elem.* = main_block.add(l, .{ - .tag = .struct_field_val, - .data = .{ .ty_pl = .{ - .ty = .fromType(scalar_int_ty), - .payload = try l.addExtra(Air.StructField, .{ - .struct_operand = elem_result, - .field_index = 0, - }), - } }, - }).toRef(); - overflow_elem.* = main_block.add(l, .{ - .tag = .struct_field_val, - .data = .{ .ty_pl = .{ - .ty = .bool_type, - .payload = try l.addExtra(Air.StructField, .{ - .struct_operand = elem_result, - .field_index = 1, - }), - } }, - }).toRef(); - } + var loop: Loop = .init(l, &main_block); + loop.block = .init(main_block.stealRemainingCapacity()); - const int_vec = main_block.add(l, .{ - .tag = .aggregate_init, + const index_val = loop.block.addTyOp(l, .load, .usize, index_ptr).toRef(); + const lhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operands.lhs, index_val).toRef(); + const rhs = loop.block.addBinOp(l, .legalize_vec_elem_val, orig_operands.rhs, index_val).toRef(); + const elem_result = loop.block.add(l, .{ + .tag = orig.tag, .data = .{ .ty_pl = .{ - .ty = .fromType(vec_int_ty), - .payload = payload: { - const idx = l.air_extra.items.len; - try l.air_extra.appendSlice(gpa, @ptrCast(int_elem_buf)); - break :payload @intCast(idx); - }, + .ty = .fromType(scalar_tuple_ty), + .payload = try l.addExtra(Air.Bin, .{ 
.lhs = lhs, .rhs = rhs }), } }, }).toRef(); - const overflow_vec = main_block.add(l, .{ - .tag = .aggregate_init, + const int_elem = loop.block.add(l, .{ + .tag = .struct_field_val, .data = .{ .ty_pl = .{ - .ty = .fromType(vec_overflow_ty), - .payload = payload: { - const idx = l.air_extra.items.len; - try l.air_extra.appendSlice(gpa, @ptrCast(overflow_elem_buf)); - break :payload @intCast(idx); - }, + .ty = .fromType(scalar_int_ty), + .payload = try l.addExtra(Air.StructField, .{ + .struct_operand = elem_result, + .field_index = 0, + }), } }, }).toRef(); - - const tuple_elems: [2]Air.Inst.Ref = .{ int_vec, overflow_vec }; - const result = main_block.add(l, .{ - .tag = .aggregate_init, + const overflow_elem = loop.block.add(l, .{ + .tag = .struct_field_val, .data = .{ .ty_pl = .{ - .ty = .fromType(vec_tuple_ty), - .payload = payload: { - const idx = l.air_extra.items.len; - try l.air_extra.appendSlice(gpa, @ptrCast(&tuple_elems)); - break :payload @intCast(idx); - }, + .ty = .u1_type, + .payload = try l.addExtra(Air.StructField, .{ + .struct_operand = elem_result, + .field_index = 1, + }), } }, }).toRef(); + _ = loop.block.add(l, .{ + .tag = .legalize_vec_store_elem, + .data = .{ .pl_op = .{ + .operand = result_int_ptr, + .payload = try l.addExtra(Air.Bin, .{ + .lhs = index_val, + .rhs = int_elem, + }), + } }, + }); + _ = loop.block.add(l, .{ + .tag = .legalize_vec_store_elem, + .data = .{ .pl_op = .{ + .operand = result_overflow_ptr, + .payload = try l.addExtra(Air.Bin, .{ + .lhs = index_val, + .rhs = overflow_elem, + }), + } }, + }); - main_block.addBr(l, orig_inst, result); + const is_end_val = loop.block.addBinOp(l, .cmp_eq, index_val, .fromValue(try pt.intValue(.usize, elems_len - 1))).toRef(); + var condbr: CondBr = .init(l, is_end_val, &loop.block, .{}); + + condbr.then_block = .init(loop.block.stealRemainingCapacity()); + const result_val = condbr.then_block.addTyOp(l, .load, vec_tuple_ty, result_ptr).toRef(); + condbr.then_block.addBr(l, orig_inst, 
result_val); + + condbr.else_block = .init(condbr.then_block.stealRemainingCapacity()); + const new_index_val = condbr.else_block.addBinOp(l, .add, index_val, .one_usize).toRef(); + _ = condbr.else_block.addBinOp(l, .store, index_ptr, new_index_val); + _ = condbr.else_block.add(l, .{ + .tag = .repeat, + .data = .{ .repeat = .{ .loop_inst = loop.inst } }, + }); + + try condbr.finish(l); + try loop.finish(l); return .{ .ty_pl = .{ .ty = .fromType(vec_tuple_ty), @@ -1288,7 +1790,7 @@ fn safeIntFromFloatBlockPayload(l: *Legalize, orig_inst: Air.Inst.Index, optimiz // We emit 9 instructions in the worst case. var inst_buf: [9]Air.Inst.Index = undefined; - try l.air_instructions.ensureUnusedCapacity(zcu.gpa, inst_buf.len); + try l.air_instructions.ensureUnusedCapacity(gpa, inst_buf.len); var main_block: Block = .init(&inst_buf); // This check is a bit annoying because of floating-point rounding and the fact that this @@ -1771,6 +2273,9 @@ const Block = struct { .data = .{ .br = .{ .block_inst = target, .operand = operand } }, }); } + fn addTy(b: *Block, l: *Legalize, tag: Air.Inst.Tag, ty: Type) Air.Inst.Index { + return b.add(l, .{ .tag = tag, .data = .{ .ty = ty } }); + } fn addBinOp(b: *Block, l: *Legalize, tag: Air.Inst.Tag, lhs: Air.Inst.Ref, rhs: Air.Inst.Ref) Air.Inst.Index { return b.add(l, .{ .tag = tag, @@ -1921,6 +2426,31 @@ const Block = struct { } }; +const Loop = struct { + inst: Air.Inst.Index, + block: Block, + + /// The return value has `block` initialized to `undefined`; it is the caller's reponsibility + /// to initialize it. 
+ fn init(l: *Legalize, parent_block: *Block) Loop { + return .{ + .inst = parent_block.add(l, .{ + .tag = .loop, + .data = .{ .ty_pl = .{ + .ty = .noreturn_type, + .payload = undefined, + } }, + }), + .block = undefined, + }; + } + + fn finish(loop: Loop, l: *Legalize) Error!void { + const data = &l.air_instructions.items(.data)[@intFromEnum(loop.inst)]; + data.ty_pl.payload = try l.addBlockBody(loop.block.body()); + } +}; + const CondBr = struct { inst: Air.Inst.Index, hints: Air.CondBr.BranchHints, diff --git a/src/Air/Liveness.zig b/src/Air/Liveness.zig index ea170d0893..c60ece5e4f 100644 --- a/src/Air/Liveness.zig +++ b/src/Air/Liveness.zig @@ -458,6 +458,7 @@ fn analyzeInst( .memset_safe, .memcpy, .memmove, + .legalize_vec_elem_val, => { const o = inst_datas[@intFromEnum(inst)].bin_op; return analyzeOperands(a, pass, data, inst, .{ o.lhs, o.rhs, .none }); @@ -769,6 +770,12 @@ fn analyzeInst( const pl_op = inst_datas[@intFromEnum(inst)].pl_op; return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, .none, .none }); }, + + .legalize_vec_store_elem => { + const pl_op = inst_datas[@intFromEnum(inst)].pl_op; + const bin = a.air.extraData(Air.Bin, pl_op.payload).data; + return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, bin.lhs, bin.rhs }); + }, } } diff --git a/src/Air/Liveness/Verify.zig b/src/Air/Liveness/Verify.zig index 2f50937bbe..f522e1367e 100644 --- a/src/Air/Liveness/Verify.zig +++ b/src/Air/Liveness/Verify.zig @@ -272,6 +272,7 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { .memset_safe, .memcpy, .memmove, + .legalize_vec_elem_val, => { const bin_op = data[@intFromEnum(inst)].bin_op; try self.verifyInstOperands(inst, .{ bin_op.lhs, bin_op.rhs, .none }); @@ -577,6 +578,11 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { try self.verifyInst(inst); }, + .legalize_vec_store_elem => { + const pl_op = data[@intFromEnum(inst)].pl_op; + const bin = self.air.extraData(Air.Bin, 
pl_op.payload).data; + try self.verifyInstOperands(inst, .{ pl_op.operand, bin.lhs, bin.rhs }); + }, } } } diff --git a/src/Air/print.zig b/src/Air/print.zig index 4b44af3206..3324055dc7 100644 --- a/src/Air/print.zig +++ b/src/Air/print.zig @@ -171,6 +171,7 @@ const Writer = struct { .memmove, .memset, .memset_safe, + .legalize_vec_elem_val, => try w.writeBinOp(s, inst), .is_null, @@ -331,6 +332,7 @@ const Writer = struct { .reduce, .reduce_optimized => try w.writeReduce(s, inst), .cmp_vector, .cmp_vector_optimized => try w.writeCmpVector(s, inst), .runtime_nav_ptr => try w.writeRuntimeNavPtr(s, inst), + .legalize_vec_store_elem => try w.writeLegalizeVecStoreElem(s, inst), .work_item_id, .work_group_size, @@ -508,6 +510,18 @@ const Writer = struct { try w.writeOperand(s, inst, 2, pl_op.operand); } + fn writeLegalizeVecStoreElem(w: *Writer, s: *std.Io.Writer, inst: Air.Inst.Index) Error!void { + const pl_op = w.air.instructions.items(.data)[@intFromEnum(inst)].pl_op; + const bin = w.air.extraData(Air.Bin, pl_op.payload).data; + + try w.writeOperand(s, inst, 0, pl_op.operand); + try s.writeAll(", "); + try w.writeOperand(s, inst, 1, bin.lhs); + try s.writeAll(", "); + try w.writeOperand(s, inst, 2, bin.rhs); + try s.writeAll(", "); + } + fn writeShuffleOne(w: *Writer, s: *std.Io.Writer, inst: Air.Inst.Index) Error!void { const unwrapped = w.air.unwrapShuffleOne(w.pt.zcu, inst); try w.writeType(s, unwrapped.result_ty); diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig index cac981cb00..d90550982d 100644 --- a/src/Air/types_resolved.zig +++ b/src/Air/types_resolved.zig @@ -88,6 +88,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { .atomic_store_monotonic, .atomic_store_release, .atomic_store_seq_cst, + .legalize_vec_elem_val, => { if (!checkRef(data.bin_op.lhs, zcu)) return false; if (!checkRef(data.bin_op.rhs, zcu)) return false; @@ -322,6 +323,7 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { 
.select, .mul_add, + .legalize_vec_store_elem, => { const bin = air.extraData(Air.Bin, data.pl_op.payload).data; if (!checkRef(data.pl_op.operand, zcu)) return false; diff --git a/src/Sema.zig b/src/Sema.zig index 8b5c7c5de2..7974b47913 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -15930,16 +15930,21 @@ fn zirOverflowArithmetic( } } // If either of the arguments is one, the result is the other and no overflow occured. - const scalar_one = try pt.intValue(dest_ty.scalarType(zcu), 1); - const vec_one = try sema.splat(dest_ty, scalar_one); - if (maybe_lhs_val) |lhs_val| { - if (!lhs_val.isUndef(zcu) and try sema.compareAll(lhs_val, .eq, vec_one, dest_ty)) { - break :result .{ .overflow_bit = try sema.splat(overflow_ty, .zero_u1), .inst = rhs }; + const dest_scalar_ty = dest_ty.scalarType(zcu); + const dest_scalar_int = dest_scalar_ty.intInfo(zcu); + // We could still be working with i1, where '1' is not a legal value! + if (!(dest_scalar_int.bits == 1 and dest_scalar_int.signedness == .signed)) { + const scalar_one = try pt.intValue(dest_scalar_ty, 1); + const vec_one = try sema.splat(dest_ty, scalar_one); + if (maybe_lhs_val) |lhs_val| { + if (!lhs_val.isUndef(zcu) and try sema.compareAll(lhs_val, .eq, vec_one, dest_ty)) { + break :result .{ .overflow_bit = try sema.splat(overflow_ty, .zero_u1), .inst = rhs }; + } } - } - if (maybe_rhs_val) |rhs_val| { - if (!rhs_val.isUndef(zcu) and try sema.compareAll(rhs_val, .eq, vec_one, dest_ty)) { - break :result .{ .overflow_bit = try sema.splat(overflow_ty, .zero_u1), .inst = lhs }; + if (maybe_rhs_val) |rhs_val| { + if (!rhs_val.isUndef(zcu) and try sema.compareAll(rhs_val, .eq, vec_one, dest_ty)) { + break :result .{ .overflow_bit = try sema.splat(overflow_ty, .zero_u1), .inst = lhs }; + } } } diff --git a/src/codegen/aarch64/Select.zig b/src/codegen/aarch64/Select.zig index 36ca69e589..64aeeb7ff4 100644 --- a/src/codegen/aarch64/Select.zig +++ b/src/codegen/aarch64/Select.zig @@ -134,6 +134,10 @@ pub fn analyze(isel: 
*Select, air_body: []const Air.Inst.Index) !void { var air_inst_index = air_body[air_body_index]; const initial_def_order_len = isel.def_order.count(); air_tag: switch (air_tags[@intFromEnum(air_inst_index)]) { + // No "scalarize" legalizations are enabled, so these instructions never appear. + .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .arg, .ret_addr, .frame_addr, @@ -950,6 +954,11 @@ pub fn body(isel: *Select, air_body: []const Air.Inst.Index) error{ OutOfMemory, }; air_tag: switch (air.next().?) { else => |air_tag| return isel.fail("unimplemented {t}", .{air_tag}), + + // No "scalarize" legalizations are enabled, so these instructions never appear. + .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .arg => { const arg_vi = isel.live_values.fetchRemove(air.inst_index).?.value; defer arg_vi.deref(isel); diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 0abea3d503..a19c4bb346 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -3325,6 +3325,10 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) Error!void { // zig fmt: off .inferred_alloc, .inferred_alloc_comptime => unreachable, + // No "scalarize" legalizations are enabled, so these instructions never appear. + .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .arg => try airArg(f, inst), .breakpoint => try airBreakpoint(f), diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 1160c2958e..b862a23ddc 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -4886,6 +4886,11 @@ pub const FuncGen = struct { const val: Builder.Value = switch (air_tags[@intFromEnum(inst)]) { // zig fmt: off + + // No "scalarize" legalizations are enabled, so these instructions never appear. 
+ .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .add => try self.airAdd(inst, .normal), .add_optimized => try self.airAdd(inst, .fast), .add_wrap => try self.airAddWrap(inst), diff --git a/src/codegen/riscv64/CodeGen.zig b/src/codegen/riscv64/CodeGen.zig index bf5e5b6718..cdca3c2fd8 100644 --- a/src/codegen/riscv64/CodeGen.zig +++ b/src/codegen/riscv64/CodeGen.zig @@ -1391,6 +1391,11 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void { const tag = air_tags[@intFromEnum(inst)]; switch (tag) { // zig fmt: off + + // No "scalarize" legalizations are enabled, so these instructions never appear. + .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .add, .add_wrap, .sub, diff --git a/src/codegen/sparc64/CodeGen.zig b/src/codegen/sparc64/CodeGen.zig index 684bfcfabb..4cbe07c762 100644 --- a/src/codegen/sparc64/CodeGen.zig +++ b/src/codegen/sparc64/CodeGen.zig @@ -479,6 +479,11 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { self.reused_operands = @TypeOf(self.reused_operands).initEmpty(); switch (air_tags[@intFromEnum(inst)]) { // zig fmt: off + + // No "scalarize" legalizations are enabled, so these instructions never appear. + .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .ptr_add => try self.airPtrArithmetic(inst, .ptr_add), .ptr_sub => try self.airPtrArithmetic(inst, .ptr_sub), diff --git a/src/codegen/wasm/CodeGen.zig b/src/codegen/wasm/CodeGen.zig index b7f7aa151d..684513bf82 100644 --- a/src/codegen/wasm/CodeGen.zig +++ b/src/codegen/wasm/CodeGen.zig @@ -1786,6 +1786,10 @@ fn buildPointerOffset(cg: *CodeGen, ptr_value: WValue, offset: u64, action: enum fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { const air_tags = cg.air.instructions.items(.tag); return switch (air_tags[@intFromEnum(inst)]) { + // No "scalarize" legalizations are enabled, so these instructions never appear. 
+ .legalize_vec_elem_val => unreachable, + .legalize_vec_store_elem => unreachable, + .inferred_alloc, .inferred_alloc_comptime => unreachable, .add => cg.airBinOp(inst, .add), diff --git a/src/codegen/x86_64/CodeGen.zig b/src/codegen/x86_64/CodeGen.zig index 94394185bd..f0772dcd73 100644 --- a/src/codegen/x86_64/CodeGen.zig +++ b/src/codegen/x86_64/CodeGen.zig @@ -103926,7 +103926,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { try ops[0].toOffset(0, cg); try ops[0].finish(inst, &.{ty_op.operand}, &ops, cg); }, - .array_elem_val => { + .array_elem_val, .legalize_vec_elem_val => { const bin_op = air_datas[@intFromEnum(inst)].bin_op; const array_ty = cg.typeOf(bin_op.lhs); const res_ty = array_ty.elemType2(zcu); @@ -173061,6 +173061,634 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .c_va_copy => try cg.airVaCopy(inst), .c_va_end => try cg.airVaEnd(inst), .c_va_start => try cg.airVaStart(inst), + .legalize_vec_store_elem => { + const pl_op = air_datas[@intFromEnum(inst)].pl_op; + const bin = cg.air.extraData(Air.Bin, pl_op.payload).data; + // vector_ptr, index, elem_val + var ops = try cg.tempsFromOperands(inst, .{ pl_op.operand, bin.lhs, bin.rhs }); + cg.select(&.{}, &.{}, &ops, comptime &.{ .{ + .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } }, + }, + .extra_temps = .{ + .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ }, + .{ ._, ._r, .bt, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } }, + }, + .extra_temps = .{ + .{ 
.type = .u8, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ }, + .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ }, + } }, + }, .{ + .required_features = .{ .cmov, null, null, null }, + .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .extra_temps = .{ + .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, + .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ }, + .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ }, + .{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ }, + .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ }, + .{ ._, ._, .mov, .lea(.src0b), .tmp0b, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .{ .ptr_bool_vec = .byte }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .extra_temps = .{ + .{ .type = .u8, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .movzx, .tmp0d, .lea(.src0b), ._, ._ }, + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._nz, .j, .@"0f", ._, ._, ._ }, + .{ ._, ._r, .bt, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._mp, .j, .@"1f", ._, ._, ._ }, + .{ .@"0:", ._s, .bt, .tmp0d, .src1d, ._, ._ }, + .{ .@"1:", ._, .mov, .lea(.src0b), .tmp0b, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .{ 
.ptr_bool_vec = .word }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } }, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._r, .bt, .lea(.src0w), .src1w, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } }, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._s, .bt, .lea(.src0d), .src1d, ._, ._ }, + } }, + }, .{ + .required_features = .{ .cmov, null, null, null }, + .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .extra_temps = .{ + .{ .type = .u16, .kind = .{ .rc = .general_purpose } }, + .{ .type = .u16, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .movzx, .tmp0d, .lea(.src0w), ._, ._ }, + .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ }, + .{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ }, + .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ }, + .{ ._, ._, .mov, .lea(.src0w), .tmp0w, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .{ .ptr_bool_vec = .word }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, + .{ ._, ._r, .bt, .lea(.src0w), .src1w, ._, ._ }, + .{ ._, ._mp, .j, .@"0f", ._, ._, ._ }, + .{ .@"1:", ._s, .bt, .lea(.src0w), .src1w, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .ptr_any_bool_vec, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 0 } } }, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, 
._r, .bt, .lea(.src0d), .src1d, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .ptr_any_bool_vec, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .{ .imm = 1 } } }, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._s, .bt, .lea(.src0d), .src1d, ._, ._ }, + } }, + }, .{ + .required_features = .{ .cmov, null, null, null }, + .src_constraints = .{ .{ .ptr_bool_vec = .dword }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .lea(.src0d), ._, ._ }, + .{ ._, ._, .mov, .tmp1d, .tmp0d, ._, ._ }, + .{ ._, ._r, .bt, .tmp1d, .src1d, ._, ._ }, + .{ ._, ._s, .bt, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._z, .cmov, .tmp0d, .tmp1d, ._, ._ }, + .{ ._, ._, .mov, .lea(.src0d), .tmp0d, ._, ._ }, + } }, + }, .{ + .required_features = .{ .@"64bit", .cmov, null, null }, + .src_constraints = .{ .{ .ptr_bool_vec = .qword }, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .extra_temps = .{ + .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, + .{ .type = .u64, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0q, .lea(.src0q), ._, ._ }, + .{ ._, ._, .mov, .tmp1q, .tmp0q, ._, ._ }, + .{ ._, ._r, .bt, .tmp1q, .src1q, ._, ._ }, + .{ ._, ._s, .bt, .tmp0q, .src1q, ._, ._ }, + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._z, .cmov, .tmp0q, .tmp1q, ._, ._ }, + .{ ._, ._, .mov, .lea(.src0q), .tmp0q, ._, ._ }, + } }, + }, .{ + 
.required_features = .{ .cmov, null, null, null }, + .src_constraints = .{ .ptr_any_bool_vec, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .extra_temps = .{ + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .{ .type = .u32, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .tmp0d, .src1d, ._, ._ }, + .{ ._, ._r, .sh, .tmp0d, .ui(5), ._, ._ }, + .{ ._, ._, .mov, .tmp1d, .leasi(.src0d, .@"4", .tmp0), ._, ._ }, + .{ ._, ._, .mov, .tmp2d, .tmp1d, ._, ._ }, + .{ ._, ._r, .bt, .tmp2d, .src1d, ._, ._ }, + .{ ._, ._s, .bt, .tmp1d, .src1d, ._, ._ }, + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._z, .cmov, .tmp1d, .tmp2d, ._, ._ }, + .{ ._, ._, .mov, .leasi(.src0d, .@"4", .tmp0), .tmp1d, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .ptr_any_bool_vec, .any, .bool }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .clobbers = .{ .eflags = true }, + .each = .{ .once = &.{ + .{ ._, ._, .@"test", .src2b, .si(1), ._, ._ }, + .{ ._, ._nz, .j, .@"1f", ._, ._, ._ }, + .{ ._, ._r, .bt, .lea(.src0d), .src1d, ._, ._ }, + .{ ._, ._mp, .j, .@"0f", ._, ._, ._ }, + .{ .@"1:", ._s, .bt, .lea(.src0d), .src1d, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .any, .any, .{ .int = .byte } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .imm8 } }, + .{ .src = .{ .to_gpr, .simm32, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leaa(.src0b, .add_src0_elem_size_mul_src1), .src2b, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .any, .any, .{ .int = .byte } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .imm8 } }, + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leai(.src0b, .src1), .src2b, ._, ._ }, + } }, + }, .{ + 
.src_constraints = .{ .any, .any, .{ .int = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .imm16 } }, + .{ .src = .{ .to_gpr, .simm32, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2w, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .any, .any, .{ .int = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .imm16 } }, + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .src2w, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .vp_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ }, + } }, + }, .{ + .required_features = .{ .sse4_1, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .p_w, .extr, .leaa(.src0w, .add_src0_elem_size_mul_src1), .src2x, .ui(0), ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .extra_temps = .{ + .{ .type = .f16, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .each = .{ .once = &.{ + .{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ }, + .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp0w, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .extra_temps = .{ + .{ .type = .f32, .kind = .mem }, + .{ .type = .f16, .kind 
= .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .each = .{ .once = &.{ + .{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ }, + .{ ._, ._, .mov, .tmp1d, .mem(.tmp1d), ._, ._ }, + .{ ._, ._, .mov, .leaa(.src0w, .add_src0_elem_size_mul_src1), .tmp1w, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .vp_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ }, + } }, + }, .{ + .required_features = .{ .sse4_1, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .p_w, .extr, .leasi(.src0w, .@"2", .src1), .src2x, .ui(0), ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .extra_temps = .{ + .{ .type = .f16, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .each = .{ .once = &.{ + .{ ._, .p_w, .extr, .tmp0d, .src2x, .ui(0), ._ }, + .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp0w, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .word } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .extra_temps = .{ + .{ .type = .f32, .kind = .mem }, + .{ .type = .f16, .kind = .{ .rc = .general_purpose } }, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + .unused, + }, + .each = .{ .once = &.{ + .{ ._, ._ss, .mov, .mem(.tmp1d), .src2x, ._, ._ }, + .{ ._, ._, .mov, .tmp1d, 
.mem(.tmp1d), ._, ._ }, + .{ ._, ._, .mov, .leasi(.src0w, .@"2", .src1), .tmp1w, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .any, .any, .{ .int = .dword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .imm32 } }, + .{ .src = .{ .to_gpr, .simm32, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2d, ._, ._ }, + } }, + }, .{ + .src_constraints = .{ .any, .any, .{ .int = .dword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .imm32 } }, + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leasi(.src0d, .@"4", .src1), .src2d, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .dword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .v_ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .dword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, ._ss, .mov, .leaa(.src0d, .add_src0_elem_size_mul_src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .dword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .v_ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .dword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, ._ss, .mov, .leasi(.src0d, .@"4", .src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .@"64bit", null, null, null }, + .src_constraints = .{ .any, .any, .{ .int = .qword } }, + .patterns = 
&.{ + .{ .src = .{ .to_gpr, .simm32, .simm32 } }, + .{ .src = .{ .to_gpr, .simm32, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2q, ._, ._ }, + } }, + }, .{ + .required_features = .{ .@"64bit", null, null, null }, + .src_constraints = .{ .any, .any, .{ .int = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .simm32 } }, + .{ .src = .{ .to_gpr, .to_gpr, .to_gpr } }, + }, + .each = .{ .once = &.{ + .{ ._, ._, .mov, .leasi(.src0q, .@"8", .src1), .src2q, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .v_sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, ._sd, .mov, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .simm32, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, ._ps, .movl, .leaa(.src0q, .add_src0_elem_size_mul_src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .avx, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, .v_sd, .mov, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse2, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, ._sd, .mov, 
.leasi(.src0q, .@"8", .src1), .src2x, ._, ._ }, + } }, + }, .{ + .required_features = .{ .sse, null, null, null }, + .src_constraints = .{ .any, .any, .{ .float = .qword } }, + .patterns = &.{ + .{ .src = .{ .to_gpr, .to_gpr, .to_sse } }, + }, + .each = .{ .once = &.{ + .{ ._, ._ps, .movl, .leasi(.src0q, .@"8", .src1), .src2x, ._, ._ }, + } }, + } }) catch |err| switch (err) { + error.SelectFailed => { + const elem_size = cg.typeOf(bin.rhs).abiSize(zcu); + while (try ops[0].toRegClass(true, .general_purpose, cg) or + try ops[1].toRegClass(true, .general_purpose, cg)) + {} + const base_reg = ops[0].tracking(cg).short.register.to64(); + const rhs_reg = ops[1].tracking(cg).short.register.to64(); + if (!std.math.isPowerOfTwo(elem_size)) { + try cg.spillEflagsIfOccupied(); + try cg.asmRegisterRegisterImmediate( + .{ .i_, .mul }, + rhs_reg, + rhs_reg, + .u(elem_size), + ); + try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{ + .base = .{ .reg = base_reg }, + .mod = .{ .rm = .{ .index = rhs_reg } }, + }); + } else if (elem_size > 8) { + try cg.spillEflagsIfOccupied(); + try cg.asmRegisterImmediate( + .{ ._l, .sh }, + rhs_reg, + .u(std.math.log2_int(u64, elem_size)), + ); + try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{ + .base = .{ .reg = base_reg }, + .mod = .{ .rm = .{ .index = rhs_reg } }, + }); + } else try cg.asmRegisterMemory(.{ ._, .lea }, base_reg, .{ + .base = .{ .reg = base_reg }, + .mod = .{ .rm = .{ + .index = rhs_reg, + .scale = .fromFactor(@intCast(elem_size)), + } }, + }); + try ops[0].store(&ops[2], .{}, cg); + }, + else => |e| return e, + }; + for (ops) |op| try op.die(cg); + }, .work_item_id, .work_group_size, .work_group_id => unreachable, } try cg.resetTemps(@enumFromInt(0));