From add2976a9ba76ec661ae5668eb2a8dca2ccfad42 Mon Sep 17 00:00:00 2001 From: mlugg Date: Mon, 26 May 2025 05:07:13 +0100 Subject: [PATCH] compiler: implement better shuffle AIR Runtime `@shuffle` has two cases which backends generally want to handle differently for efficiency: * One runtime vector operand; some result elements may be comptime-known * Two runtime vector operands; some result elements may be undefined The latter case happens if both vectors given to `@shuffle` are runtime-known and they are both used (i.e. the mask refers to them). Otherwise, if the result is not entirely comptime-known, we are in the former case. `Sema` now differentiates these two cases in the AIR so that backends can easily handle them however they want to. Note that this *doesn't* really involve Sema doing any more work than it would otherwise need to, so there's not really a negative here! Most existing backends have their lowerings for `@shuffle` migrated in this commit. The LLVM backend uses new lowerings suggested by Jacob as ones which it will handle effectively. The x86_64 backend has not yet been migrated; for now there's a panic in there. Jacob will implement that before this is merged anywhere. 
--- src/Air.zig | 131 ++++++++- src/Air/Legalize.zig | 3 +- src/Air/Liveness.zig | 33 ++- src/Air/Liveness/Verify.zig | 13 +- src/Air/types_resolved.zig | 22 +- src/Sema.zig | 266 +++++++++--------- src/Zcu/PerThread.zig | 3 +- src/arch/aarch64/CodeGen.zig | 16 +- src/arch/arm/CodeGen.zig | 15 +- src/arch/riscv64/CodeGen.zig | 15 +- src/arch/sparc64/CodeGen.zig | 3 +- src/arch/wasm/CodeGen.zig | 125 +++++--- src/arch/x86_64/CodeGen.zig | 2 +- src/codegen/c.zig | 74 +++-- src/codegen/llvm.zig | 211 ++++++++++++-- src/codegen/spirv.zig | 76 +++-- src/print_air.zig | 42 ++- ...elected_index_past_first_vector_length.zig | 26 +- 18 files changed, 755 insertions(+), 321 deletions(-) diff --git a/src/Air.zig b/src/Air.zig index 94e3550e79..ccfe9e9694 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -699,9 +699,21 @@ pub const Inst = struct { /// equal to the scalar value. /// Uses the `ty_op` field. splat, - /// Constructs a vector by selecting elements from `a` and `b` based on `mask`. - /// Uses the `ty_pl` field with payload `Shuffle`. - shuffle, + /// Constructs a vector by selecting elements from a single vector based on a mask. Each + /// mask element is either an index into the vector, or a comptime-known value, or "undef". + /// Uses the `ty_pl` field, where the payload index points to: + /// 1. mask_elem: ShuffleOneMask // for each `mask_len`, which comes from `ty_pl.ty` + /// 2. operand: Ref // guaranteed not to be an interned value + /// See `unwrapShuffleOne`. + shuffle_one, + /// Constructs a vector by selecting elements from two vectors based on a mask. Each mask + /// element is either an index into one of the vectors, or "undef". + /// Uses the `ty_pl` field, where the payload index points to: + /// 1. mask_elem: ShuffleTwoMask // for each `mask_len`, which comes from `ty_pl.ty` + /// 2. operand_a: Ref // guaranteed not to be an interned value + /// 3. operand_b: Ref // guaranteed not to be an interned value + /// See `unwrapShuffleTwo`. 
+ shuffle_two, /// Constructs a vector element-wise from `a` or `b` based on `pred`. /// Uses the `pl_op` field with `pred` as operand, and payload `Bin`. select, @@ -1299,13 +1311,6 @@ pub const FieldParentPtr = struct { field_index: u32, }; -pub const Shuffle = struct { - a: Inst.Ref, - b: Inst.Ref, - mask: InternPool.Index, - mask_len: u32, -}; - pub const VectorCmp = struct { lhs: Inst.Ref, rhs: Inst.Ref, @@ -1320,6 +1325,64 @@ pub const VectorCmp = struct { } }; +/// Used by `Inst.Tag.shuffle_one`. Represents a mask element which either indexes into a +/// runtime-known vector, or is a comptime-known value. +pub const ShuffleOneMask = packed struct(u32) { + index: u31, + kind: enum(u1) { elem, value }, + pub fn elem(idx: u32) ShuffleOneMask { + return .{ .index = @intCast(idx), .kind = .elem }; + } + pub fn value(val: Value) ShuffleOneMask { + return .{ .index = @intCast(@intFromEnum(val.toIntern())), .kind = .value }; + } + pub const Unwrapped = union(enum) { + /// The resulting element is this index into the runtime vector. + elem: u32, + /// The resulting element is this comptime-known value. + /// It is correctly typed. It might be `undefined`. + value: InternPool.Index, + }; + pub fn unwrap(raw: ShuffleOneMask) Unwrapped { + return switch (raw.kind) { + .elem => .{ .elem = raw.index }, + .value => .{ .value = @enumFromInt(raw.index) }, + }; + } +}; + +/// Used by `Inst.Tag.shuffle_two`. Represents a mask element which either indexes into one +/// of two runtime-known vectors, or is undefined. +pub const ShuffleTwoMask = enum(u32) { + undef = std.math.maxInt(u32), + _, + pub fn aElem(idx: u32) ShuffleTwoMask { + return @enumFromInt(idx << 1); + } + pub fn bElem(idx: u32) ShuffleTwoMask { + return @enumFromInt(idx << 1 | 1); + } + pub const Unwrapped = union(enum) { + /// The resulting element is this index into the first runtime vector. + a_elem: u32, + /// The resulting element is this index into the second runtime vector. 
+ b_elem: u32, + /// The resulting element is `undefined`. + undef, + }; + pub fn unwrap(raw: ShuffleTwoMask) Unwrapped { + switch (raw) { + .undef => return .undef, + _ => {}, + } + const x = @intFromEnum(raw); + return switch (@as(u1, @truncate(x))) { + 0 => .{ .a_elem = x >> 1 }, + 1 => .{ .b_elem = x >> 1 }, + }; + } +}; + /// Trailing: /// 0. `Inst.Ref` for every outputs_len /// 1. `Inst.Ref` for every inputs_len @@ -1503,7 +1566,6 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .cmpxchg_weak, .cmpxchg_strong, .slice, - .shuffle, .aggregate_init, .union_init, .field_parent_ptr, @@ -1517,6 +1579,8 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .ptr_sub, .try_ptr, .try_ptr_cold, + .shuffle_one, + .shuffle_two, => return datas[@intFromEnum(inst)].ty_pl.ty.toType(), .not, @@ -1903,7 +1967,8 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .reduce, .reduce_optimized, .splat, - .shuffle, + .shuffle_one, + .shuffle_two, .select, .is_named_enum_value, .tag_name, @@ -2030,6 +2095,48 @@ pub fn unwrapSwitch(air: *const Air, switch_inst: Inst.Index) UnwrappedSwitch { }; } +pub fn unwrapShuffleOne(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index) struct { + result_ty: Type, + operand: Inst.Ref, + mask: []const ShuffleOneMask, +} { + const inst = air.instructions.get(@intFromEnum(inst_index)); + switch (inst.tag) { + .shuffle_one => {}, + else => unreachable, // assertion failure + } + const result_ty: Type = .fromInterned(inst.data.ty_pl.ty.toInterned().?); + const mask_len: u32 = result_ty.vectorLen(zcu); + const extra_idx = inst.data.ty_pl.payload; + return .{ + .result_ty = result_ty, + .operand = @enumFromInt(air.extra.items[extra_idx + mask_len]), + .mask = @ptrCast(air.extra.items[extra_idx..][0..mask_len]), + }; +} + +pub fn unwrapShuffleTwo(air: *const Air, zcu: *const Zcu, inst_index: Inst.Index) struct { + result_ty: Type, + operand_a: Inst.Ref, + 
operand_b: Inst.Ref, + mask: []const ShuffleTwoMask, +} { + const inst = air.instructions.get(@intFromEnum(inst_index)); + switch (inst.tag) { + .shuffle_two => {}, + else => unreachable, // assertion failure + } + const result_ty: Type = .fromInterned(inst.data.ty_pl.ty.toInterned().?); + const mask_len: u32 = result_ty.vectorLen(zcu); + const extra_idx = inst.data.ty_pl.payload; + return .{ + .result_ty = result_ty, + .operand_a = @enumFromInt(air.extra.items[extra_idx + mask_len + 0]), + .operand_b = @enumFromInt(air.extra.items[extra_idx + mask_len + 1]), + .mask = @ptrCast(air.extra.items[extra_idx..][0..mask_len]), + }; +} + pub const typesFullyResolved = types_resolved.typesFullyResolved; pub const typeFullyResolved = types_resolved.checkType; pub const valFullyResolved = types_resolved.checkVal; diff --git a/src/Air/Legalize.zig b/src/Air/Legalize.zig index 85db181bd1..b71725995a 100644 --- a/src/Air/Legalize.zig +++ b/src/Air/Legalize.zig @@ -521,7 +521,8 @@ fn legalizeBody(l: *Legalize, body_start: usize, body_len: usize) Error!void { } }, .splat, - .shuffle, + .shuffle_one, + .shuffle_two, => {}, .select, => if (l.features.contains(.scalarize_select)) continue :inst try l.scalarize(inst, .select_pl_op_bin), diff --git a/src/Air/Liveness.zig b/src/Air/Liveness.zig index 34ecde26e2..7acba48ed0 100644 --- a/src/Air/Liveness.zig +++ b/src/Air/Liveness.zig @@ -15,6 +15,7 @@ const Liveness = @This(); const trace = @import("../tracy.zig").trace; const Air = @import("../Air.zig"); const InternPool = @import("../InternPool.zig"); +const Zcu = @import("../Zcu.zig"); pub const Verify = @import("Liveness/Verify.zig"); @@ -136,12 +137,15 @@ fn LivenessPassData(comptime pass: LivenessPass) type { }; } -pub fn analyze(gpa: Allocator, air: Air, intern_pool: *InternPool) Allocator.Error!Liveness { +pub fn analyze(zcu: *Zcu, air: Air, intern_pool: *InternPool) Allocator.Error!Liveness { const tracy = trace(@src()); defer tracy.end(); + const gpa = zcu.gpa; + var a: 
Analysis = .{ .gpa = gpa, + .zcu = zcu, .air = air, .tomb_bits = try gpa.alloc( usize, @@ -220,6 +224,7 @@ const OperandCategory = enum { pub fn categorizeOperand( l: Liveness, air: Air, + zcu: *Zcu, inst: Air.Inst.Index, operand: Air.Inst.Index, ip: *const InternPool, @@ -511,10 +516,15 @@ pub fn categorizeOperand( if (extra.rhs == operand_ref) return matchOperandSmallIndex(l, inst, 2, .none); return .none; }, - .shuffle => { - const extra = air.extraData(Air.Shuffle, air_datas[@intFromEnum(inst)].ty_pl.payload).data; - if (extra.a == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); - if (extra.b == operand_ref) return matchOperandSmallIndex(l, inst, 1, .none); + .shuffle_one => { + const unwrapped = air.unwrapShuffleOne(zcu, inst); + if (unwrapped.operand == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); + return .none; + }, + .shuffle_two => { + const unwrapped = air.unwrapShuffleTwo(zcu, inst); + if (unwrapped.operand_a == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); + if (unwrapped.operand_b == operand_ref) return matchOperandSmallIndex(l, inst, 1, .none); return .none; }, .reduce, .reduce_optimized => { @@ -639,7 +649,7 @@ pub fn categorizeOperand( var operand_live: bool = true; for (&[_]Air.Inst.Index{ then_body[0], else_body[0] }) |cond_inst| { - if (l.categorizeOperand(air, cond_inst, operand, ip) == .tomb) + if (l.categorizeOperand(air, zcu, cond_inst, operand, ip) == .tomb) operand_live = false; switch (air_tags[@intFromEnum(cond_inst)]) { @@ -824,6 +834,7 @@ pub const BigTomb = struct { /// In-progress data; on successful analysis converted into `Liveness`. 
const Analysis = struct { gpa: Allocator, + zcu: *Zcu, air: Air, intern_pool: *InternPool, tomb_bits: []usize, @@ -1119,9 +1130,13 @@ fn analyzeInst( const extra = a.air.extraData(Air.Bin, pl_op.payload).data; return analyzeOperands(a, pass, data, inst, .{ pl_op.operand, extra.lhs, extra.rhs }); }, - .shuffle => { - const extra = a.air.extraData(Air.Shuffle, inst_datas[@intFromEnum(inst)].ty_pl.payload).data; - return analyzeOperands(a, pass, data, inst, .{ extra.a, extra.b, .none }); + .shuffle_one => { + const unwrapped = a.air.unwrapShuffleOne(a.zcu, inst); + return analyzeOperands(a, pass, data, inst, .{ unwrapped.operand, .none, .none }); + }, + .shuffle_two => { + const unwrapped = a.air.unwrapShuffleTwo(a.zcu, inst); + return analyzeOperands(a, pass, data, inst, .{ unwrapped.operand_a, unwrapped.operand_b, .none }); }, .reduce, .reduce_optimized => { const reduce = inst_datas[@intFromEnum(inst)].reduce; diff --git a/src/Air/Liveness/Verify.zig b/src/Air/Liveness/Verify.zig index e7ed37956d..4ad24cf924 100644 --- a/src/Air/Liveness/Verify.zig +++ b/src/Air/Liveness/Verify.zig @@ -1,6 +1,7 @@ //! Verifies that Liveness information is valid. 
gpa: std.mem.Allocator, +zcu: *Zcu, air: Air, liveness: Liveness, live: LiveMap = .{}, @@ -287,10 +288,13 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { const extra = self.air.extraData(Air.Bin, ty_pl.payload).data; try self.verifyInstOperands(inst, .{ extra.lhs, extra.rhs, .none }); }, - .shuffle => { - const ty_pl = data[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - try self.verifyInstOperands(inst, .{ extra.a, extra.b, .none }); + .shuffle_one => { + const unwrapped = self.air.unwrapShuffleOne(self.zcu, inst); + try self.verifyInstOperands(inst, .{ unwrapped.operand, .none, .none }); + }, + .shuffle_two => { + const unwrapped = self.air.unwrapShuffleTwo(self.zcu, inst); + try self.verifyInstOperands(inst, .{ unwrapped.operand_a, unwrapped.operand_b, .none }); }, .cmp_vector, .cmp_vector_optimized, @@ -639,4 +643,5 @@ const log = std.log.scoped(.liveness_verify); const Air = @import("../../Air.zig"); const Liveness = @import("../Liveness.zig"); const InternPool = @import("../../InternPool.zig"); +const Zcu = @import("../../Zcu.zig"); const Verify = @This(); diff --git a/src/Air/types_resolved.zig b/src/Air/types_resolved.zig index 873f70ec50..eb17402ebe 100644 --- a/src/Air/types_resolved.zig +++ b/src/Air/types_resolved.zig @@ -249,12 +249,22 @@ fn checkBody(air: Air, body: []const Air.Inst.Index, zcu: *Zcu) bool { if (!checkRef(extra.struct_operand, zcu)) return false; }, - .shuffle => { - const extra = air.extraData(Air.Shuffle, data.ty_pl.payload).data; - if (!checkType(data.ty_pl.ty.toType(), zcu)) return false; - if (!checkRef(extra.a, zcu)) return false; - if (!checkRef(extra.b, zcu)) return false; - if (!checkVal(Value.fromInterned(extra.mask), zcu)) return false; + .shuffle_one => { + const unwrapped = air.unwrapShuffleOne(zcu, inst); + if (!checkType(unwrapped.result_ty, zcu)) return false; + if (!checkRef(unwrapped.operand, zcu)) return false; + for (unwrapped.mask) |m| 
switch (m.unwrap()) { + .elem => {}, + .value => |val| if (!checkVal(.fromInterned(val), zcu)) return false, + }; + }, + + .shuffle_two => { + const unwrapped = air.unwrapShuffleTwo(zcu, inst); + if (!checkType(unwrapped.result_ty, zcu)) return false; + if (!checkRef(unwrapped.operand_a, zcu)) return false; + if (!checkRef(unwrapped.operand_b, zcu)) return false; + // No values to check because there are no comptime-known values other than undef }, .cmpxchg_weak, diff --git a/src/Sema.zig b/src/Sema.zig index 34fb468716..3c4fc555cb 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -24256,8 +24256,8 @@ fn analyzeShuffle( block: *Block, src_node: std.zig.Ast.Node.Offset, elem_ty: Type, - a_arg: Air.Inst.Ref, - b_arg: Air.Inst.Ref, + a_uncoerced: Air.Inst.Ref, + b_uncoerced: Air.Inst.Ref, mask: Value, mask_len: u32, ) CompileError!Air.Inst.Ref { @@ -24266,150 +24266,154 @@ fn analyzeShuffle( const a_src = block.builtinCallArgSrc(src_node, 1); const b_src = block.builtinCallArgSrc(src_node, 2); const mask_src = block.builtinCallArgSrc(src_node, 3); - var a = a_arg; - var b = b_arg; - const res_ty = try pt.vectorType(.{ - .len = mask_len, - .child = elem_ty.toIntern(), - }); - - const maybe_a_len = switch (sema.typeOf(a).zigTypeTag(zcu)) { - .array, .vector => sema.typeOf(a).arrayLen(zcu), - .undefined => null, - else => return sema.fail(block, a_src, "expected vector or array with element type '{}', found '{}'", .{ - elem_ty.fmt(pt), - sema.typeOf(a).fmt(pt), - }), + // If the type of `a` is `@Type(.undefined)`, i.e. the argument is untyped, this is 0, because it is an error to index into this vector. 
+ const a_len: u32 = switch (sema.typeOf(a_uncoerced).zigTypeTag(zcu)) { + .array, .vector => @intCast(sema.typeOf(a_uncoerced).arrayLen(zcu)), + .undefined => 0, + else => return sema.fail(block, a_src, "expected vector of '{}', found '{}'", .{ elem_ty.fmt(pt), sema.typeOf(a_uncoerced).fmt(pt) }), }; - const maybe_b_len = switch (sema.typeOf(b).zigTypeTag(zcu)) { - .array, .vector => sema.typeOf(b).arrayLen(zcu), - .undefined => null, - else => return sema.fail(block, b_src, "expected vector or array with element type '{}', found '{}'", .{ - elem_ty.fmt(pt), - sema.typeOf(b).fmt(pt), - }), + const a_ty = try pt.vectorType(.{ .len = a_len, .child = elem_ty.toIntern() }); + const a_coerced = try sema.coerce(block, a_ty, a_uncoerced, a_src); + + // If the type of `b` is `@Type(.undefined)`, i.e. the argument is untyped, this is 0, because it is an error to index into this vector. + const b_len: u32 = switch (sema.typeOf(b_uncoerced).zigTypeTag(zcu)) { + .array, .vector => @intCast(sema.typeOf(b_uncoerced).arrayLen(zcu)), + .undefined => 0, + else => return sema.fail(block, b_src, "expected vector of '{}', found '{}'", .{ elem_ty.fmt(pt), sema.typeOf(b_uncoerced).fmt(pt) }), }; - if (maybe_a_len == null and maybe_b_len == null) { - return pt.undefRef(res_ty); - } - const a_len: u32 = @intCast(maybe_a_len orelse maybe_b_len.?); - const b_len: u32 = @intCast(maybe_b_len orelse a_len); + const b_ty = try pt.vectorType(.{ .len = b_len, .child = elem_ty.toIntern() }); + const b_coerced = try sema.coerce(block, b_ty, b_uncoerced, b_src); - const a_ty = try pt.vectorType(.{ - .len = a_len, - .child = elem_ty.toIntern(), - }); - const b_ty = try pt.vectorType(.{ - .len = b_len, - .child = elem_ty.toIntern(), - }); + const result_ty = try pt.vectorType(.{ .len = mask_len, .child = elem_ty.toIntern() }); - if (maybe_a_len == null) a = try pt.undefRef(a_ty) else a = try sema.coerce(block, a_ty, a, a_src); - if (maybe_b_len == null) b = try pt.undefRef(b_ty) else b = try 
sema.coerce(block, b_ty, b, b_src); + // We're going to pre-emptively reserve space in `sema.air_extra`. The reason for this is we need + // a `u32` buffer of length `mask_len` anyway, and putting it in `sema.air_extra` avoids a copy + // in the runtime case. If the result is comptime-known, we'll shrink `air_extra` back. + const air_extra_idx: u32 = @intCast(sema.air_extra.items.len); + const air_mask_buf = try sema.air_extra.addManyAsSlice(sema.gpa, mask_len); - const operand_info = [2]std.meta.Tuple(&.{ u64, LazySrcLoc, Type }){ - .{ a_len, a_src, a_ty }, - .{ b_len, b_src, b_ty }, - }; + // We want to interpret that buffer in `air_extra` in a few ways. Initially, we'll consider its + // elements as `Air.ShuffleTwoMask`, essentially representing the raw mask values; then, we'll + // convert it to `InternPool.Index` or `Air.ShuffleOneMask` if there are comptime-known operands. + const mask_ip_index: []InternPool.Index = @ptrCast(air_mask_buf); + const mask_shuffle_one: []Air.ShuffleOneMask = @ptrCast(air_mask_buf); + const mask_shuffle_two: []Air.ShuffleTwoMask = @ptrCast(air_mask_buf); - for (0..@intCast(mask_len)) |i| { - const elem = try mask.elemValue(pt, i); - if (elem.isUndef(zcu)) continue; - const elem_resolved = try sema.resolveLazyValue(elem); - const int = elem_resolved.toSignedInt(zcu); - var unsigned: u32 = undefined; - var chosen: u32 = undefined; - if (int >= 0) { - unsigned = @intCast(int); - chosen = 0; - } else { - unsigned = @intCast(~int); - chosen = 1; + // Initial loop: check mask elements, populate `mask_shuffle_two`. 
+ var a_used = false; + var b_used = false; + for (mask_shuffle_two, 0..mask_len) |*out, mask_idx| { + const mask_val = try mask.elemValue(pt, mask_idx); + if (mask_val.isUndef(zcu)) { + out.* = .undef; + continue; } - if (unsigned >= operand_info[chosen][0]) { - const msg = msg: { - const msg = try sema.errMsg(mask_src, "mask index '{d}' has out-of-bounds selection", .{i}); + // Safe because mask elements are `i32` and we already checked for undef: + const raw = (try sema.resolveLazyValue(mask_val)).toSignedInt(zcu); + if (raw >= 0) { + const idx: u32 = @intCast(raw); + a_used = true; + out.* = .aElem(idx); + if (idx >= a_len) return sema.failWithOwnedErrorMsg(block, msg: { + const msg = try sema.errMsg(mask_src, "mask element at index '{d}' selects out-of-bounds index", .{mask_idx}); errdefer msg.destroy(sema.gpa); - - try sema.errNote(operand_info[chosen][1], msg, "selected index '{d}' out of bounds of '{}'", .{ - unsigned, - operand_info[chosen][2].fmt(pt), - }); - - if (chosen == 0) { - try sema.errNote(b_src, msg, "selections from the second vector are specified with negative numbers", .{}); + try sema.errNote(a_src, msg, "index '{d}' exceeds bounds of '{}' given here", .{ idx, a_ty.fmt(pt) }); + if (idx < b_len) { + try sema.errNote(b_src, msg, "use '~@as(u32, {d})' to index into second vector given here", .{idx}); } - break :msg msg; - }; - return sema.failWithOwnedErrorMsg(block, msg); - } - } - - if (try sema.resolveValue(a)) |a_val| { - if (try sema.resolveValue(b)) |b_val| { - const values = try sema.arena.alloc(InternPool.Index, mask_len); - for (values, 0..) 
|*value, i| { - const mask_elem_val = try mask.elemValue(pt, i); - if (mask_elem_val.isUndef(zcu)) { - value.* = try pt.intern(.{ .undef = elem_ty.toIntern() }); - continue; - } - const int = mask_elem_val.toSignedInt(zcu); - const unsigned: u32 = @intCast(if (int >= 0) int else ~int); - values[i] = (try (if (int >= 0) a_val else b_val).elemValue(pt, unsigned)).toIntern(); - } - return Air.internedToRef((try pt.intern(.{ .aggregate = .{ - .ty = res_ty.toIntern(), - .storage = .{ .elems = values }, - } }))); - } - } - - // All static analysis passed, and not comptime. - // For runtime codegen, vectors a and b must be the same length. Here we - // recursively @shuffle the smaller vector to append undefined elements - // to it up to the length of the longer vector. This recursion terminates - // in 1 call because these calls to analyzeShuffle guarantee a_len == b_len. - if (a_len != b_len) { - const min_len = @min(a_len, b_len); - const max_src = if (a_len > b_len) a_src else b_src; - const max_len = try sema.usizeCast(block, max_src, @max(a_len, b_len)); - - const expand_mask_values = try sema.arena.alloc(InternPool.Index, max_len); - for (@intCast(0)..@intCast(min_len)) |i| { - expand_mask_values[i] = (try pt.intValue(.comptime_int, i)).toIntern(); - } - for (@intCast(min_len)..@intCast(max_len)) |i| { - expand_mask_values[i] = .negative_one; - } - const expand_mask = try pt.intern(.{ .aggregate = .{ - .ty = (try pt.vectorType(.{ .len = @intCast(max_len), .child = .comptime_int_type })).toIntern(), - .storage = .{ .elems = expand_mask_values }, - } }); - - if (a_len < b_len) { - const undef = try pt.undefRef(a_ty); - a = try sema.analyzeShuffle(block, src_node, elem_ty, a, undef, Value.fromInterned(expand_mask), @intCast(max_len)); + }); } else { - const undef = try pt.undefRef(b_ty); - b = try sema.analyzeShuffle(block, src_node, elem_ty, b, undef, Value.fromInterned(expand_mask), @intCast(max_len)); + const idx: u32 = @intCast(~raw); + b_used = true; + out.* = 
.bElem(idx); + if (idx >= b_len) return sema.failWithOwnedErrorMsg(block, msg: { + const msg = try sema.errMsg(mask_src, "mask element at index '{d}' selects out-of-bounds index", .{mask_idx}); + errdefer msg.destroy(sema.gpa); + try sema.errNote(b_src, msg, "index '{d}' exceeds bounds of '{}' given here", .{ idx, b_ty.fmt(pt) }); + break :msg msg; + }); } } - return block.addInst(.{ - .tag = .shuffle, - .data = .{ .ty_pl = .{ - .ty = Air.internedToRef(res_ty.toIntern()), - .payload = try block.sema.addExtra(Air.Shuffle{ - .a = a, - .b = b, - .mask = mask.toIntern(), - .mask_len = mask_len, - }), - } }, - }); + const maybe_a_val = try sema.resolveValue(a_coerced); + const maybe_b_val = try sema.resolveValue(b_coerced); + + const a_rt = a_used and maybe_a_val == null; + const b_rt = b_used and maybe_b_val == null; + + if (a_rt and b_rt) { + // Both operands are needed and runtime-known. We need a `[]ShuffleTwoMask`... which is + // exactly what we already have in `mask_shuffle_two`! So, we're basically done already. + // We just need to append the two operands. + try sema.air_extra.ensureUnusedCapacity(sema.gpa, 2); + sema.appendRefsAssumeCapacity(&.{ a_coerced, b_coerced }); + return block.addInst(.{ + .tag = .shuffle_two, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(result_ty.toIntern()), + .payload = air_extra_idx, + } }, + }); + } else if (a_rt) { + // We need to convert the `ShuffleTwoMask` values to `ShuffleOneMask`. + for (mask_shuffle_two, mask_shuffle_one) |in, *out| { + out.* = switch (in.unwrap()) { + .undef => .value(try pt.undefValue(elem_ty)), + .a_elem => |idx| .elem(idx), + .b_elem => |idx| .value(try maybe_b_val.?.elemValue(pt, idx)), + }; + } + // Now just append our single runtime operand, and we're done. 
+ try sema.air_extra.ensureUnusedCapacity(sema.gpa, 1); + sema.appendRefsAssumeCapacity(&.{a_coerced}); + return block.addInst(.{ + .tag = .shuffle_one, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(result_ty.toIntern()), + .payload = air_extra_idx, + } }, + }); + } else if (b_rt) { + // We need to convert the `ShuffleTwoMask` values to `ShuffleOneMask`. + for (mask_shuffle_two, mask_shuffle_one) |in, *out| { + out.* = switch (in.unwrap()) { + .undef => .value(try pt.undefValue(elem_ty)), + .a_elem => |idx| .value(try maybe_a_val.?.elemValue(pt, idx)), + .b_elem => |idx| .elem(idx), + }; + } + // Now just append our single runtime operand, and we're done. + try sema.air_extra.ensureUnusedCapacity(sema.gpa, 1); + sema.appendRefsAssumeCapacity(&.{b_coerced}); + return block.addInst(.{ + .tag = .shuffle_one, + .data = .{ .ty_pl = .{ + .ty = Air.internedToRef(result_ty.toIntern()), + .payload = air_extra_idx, + } }, + }); + } else { + // The result will be comptime-known. We must convert the `ShuffleTwoMask` values to + // `InternPool.Index` values using the known operands. + for (mask_shuffle_two, mask_ip_index) |in, *out| { + const val: Value = switch (in.unwrap()) { + .undef => try pt.undefValue(elem_ty), + .a_elem => |idx| try maybe_a_val.?.elemValue(pt, idx), + .b_elem => |idx| try maybe_b_val.?.elemValue(pt, idx), + }; + out.* = val.toIntern(); + } + const res = try pt.intern(.{ .aggregate = .{ + .ty = result_ty.toIntern(), + .storage = .{ .elems = mask_ip_index }, + } }); + // We have a comptime-known result, so didn't need `air_mask_buf` -- remove it from `sema.air_extra`. 
+ assert(sema.air_extra.items.len == air_extra_idx + air_mask_buf.len); + sema.air_extra.shrinkRetainingCapacity(air_extra_idx); + return Air.internedToRef(res); + } } fn zirSelect(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData) CompileError!Air.Inst.Ref { diff --git a/src/Zcu/PerThread.zig b/src/Zcu/PerThread.zig index f8ae173b75..8e3d07627f 100644 --- a/src/Zcu/PerThread.zig +++ b/src/Zcu/PerThread.zig @@ -1745,7 +1745,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A try air.legalize(pt, @import("../codegen.zig").legalizeFeatures(pt, nav_index) orelse break :legalize); } - var liveness = try Air.Liveness.analyze(gpa, air.*, ip); + var liveness = try Air.Liveness.analyze(zcu, air.*, ip); defer liveness.deinit(gpa); if (build_options.enable_debug_extensions and comp.verbose_air) { @@ -1757,6 +1757,7 @@ pub fn linkerUpdateFunc(pt: Zcu.PerThread, func_index: InternPool.Index, air: *A if (std.debug.runtime_safety) { var verify: Air.Liveness.Verify = .{ .gpa = gpa, + .zcu = zcu, .air = air.*, .liveness = liveness, .intern_pool = ip, diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index e9e7159938..c01fa24ecc 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -778,7 +778,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => try self.airSelect(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .reduce => try self.airReduce(inst), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), @@ -6049,11 +6050,14 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) InnerError!void { return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } -fn airShuffle(self: *Self, inst: Air.Inst.Index) InnerError!void { - 
const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airShuffle for {}", .{self.target.cpu.arch}); - return self.finishAir(inst, result, .{ extra.a, extra.b, .none }); +fn airShuffleOne(self: *Self, inst: Air.Inst.Index) InnerError!void { + _ = inst; + return self.fail("TODO implement airShuffleOne for {}", .{self.target.cpu.arch}); +} + +fn airShuffleTwo(self: *Self, inst: Air.Inst.Index) InnerError!void { + _ = inst; + return self.fail("TODO implement airShuffleTwo for {}", .{self.target.cpu.arch}); } fn airReduce(self: *Self, inst: Air.Inst.Index) InnerError!void { diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index 8cc1d0a607..d687c74c15 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -767,7 +767,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => try self.airSelect(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .reduce => try self.airReduce(inst), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), @@ -6021,10 +6022,14 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } -fn airShuffle(self: *Self, inst: Air.Inst.Index) !void { - const ty_op = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_op; - const result: MCValue = if (self.liveness.isUnused(inst)) .dead else return self.fail("TODO implement airShuffle for arm", .{}); - return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); +fn airShuffleOne(self: *Self, inst: Air.Inst.Index) !void { + _ = inst; + return 
self.fail("TODO implement airShuffleOne for arm", .{}); +} + +fn airShuffleTwo(self: *Self, inst: Air.Inst.Index) !void { + _ = inst; + return self.fail("TODO implement airShuffleTwo for arm", .{}); } fn airReduce(self: *Self, inst: Air.Inst.Index) !void { diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 8a40c61cdc..1d17d34189 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -1586,7 +1586,8 @@ fn genBody(func: *Func, body: []const Air.Inst.Index) InnerError!void { .error_name => try func.airErrorName(inst), .splat => try func.airSplat(inst), .select => try func.airSelect(inst), - .shuffle => try func.airShuffle(inst), + .shuffle_one => try func.airShuffleOne(inst), + .shuffle_two => try func.airShuffleTwo(inst), .reduce => try func.airReduce(inst), .aggregate_init => try func.airAggregateInit(inst), .union_init => try func.airUnionInit(inst), @@ -8030,10 +8031,14 @@ fn airSelect(func: *Func, inst: Air.Inst.Index) !void { return func.finishAir(inst, result, .{ pl_op.operand, extra.lhs, extra.rhs }); } -fn airShuffle(func: *Func, inst: Air.Inst.Index) !void { - const ty_op = func.air.instructions.items(.data)[@intFromEnum(inst)].ty_op; - const result: MCValue = if (func.liveness.isUnused(inst)) .unreach else return func.fail("TODO implement airShuffle for riscv64", .{}); - return func.finishAir(inst, result, .{ ty_op.operand, .none, .none }); +fn airShuffleOne(func: *Func, inst: Air.Inst.Index) !void { + _ = inst; + return func.fail("TODO implement airShuffleOne for riscv64", .{}); +} + +fn airShuffleTwo(func: *Func, inst: Air.Inst.Index) !void { + _ = inst; + return func.fail("TODO implement airShuffleTwo for riscv64", .{}); } fn airReduce(func: *Func, inst: Air.Inst.Index) !void { diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index d473222288..439e5e6dbb 100644 --- a/src/arch/sparc64/CodeGen.zig +++ b/src/arch/sparc64/CodeGen.zig @@ -621,7 +621,8 @@ fn genBody(self: *Self, body: 
[]const Air.Inst.Index) InnerError!void { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => @panic("TODO try self.airSelect(inst)"), - .shuffle => @panic("TODO try self.airShuffle(inst)"), + .shuffle_one => @panic("TODO try self.airShuffleOne(inst)"), + .shuffle_two => @panic("TODO try self.airShuffleTwo(inst)"), .reduce => @panic("TODO try self.airReduce(inst)"), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 36908eb236..ebc46179c3 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -2004,7 +2004,8 @@ fn genInst(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { .ret_load => cg.airRetLoad(inst), .splat => cg.airSplat(inst), .select => cg.airSelect(inst), - .shuffle => cg.airShuffle(inst), + .shuffle_one => cg.airShuffleOne(inst), + .shuffle_two => cg.airShuffleTwo(inst), .reduce => cg.airReduce(inst), .aggregate_init => cg.airAggregateInit(inst), .union_init => cg.airUnionInit(inst), @@ -5177,66 +5178,100 @@ fn airSelect(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { return cg.fail("TODO: Implement wasm airSelect", .{}); } -fn airShuffle(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { +fn airShuffleOne(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { const pt = cg.pt; const zcu = pt.zcu; - const inst_ty = cg.typeOfIndex(inst); - const ty_pl = cg.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = cg.air.extraData(Air.Shuffle, ty_pl.payload).data; - const a = try cg.resolveInst(extra.a); - const b = try cg.resolveInst(extra.b); - const mask = Value.fromInterned(extra.mask); - const mask_len = extra.mask_len; + const unwrapped = cg.air.unwrapShuffleOne(zcu, inst); + const result_ty = unwrapped.result_ty; + const mask = unwrapped.mask; + const operand = try cg.resolveInst(unwrapped.operand); - const child_ty = inst_ty.childType(zcu); - const 
elem_size = child_ty.abiSize(zcu); + const elem_ty = result_ty.childType(zcu); + const elem_size = elem_ty.abiSize(zcu); - // TODO: One of them could be by ref; handle in loop - if (isByRef(cg.typeOf(extra.a), zcu, cg.target) or isByRef(inst_ty, zcu, cg.target)) { - const result = try cg.allocStack(inst_ty); + // TODO: this function could have an `i8x16_shuffle` fast path like `airShuffleTwo` if we were + // to lower the comptime-known operands to a non-by-ref vector value. - for (0..mask_len) |index| { - const value = (try mask.elemValue(pt, index)).toSignedInt(zcu); + // TODO: this is incorrect if either the operand or the result is *not* by-ref, which is possible. + // I tried to fix it, but I couldn't make much sense of how this backend handles memory. - try cg.emitWValue(result); + const dest_alloc = try cg.allocStack(result_ty); + for (mask, 0..) |mask_elem, out_idx| { + try cg.emitWValue(dest_alloc); + const elem_val = switch (mask_elem.unwrap()) { + .elem => |idx| try cg.load(operand, elem_ty, @intCast(elem_size * idx)), + .value => |val| try cg.lowerConstant(.fromInterned(val), elem_ty), + }; + try cg.store(.stack, elem_val, elem_ty, @intCast(dest_alloc.offset() + elem_size * out_idx)); + } + return cg.finishAir(inst, dest_alloc, &.{unwrapped.operand}); +} - const loaded = if (value >= 0) - try cg.load(a, child_ty, @as(u32, @intCast(@as(i64, @intCast(elem_size)) * value))) - else - try cg.load(b, child_ty, @as(u32, @intCast(@as(i64, @intCast(elem_size)) * ~value))); +fn airShuffleTwo(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { + const pt = cg.pt; + const zcu = pt.zcu; - try cg.store(.stack, loaded, child_ty, result.stack_offset.value + @as(u32, @intCast(elem_size)) * @as(u32, @intCast(index))); - } + const unwrapped = cg.air.unwrapShuffleTwo(zcu, inst); + const result_ty = unwrapped.result_ty; + const mask = unwrapped.mask; + const operand_a = try cg.resolveInst(unwrapped.operand_a); + const operand_b = try cg.resolveInst(unwrapped.operand_b); - 
return cg.finishAir(inst, result, &.{ extra.a, extra.b }); - } else { - var operands = [_]u32{ - @intFromEnum(std.wasm.SimdOpcode.i8x16_shuffle), - } ++ [1]u32{undefined} ** 4; + const a_ty = cg.typeOf(unwrapped.operand_a); + const b_ty = cg.typeOf(unwrapped.operand_b); + const elem_ty = result_ty.childType(zcu); + const elem_size = elem_ty.abiSize(zcu); - var lanes = mem.asBytes(operands[1..]); - for (0..@as(usize, @intCast(mask_len))) |index| { - const mask_elem = (try mask.elemValue(pt, index)).toSignedInt(zcu); - const base_index = if (mask_elem >= 0) - @as(u8, @intCast(@as(i64, @intCast(elem_size)) * mask_elem)) - else - 16 + @as(u8, @intCast(@as(i64, @intCast(elem_size)) * ~mask_elem)); - - for (0..@as(usize, @intCast(elem_size))) |byte_offset| { - lanes[index * @as(usize, @intCast(elem_size)) + byte_offset] = base_index + @as(u8, @intCast(byte_offset)); + // WASM has `i8x16_shuffle`, which we can apply if the element type bit size is a multiple of 8 + // and the input and output vectors have a bit size of 128 (and are hence not by-ref). Otherwise, + // we fall back to a naive loop lowering. + if (!isByRef(a_ty, zcu, cg.target) and + !isByRef(b_ty, zcu, cg.target) and + !isByRef(result_ty, zcu, cg.target) and + elem_ty.bitSize(zcu) % 8 == 0) + { + var lane_map: [16]u8 align(4) = undefined; + const lanes_per_elem = elem_ty.bitSize(zcu) / 8; + for (mask, 0..) |mask_elem, out_idx| { + const out_first_lane = out_idx * lanes_per_elem; + const in_first_lane = switch (mask_elem.unwrap()) { + .a_elem => |i| i * lanes_per_elem, + .b_elem => |i| i * lanes_per_elem + 16, + .undef => 0, // doesn't matter + }; + for (lane_map[out_first_lane..][0..lanes_per_elem], in_first_lane..) 
|*out, in| { + out.* = @intCast(in); } } - - try cg.emitWValue(a); - try cg.emitWValue(b); - + try cg.emitWValue(operand_a); + try cg.emitWValue(operand_b); const extra_index = cg.extraLen(); - try cg.mir_extra.appendSlice(cg.gpa, &operands); + try cg.mir_extra.appendSlice(cg.gpa, &.{ + @intFromEnum(std.wasm.SimdOpcode.i8x16_shuffle), + @bitCast(lane_map[0..4].*), + @bitCast(lane_map[4..8].*), + @bitCast(lane_map[8..12].*), + @bitCast(lane_map[12..].*), + }); try cg.addInst(.{ .tag = .simd_prefix, .data = .{ .payload = extra_index } }); - - return cg.finishAir(inst, .stack, &.{ extra.a, extra.b }); + return cg.finishAir(inst, .stack, &.{ unwrapped.operand_a, unwrapped.operand_b }); } + + // TODO: this is incorrect if either operand or the result is *not* by-ref, which is possible. + // I tried to fix it, but I couldn't make much sense of how this backend handles memory. + + const dest_alloc = try cg.allocStack(result_ty); + for (mask, 0..) |mask_elem, out_idx| { + try cg.emitWValue(dest_alloc); + const elem_val = switch (mask_elem.unwrap()) { + .a_elem => |idx| try cg.load(operand_a, elem_ty, @intCast(elem_size * idx)), + .b_elem => |idx| try cg.load(operand_b, elem_ty, @intCast(elem_size * idx)), + .undef => try cg.emitUndefined(elem_ty), + }; + try cg.store(.stack, elem_val, elem_ty, @intCast(dest_alloc.offset() + elem_size * out_idx)); + } + return cg.finishAir(inst, dest_alloc, &.{ unwrapped.operand_a, unwrapped.operand_b }); } fn airReduce(cg: *CodeGen, inst: Air.Inst.Index) InnerError!void { diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index f943b7f415..f6d8d61adc 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2490,7 +2490,7 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { switch (air_tags[@intFromEnum(inst)]) { // zig fmt: off .select => try cg.airSelect(inst), - .shuffle => try cg.airShuffle(inst), + .shuffle_one, .shuffle_two => @panic("x86_64 TODO: 
shuffle_one/shuffle_two"), // zig fmt: on .arg => if (cg.debug_output != .none) { diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 8d947ce56a..c68abc06ce 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -3374,7 +3374,8 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, .error_name => try airErrorName(f, inst), .splat => try airSplat(f, inst), .select => try airSelect(f, inst), - .shuffle => try airShuffle(f, inst), + .shuffle_one => try airShuffleOne(f, inst), + .shuffle_two => try airShuffleTwo(f, inst), .reduce => try airReduce(f, inst), .aggregate_init => try airAggregateInit(f, inst), .union_init => try airUnionInit(f, inst), @@ -7163,34 +7164,73 @@ fn airSelect(f: *Function, inst: Air.Inst.Index) !CValue { return local; } -fn airShuffle(f: *Function, inst: Air.Inst.Index) !CValue { +fn airShuffleOne(f: *Function, inst: Air.Inst.Index) !CValue { const pt = f.object.dg.pt; const zcu = pt.zcu; - const ty_pl = f.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = f.air.extraData(Air.Shuffle, ty_pl.payload).data; - const mask = Value.fromInterned(extra.mask); - const lhs = try f.resolveInst(extra.a); - const rhs = try f.resolveInst(extra.b); - - const inst_ty = f.typeOfIndex(inst); + const unwrapped = f.air.unwrapShuffleOne(zcu, inst); + const mask = unwrapped.mask; + const operand = try f.resolveInst(unwrapped.operand); + const inst_ty = unwrapped.result_ty; const writer = f.object.writer(); const local = try f.allocLocal(inst, inst_ty); - try reap(f, inst, &.{ extra.a, extra.b }); // local cannot alias operands - for (0..extra.mask_len) |index| { + try reap(f, inst, &.{unwrapped.operand}); // local cannot alias operand + for (mask, 0..) 
|mask_elem, out_idx| { try f.writeCValue(writer, local, .Other); try writer.writeByte('['); - try f.object.dg.renderValue(writer, try pt.intValue(.usize, index), .Other); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, out_idx), .Other); try writer.writeAll("] = "); + switch (mask_elem.unwrap()) { + .elem => |src_idx| { + try f.writeCValue(writer, operand, .Other); + try writer.writeByte('['); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other); + try writer.writeByte(']'); + }, + .value => |val| try f.object.dg.renderValue(writer, .fromInterned(val), .Other), + } + try writer.writeAll(";\n"); + } - const mask_elem = (try mask.elemValue(pt, index)).toSignedInt(zcu); - const src_val = try pt.intValue(.usize, @as(u64, @intCast(mask_elem ^ mask_elem >> 63))); + return local; +} - try f.writeCValue(writer, if (mask_elem >= 0) lhs else rhs, .Other); +fn airShuffleTwo(f: *Function, inst: Air.Inst.Index) !CValue { + const pt = f.object.dg.pt; + const zcu = pt.zcu; + + const unwrapped = f.air.unwrapShuffleTwo(zcu, inst); + const mask = unwrapped.mask; + const operand_a = try f.resolveInst(unwrapped.operand_a); + const operand_b = try f.resolveInst(unwrapped.operand_b); + const inst_ty = unwrapped.result_ty; + const elem_ty = inst_ty.childType(zcu); + + const writer = f.object.writer(); + const local = try f.allocLocal(inst, inst_ty); + try reap(f, inst, &.{ unwrapped.operand_a, unwrapped.operand_b }); // local cannot alias operands + for (mask, 0..) 
|mask_elem, out_idx| { + try f.writeCValue(writer, local, .Other); try writer.writeByte('['); - try f.object.dg.renderValue(writer, src_val, .Other); - try writer.writeAll("];\n"); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, out_idx), .Other); + try writer.writeAll("] = "); + switch (mask_elem.unwrap()) { + .a_elem => |src_idx| { + try f.writeCValue(writer, operand_a, .Other); + try writer.writeByte('['); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other); + try writer.writeByte(']'); + }, + .b_elem => |src_idx| { + try f.writeCValue(writer, operand_b, .Other); + try writer.writeByte('['); + try f.object.dg.renderValue(writer, try pt.intValue(.usize, src_idx), .Other); + try writer.writeByte(']'); + }, + .undef => try f.object.dg.renderUndefValue(writer, elem_ty, .Other), + } + try writer.writeAll(";\n"); } return local; diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 77d8f3ff47..960d5f819b 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -4969,7 +4969,8 @@ pub const FuncGen = struct { .error_name => try self.airErrorName(inst), .splat => try self.airSplat(inst), .select => try self.airSelect(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .aggregate_init => try self.airAggregateInit(inst), .union_init => try self.airUnionInit(inst), .prefetch => try self.airPrefetch(inst), @@ -9666,7 +9667,7 @@ pub const FuncGen = struct { const zcu = o.pt.zcu; const ip = &zcu.intern_pool; for (body_tail[1..]) |body_inst| { - switch (fg.liveness.categorizeOperand(fg.air, body_inst, body_tail[0], ip)) { + switch (fg.liveness.categorizeOperand(fg.air, zcu, body_inst, body_tail[0], ip)) { .none => continue, .write, .noret, .complex => return false, .tomb => return true, @@ -10421,42 +10422,192 @@ pub const FuncGen = struct { return self.wip.select(.normal, pred, a, b, ""); } - fn airShuffle(self: *FuncGen, inst: 
Air.Inst.Index) !Builder.Value { - const o = self.ng.object; + fn airShuffleOne(fg: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + const o = fg.ng.object; const pt = o.pt; const zcu = pt.zcu; - const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - const a = try self.resolveInst(extra.a); - const b = try self.resolveInst(extra.b); - const mask = Value.fromInterned(extra.mask); - const mask_len = extra.mask_len; - const a_len = self.typeOf(extra.a).vectorLen(zcu); + const gpa = zcu.gpa; - // LLVM uses integers larger than the length of the first array to - // index into the second array. This was deemed unnecessarily fragile - // when changing code, so Zig uses negative numbers to index the - // second vector. These start at -1 and go down, and are easiest to use - // with the ~ operator. Here we convert between the two formats. - const values = try self.gpa.alloc(Builder.Constant, mask_len); - defer self.gpa.free(values); + const unwrapped = fg.air.unwrapShuffleOne(zcu, inst); - for (values, 0..) 
|*val, i| { - const elem = try mask.elemValue(pt, i); - if (elem.isUndef(zcu)) { - val.* = try o.builder.undefConst(.i32); - } else { - const int = elem.toSignedInt(zcu); - const unsigned: u32 = @intCast(if (int >= 0) int else ~int + a_len); - val.* = try o.builder.intConst(.i32, unsigned); + const operand = try fg.resolveInst(unwrapped.operand); + const mask = unwrapped.mask; + const operand_ty = fg.typeOf(unwrapped.operand); + const llvm_operand_ty = try o.lowerType(operand_ty); + const llvm_result_ty = try o.lowerType(unwrapped.result_ty); + const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu)); + const llvm_poison_elem = try o.builder.poisonConst(llvm_elem_ty); + const llvm_poison_mask_elem = try o.builder.poisonConst(.i32); + const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32); + + // LLVM requires that the two input vectors have the same length, so lowering isn't trivial. + // And, in the words of jacobly0: "llvm sucks at shuffles so we do have to hold its hand at + // least a bit". So, there are two cases here. + // + // If the operand length equals the mask length, we do just the one `shufflevector`, where + // the second operand is a constant vector with comptime-known elements at the right indices + // and poison values elsewhere (in the indices which won't be selected). + // + // Otherwise, we lower to *two* `shufflevector` instructions. The first shuffles the runtime + // operand with an all-poison vector to extract and correctly position all of the runtime + // elements. We also make a constant vector with all of the comptime elements correctly + // positioned. Then, our second instruction selects elements from those "runtime-or-poison" + // and "comptime-or-poison" vectors to compute the result. + + // This buffer is used primarily for the mask constants. 
+ const llvm_elem_buf = try gpa.alloc(Builder.Constant, mask.len); + defer gpa.free(llvm_elem_buf); + + // ...but first, we'll collect all of the comptime-known values. + var any_defined_comptime_value = false; + for (mask, llvm_elem_buf) |mask_elem, *llvm_elem| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => llvm_poison_elem, + .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) elem: { + any_defined_comptime_value = true; + break :elem try o.lowerValue(val); + } else llvm_poison_elem, + }; + } + // This vector is like the result, but runtime elements are replaced with poison. + const comptime_and_poison: Builder.Value = if (any_defined_comptime_value) vec: { + break :vec try o.builder.vectorValue(llvm_result_ty, llvm_elem_buf); + } else try o.builder.poisonValue(llvm_result_ty); + + if (operand_ty.vectorLen(zcu) == mask.len) { + // input length equals mask/output length, so we lower to one instruction + for (mask, llvm_elem_buf, 0..) |mask_elem, *llvm_elem, elem_idx| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => |idx| try o.builder.intConst(.i32, idx), + .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) mask_val: { + break :mask_val try o.builder.intConst(.i32, mask.len + elem_idx); + } else llvm_poison_mask_elem, + }; } + return fg.wip.shuffleVector( + operand, + comptime_and_poison, + try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf), + "", + ); } - const llvm_mask_value = try o.builder.vectorValue( - try o.builder.vectorType(.normal, mask_len, .i32), - values, + for (mask, llvm_elem_buf) |mask_elem, *llvm_elem| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => |idx| try o.builder.intConst(.i32, idx), + .value => llvm_poison_mask_elem, + }; + } + // This vector is like our result, but all comptime-known elements are poison. 
+ const runtime_and_poison = try fg.wip.shuffleVector( + operand, + try o.builder.poisonValue(llvm_operand_ty), + try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf), + "", + ); + + if (!any_defined_comptime_value) { + // `comptime_and_poison` is just poison; a second shuffle would be a nop. + return runtime_and_poison; + } + + // In this second shuffle, the inputs, the mask, and the output all have the same length. + for (mask, llvm_elem_buf, 0..) |mask_elem, *llvm_elem, elem_idx| { + llvm_elem.* = switch (mask_elem.unwrap()) { + .elem => try o.builder.intConst(.i32, elem_idx), + .value => |val| if (!Value.fromInterned(val).isUndef(zcu)) mask_val: { + break :mask_val try o.builder.intConst(.i32, mask.len + elem_idx); + } else llvm_poison_mask_elem, + }; + } + // Merge the runtime and comptime elements with the mask we just built. + return fg.wip.shuffleVector( + runtime_and_poison, + comptime_and_poison, + try o.builder.vectorValue(llvm_mask_ty, llvm_elem_buf), + "", + ); + } + + fn airShuffleTwo(fg: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + const o = fg.ng.object; + const pt = o.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + + const unwrapped = fg.air.unwrapShuffleTwo(zcu, inst); + + const mask = unwrapped.mask; + const llvm_elem_ty = try o.lowerType(unwrapped.result_ty.childType(zcu)); + const llvm_mask_ty = try o.builder.vectorType(.normal, @intCast(mask.len), .i32); + const llvm_poison_mask_elem = try o.builder.poisonConst(.i32); + + // This is kind of simpler than in `airShuffleOne`. We extend the shorter vector to the + // length of the longer one with an initial `shufflevector` if necessary, and then do the + // actual computation with a second `shufflevector`. + + const operand_a_len = fg.typeOf(unwrapped.operand_a).vectorLen(zcu); + const operand_b_len = fg.typeOf(unwrapped.operand_b).vectorLen(zcu); + const operand_len: u32 = @max(operand_a_len, operand_b_len); + + // If we need to extend an operand, this is the type that mask will have. 
+ const llvm_operand_mask_ty = try o.builder.vectorType(.normal, operand_len, .i32); + + const llvm_elem_buf = try gpa.alloc(Builder.Constant, @max(mask.len, operand_len)); + defer gpa.free(llvm_elem_buf); + + const operand_a: Builder.Value = extend: { + const raw = try fg.resolveInst(unwrapped.operand_a); + if (operand_a_len == operand_len) break :extend raw; + // Extend with a `shufflevector`, with a mask `<0, 1, ..., n, poison, poison, ..., poison>` + const mask_elems = llvm_elem_buf[0..operand_len]; + for (mask_elems[0..operand_a_len], 0..) |*llvm_elem, elem_idx| { + llvm_elem.* = try o.builder.intConst(.i32, elem_idx); + } + @memset(mask_elems[operand_a_len..], llvm_poison_mask_elem); + const llvm_this_operand_ty = try o.builder.vectorType(.normal, operand_a_len, llvm_elem_ty); + break :extend try fg.wip.shuffleVector( + raw, + try o.builder.poisonValue(llvm_this_operand_ty), + try o.builder.vectorValue(llvm_operand_mask_ty, mask_elems), + "", + ); + }; + const operand_b: Builder.Value = extend: { + const raw = try fg.resolveInst(unwrapped.operand_b); + if (operand_b_len == operand_len) break :extend raw; + // Extend with a `shufflevector`, with a mask `<0, 1, ..., n, poison, poison, ..., poison>` + const mask_elems = llvm_elem_buf[0..operand_len]; + for (mask_elems[0..operand_b_len], 0..) |*llvm_elem, elem_idx| { + llvm_elem.* = try o.builder.intConst(.i32, elem_idx); + } + @memset(mask_elems[operand_b_len..], llvm_poison_mask_elem); + const llvm_this_operand_ty = try o.builder.vectorType(.normal, operand_b_len, llvm_elem_ty); + break :extend try fg.wip.shuffleVector( + raw, + try o.builder.poisonValue(llvm_this_operand_ty), + try o.builder.vectorValue(llvm_operand_mask_ty, mask_elems), + "", + ); + }; + + // `operand_a` and `operand_b` now have the same length (we've extended the shorter one with + // an initial shuffle if necessary). Now for the easy bit. 
+ + const mask_elems = llvm_elem_buf[0..mask.len]; + for (mask, mask_elems) |mask_elem, *llvm_mask_elem| { + llvm_mask_elem.* = switch (mask_elem.unwrap()) { + .a_elem => |idx| try o.builder.intConst(.i32, idx), + .b_elem => |idx| try o.builder.intConst(.i32, operand_len + idx), + .undef => llvm_poison_mask_elem, + }; + } + return fg.wip.shuffleVector( + operand_a, + operand_b, + try o.builder.vectorValue(llvm_mask_ty, mask_elems), + "", ); - return self.wip.shuffleVector(a, b, llvm_mask_value, ""); } /// Reduce a vector by repeatedly applying `llvm_fn` to produce an accumulated result. diff --git a/src/codegen/spirv.zig b/src/codegen/spirv.zig index 1381a79075..f83c6979ff 100644 --- a/src/codegen/spirv.zig +++ b/src/codegen/spirv.zig @@ -3252,7 +3252,8 @@ const NavGen = struct { .splat => try self.airSplat(inst), .reduce, .reduce_optimized => try self.airReduce(inst), - .shuffle => try self.airShuffle(inst), + .shuffle_one => try self.airShuffleOne(inst), + .shuffle_two => try self.airShuffleTwo(inst), .ptr_add => try self.airPtrAdd(inst), .ptr_sub => try self.airPtrSub(inst), @@ -4047,40 +4048,57 @@ const NavGen = struct { return result_id; } - fn airShuffle(self: *NavGen, inst: Air.Inst.Index) !?IdRef { - const pt = self.pt; + fn airShuffleOne(ng: *NavGen, inst: Air.Inst.Index) !?IdRef { + const pt = ng.pt; const zcu = pt.zcu; - const ty_pl = self.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = self.air.extraData(Air.Shuffle, ty_pl.payload).data; - const a = try self.resolve(extra.a); - const b = try self.resolve(extra.b); - const mask = Value.fromInterned(extra.mask); + const gpa = zcu.gpa; - // Note: number of components in the result, a, and b may differ. 
- const result_ty = self.typeOfIndex(inst); - const scalar_ty = result_ty.scalarType(zcu); - const scalar_ty_id = try self.resolveType(scalar_ty, .direct); + const unwrapped = ng.air.unwrapShuffleOne(zcu, inst); + const mask = unwrapped.mask; + const result_ty = unwrapped.result_ty; + const elem_ty = result_ty.childType(zcu); + const operand = try ng.resolve(unwrapped.operand); - const constituents = try self.gpa.alloc(IdRef, result_ty.vectorLen(zcu)); - defer self.gpa.free(constituents); + const constituents = try gpa.alloc(IdRef, mask.len); + defer gpa.free(constituents); - for (constituents, 0..) |*id, i| { - const elem = try mask.elemValue(pt, i); - if (elem.isUndef(zcu)) { - id.* = try self.spv.constUndef(scalar_ty_id); - continue; - } - - const index = elem.toSignedInt(zcu); - if (index >= 0) { - id.* = try self.extractVectorComponent(scalar_ty, a, @intCast(index)); - } else { - id.* = try self.extractVectorComponent(scalar_ty, b, @intCast(~index)); - } + for (constituents, mask) |*id, mask_elem| { + id.* = switch (mask_elem.unwrap()) { + .elem => |idx| try ng.extractVectorComponent(elem_ty, operand, idx), + .value => |val| try ng.constant(elem_ty, .fromInterned(val), .direct), + }; } - const result_ty_id = try self.resolveType(result_ty, .direct); - return try self.constructComposite(result_ty_id, constituents); + const result_ty_id = try ng.resolveType(result_ty, .direct); + return try ng.constructComposite(result_ty_id, constituents); + } + + fn airShuffleTwo(ng: *NavGen, inst: Air.Inst.Index) !?IdRef { + const pt = ng.pt; + const zcu = pt.zcu; + const gpa = zcu.gpa; + + const unwrapped = ng.air.unwrapShuffleTwo(zcu, inst); + const mask = unwrapped.mask; + const result_ty = unwrapped.result_ty; + const elem_ty = result_ty.childType(zcu); + const elem_ty_id = try ng.resolveType(elem_ty, .direct); + const operand_a = try ng.resolve(unwrapped.operand_a); + const operand_b = try ng.resolve(unwrapped.operand_b); + + const constituents = try gpa.alloc(IdRef, 
mask.len); + defer gpa.free(constituents); + + for (constituents, mask) |*id, mask_elem| { + id.* = switch (mask_elem.unwrap()) { + .a_elem => |idx| try ng.extractVectorComponent(elem_ty, operand_a, idx), + .b_elem => |idx| try ng.extractVectorComponent(elem_ty, operand_b, idx), + .undef => try ng.spv.constUndef(elem_ty_id), + }; + } + + const result_ty_id = try ng.resolveType(result_ty, .direct); + return try ng.constructComposite(result_ty_id, constituents); } fn indicesToIds(self: *NavGen, indices: []const u32) ![]IdRef { diff --git a/src/print_air.zig b/src/print_air.zig index 0f658dcd9f..6085adbcdc 100644 --- a/src/print_air.zig +++ b/src/print_air.zig @@ -315,7 +315,8 @@ const Writer = struct { .wasm_memory_grow => try w.writeWasmMemoryGrow(s, inst), .mul_add => try w.writeMulAdd(s, inst), .select => try w.writeSelect(s, inst), - .shuffle => try w.writeShuffle(s, inst), + .shuffle_one => try w.writeShuffleOne(s, inst), + .shuffle_two => try w.writeShuffleTwo(s, inst), .reduce, .reduce_optimized => try w.writeReduce(s, inst), .cmp_vector, .cmp_vector_optimized => try w.writeCmpVector(s, inst), .vector_store_elem => try w.writeVectorStoreElem(s, inst), @@ -499,14 +500,39 @@ const Writer = struct { try w.writeOperand(s, inst, 2, pl_op.operand); } - fn writeShuffle(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { - const ty_pl = w.air.instructions.items(.data)[@intFromEnum(inst)].ty_pl; - const extra = w.air.extraData(Air.Shuffle, ty_pl.payload).data; - - try w.writeOperand(s, inst, 0, extra.a); + fn writeShuffleOne(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { + const unwrapped = w.air.unwrapShuffleOne(w.pt.zcu, inst); + try w.writeType(s, unwrapped.result_ty); try s.writeAll(", "); - try w.writeOperand(s, inst, 1, extra.b); - try s.print(", mask {d}, len {d}", .{ extra.mask, extra.mask_len }); + try w.writeOperand(s, inst, 0, unwrapped.operand); + try s.writeAll(", ["); + for (unwrapped.mask, 0..) 
|mask_elem, mask_idx| { + if (mask_idx > 0) try s.writeAll(", "); + switch (mask_elem.unwrap()) { + .elem => |idx| try s.print("elem {d}", .{idx}), + .value => |val| try s.print("val {}", .{Value.fromInterned(val).fmtValue(w.pt)}), + } + } + try s.writeByte(']'); + } + + fn writeShuffleTwo(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { + const unwrapped = w.air.unwrapShuffleTwo(w.pt.zcu, inst); + try w.writeType(s, unwrapped.result_ty); + try s.writeAll(", "); + try w.writeOperand(s, inst, 0, unwrapped.operand_a); + try s.writeAll(", "); + try w.writeOperand(s, inst, 1, unwrapped.operand_b); + try s.writeAll(", ["); + for (unwrapped.mask, 0..) |mask_elem, mask_idx| { + if (mask_idx > 0) try s.writeAll(", "); + switch (mask_elem.unwrap()) { + .a_elem => |idx| try s.print("a_elem {d}", .{idx}), + .b_elem => |idx| try s.print("b_elem {d}", .{idx}), + .undef => try s.writeAll("undef"), + } + } + try s.writeByte(']'); } fn writeSelect(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void { diff --git a/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig b/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig index c1594d55fb..4ad01d28c4 100644 --- a/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig +++ b/test/cases/compile_errors/shuffle_with_selected_index_past_first_vector_length.zig @@ -1,14 +1,20 @@ -export fn entry() void { - const v: @Vector(4, u32) = [4]u32{ 10, 11, 12, 13 }; - const x: @Vector(4, u32) = [4]u32{ 14, 15, 16, 17 }; - const z = @shuffle(u32, v, x, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 }); - _ = z; +export fn foo() void { + // Here, the bad index ('7') is not less than 'b.len', so the error shouldn't have a note suggesting a negative index. 
+ const a: @Vector(4, u32) = .{ 10, 11, 12, 13 }; + const b: @Vector(4, u32) = .{ 14, 15, 16, 17 }; + _ = @shuffle(u32, a, b, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 }); +} +export fn bar() void { + // Here, the bad index ('7') *is* less than 'b.len', so the error *should* have a note suggesting a negative index. + const a: @Vector(4, u32) = .{ 10, 11, 12, 13 }; + const b: @Vector(9, u32) = .{ 14, 15, 16, 17, 18, 19, 20, 21, 22 }; + _ = @shuffle(u32, a, b, [8]i32{ 0, 1, 2, 3, 7, 6, 5, 4 }); } // error -// backend=stage2 -// target=native // -// :4:41: error: mask index '4' has out-of-bounds selection -// :4:29: note: selected index '7' out of bounds of '@Vector(4, u32)' -// :4:32: note: selections from the second vector are specified with negative numbers +// :5:35: error: mask element at index '4' selects out-of-bounds index +// :5:23: note: index '7' exceeds bounds of '@Vector(4, u32)' given here +// :11:35: error: mask element at index '4' selects out-of-bounds index +// :11:23: note: index '7' exceeds bounds of '@Vector(4, u32)' given here +// :11:26: note: use '~@as(u32, 7)' to index into second vector given here