diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig
index 3d82cad4ec..a4fa7e179c 100644
--- a/src/arch/sparc64/CodeGen.zig
+++ b/src/arch/sparc64/CodeGen.zig
@@ -22,6 +22,7 @@ const Type = @import("../../type.zig").Type;
 const CodeGenError = codegen.CodeGenError;
 const Result = @import("../../codegen.zig").Result;
 const DebugInfoOutput = @import("../../codegen.zig").DebugInfoOutput;
+const Endian = std.builtin.Endian;
 
 const build_options = @import("build_options");
 
@@ -30,6 +31,7 @@ const abi = @import("abi.zig");
 const errUnionPayloadOffset = codegen.errUnionPayloadOffset;
 const errUnionErrorOffset = codegen.errUnionErrorOffset;
 const Instruction = bits.Instruction;
+const ASI = Instruction.ASI;
 const ShiftWidth = Instruction.ShiftWidth;
 const RegisterManager = abi.RegisterManager;
 const RegisterLock = RegisterManager.RegisterLock;
@@ -615,7 +617,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
             .clz => try self.airClz(inst),
             .ctz => try self.airCtz(inst),
             .popcount => try self.airPopcount(inst),
-            .byte_swap => @panic("TODO try self.airByteSwap(inst)"),
+            .byte_swap => try self.airByteSwap(inst),
             .bit_reverse => try self.airBitReverse(inst),
             .tag_name => try self.airTagName(inst),
             .error_name => try self.airErrorName(inst),
@@ -1200,6 +1202,90 @@ fn airBreakpoint(self: *Self) !void {
     return self.finishAirBookkeeping();
 }
 
+fn airByteSwap(self: *Self, inst: Air.Inst.Index) !void {
+    const ty_op = self.air.instructions.items(.data)[inst].ty_op;
+
+    // We have a hardware byteswapper in SPARCv9; don't let mainstream compilers
+    // mislead you. That being said, the strategy to lower this is:
+    // - If src is an immediate, comptime-swap it.
+    // - If src is in memory, issue an LD*A with #ASI_P_[opposite-endian].
+    // - If src is a register, issue an ST*A with #ASI_P_[opposite-endian]
+    //   to a stack slot, then follow with a normal load from said stack slot.
+    //   This is because on some implementations ASI-tagged memory operations
+    //   are non-pipelinable, and loads tend to have longer latency than stores,
+    //   so this sequence minimizes stalls.
+    // The result will always be either another immediate or stored in a register.
+    // TODO: Fold byteswap+store into a single ST*A and load+byteswap into a single LD*A.
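+    //
+    // For example, swapping a u32 held in a register lowers to roughly this
+    // sequence (register choices here are illustrative, not fixed):
+    //   stwa %o0, [%sp + %o1] #ASI_P_L  ! little-endian store into a stack slot
+    //   lduw [%sp + %o1], %o0           ! normal big-endian reload; %o0 is now swapped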
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else result: {
+        const operand = try self.resolveInst(ty_op.operand);
+        const operand_ty = self.air.typeOf(ty_op.operand);
+        switch (operand_ty.zigTypeTag()) {
+            .Vector => return self.fail("TODO byteswap for vectors", .{}),
+            .Int => {
+                const int_info = operand_ty.intInfo(self.target.*);
+                if (int_info.bits == 8) break :result operand;
+
+                const abi_size = int_info.bits >> 3;
+                const abi_align = operand_ty.abiAlignment(self.target.*);
+                const opposite_endian_asi = switch (self.target.cpu.arch.endian()) {
+                    Endian.Big => ASI.asi_primary_little,
+                    Endian.Little => ASI.asi_primary,
+                };
+
+                switch (operand) {
+                    .immediate => |imm| {
+                        const swapped = switch (int_info.bits) {
+                            16 => @byteSwap(@intCast(u16, imm)),
+                            24 => @byteSwap(@intCast(u24, imm)),
+                            32 => @byteSwap(@intCast(u32, imm)),
+                            40 => @byteSwap(@intCast(u40, imm)),
+                            48 => @byteSwap(@intCast(u48, imm)),
+                            56 => @byteSwap(@intCast(u56, imm)),
+                            64 => @byteSwap(@intCast(u64, imm)),
+                            else => return self.fail("TODO synthesize SPARCv9 byteswap for other integer sizes", .{}),
+                        };
+                        break :result .{ .immediate = swapped };
+                    },
+                    .register => |reg| {
+                        if (int_info.bits > 64 or @popCount(int_info.bits) != 1)
+                            return self.fail("TODO synthesize SPARCv9 byteswap for other integer sizes", .{});
+
+                        const off = try self.allocMem(inst, abi_size, abi_align);
+                        const off_reg = try self.copyToTmpRegister(operand_ty, .{ .immediate = realStackOffset(off) });
+
+                        // Opposite-endian store into the slot, then a normal load back.
+                        try self.genStoreASI(reg, .sp, off_reg, abi_size, opposite_endian_asi);
+                        try self.genLoad(reg, .sp, Register, off_reg, abi_size);
+                        break :result .{ .register = reg };
+                    },
+                    .memory => {
+                        if (int_info.bits > 64 or @popCount(int_info.bits) != 1)
+                            return self.fail("TODO synthesize SPARCv9 byteswap for other integer sizes", .{});
+
+                        const addr_reg = try self.copyToTmpRegister(operand_ty, operand);
+                        const dst_reg = try self.register_manager.allocReg(null, gp);
+
+                        try self.genLoadASI(dst_reg, addr_reg, .g0, abi_size, opposite_endian_asi);
+                        break :result .{ .register = dst_reg };
+                    },
+                    .stack_offset => |off| {
+                        if (int_info.bits > 64 or @popCount(int_info.bits) != 1)
+                            return self.fail("TODO synthesize SPARCv9 byteswap for other integer sizes", .{});
+
+                        const off_reg = try self.copyToTmpRegister(operand_ty, .{ .immediate = realStackOffset(off) });
+                        const dst_reg = try self.register_manager.allocReg(null, gp);
+
+                        try self.genLoadASI(dst_reg, .sp, off_reg, abi_size, opposite_endian_asi);
+                        break :result .{ .register = dst_reg };
+                    },
+                    else => unreachable,
+                }
+            },
+            else => unreachable,
+        }
+    };
+
+    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });
+}
+
 fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier) !void {
     if (modifier == .always_tail) return self.fail("TODO implement tail calls for {}", .{self.target.cpu.arch});
 
@@ -3583,6 +3669,34 @@ fn genLoad(self: *Self, value_reg: Register, addr_reg: Register, comptime off_ty
     }
 }
 
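+/// Emits an ASI-tagged load of `abi_size` bytes from [addr_reg + off_reg] into
+/// value_reg. With an opposite-endian ASI, the loaded bytes arrive swapped.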
+fn genLoadASI(self: *Self, value_reg: Register, addr_reg: Register, off_reg: Register, abi_size: u64, asi: ASI) !void {
+    switch (abi_size) {
+        1, 2, 4, 8 => {
+            const tag: Mir.Inst.Tag = switch (abi_size) {
+                1 => .lduba,
+                2 => .lduha,
+                4 => .lduwa,
+                8 => .ldxa,
+                else => unreachable, // unexpected abi size
+            };
+
+            _ = try self.addInst(.{
+                .tag = tag,
+                .data = .{
+                    .mem_asi = .{
+                        .rd = value_reg,
+                        .rs1 = addr_reg,
+                        .rs2 = off_reg,
+                        .asi = asi,
+                    },
+                },
+            });
+        },
+        3, 5, 6, 7 => return self.fail("TODO: genLoadASI for more abi_sizes", .{}),
+        else => unreachable,
+    }
+}
+
 fn genSetReg(self: *Self, ty: Type, reg: Register, mcv: MCValue) InnerError!void {
     switch (mcv) {
         .dead => unreachable,
@@ -3942,6 +4056,34 @@ fn genStore(self: *Self, value_reg: Register, addr_reg: Register, comptime off_t
     }
 }
 
+fn genStoreASI(self: *Self, value_reg: Register, addr_reg: Register, off_reg: Register, abi_size: u64, asi: ASI) !void {
+    switch (abi_size) {
+        1, 2, 4, 8 => {
+            const tag: Mir.Inst.Tag = switch (abi_size) {
+                1 => .stba,
+                2 => .stha,
+                4 => .stwa,
+                8 => .stxa,
+                else => unreachable, // unexpected abi size
+            };
+
+            _ = try self.addInst(.{
+                .tag = tag,
+                .data = .{
+                    .mem_asi = .{
+                        .rd = value_reg,
+                        .rs1 = addr_reg,
+                        .rs2 = off_reg,
+                        .asi = asi,
+                    },
+                },
+            });
+        },
+        3, 5, 6, 7 => return self.fail("TODO: genStoreASI for more abi_sizes", .{}),
+        else => unreachable,
+    }
+}
+
 fn genTypedValue(self: *Self, typed_value: TypedValue) InnerError!MCValue {
     const mcv: MCValue = switch (try codegen.genTypedValue(
         self.bin_file,
@@ -4257,12 +4399,12 @@ fn processDeath(self: *Self, inst: Air.Inst.Index) void {
 /// Turns stack_offset MCV into a real SPARCv9 stack offset usable for asm.
 fn realStackOffset(off: u32) u32 {
     return off
-    // SPARCv9 %sp points away from the stack by some amount.
-    + abi.stack_bias
-    // The first couple bytes of each stack frame is reserved
-    // for ABI and hardware purposes.
-    + abi.stack_reserved_area;
-    // Only after that we have the usable stack frame portion.
+        // SPARCv9 %sp points away from the stack by some amount.
+        + abi.stack_bias
+        // The first couple bytes of each stack frame are reserved
+        // for ABI and hardware purposes.
+        + abi.stack_reserved_area;
+    // Only after that do we have the usable stack frame portion.
 }
 
 /// Caller must call `CallMCValues.deinit`.
diff --git a/src/arch/sparc64/Emit.zig b/src/arch/sparc64/Emit.zig
index 7e71492af7..c0dbab5a14 100644
--- a/src/arch/sparc64/Emit.zig
+++ b/src/arch/sparc64/Emit.zig
@@ -91,6 +91,11 @@ pub fn emitMir(
         .lduw => try emit.mirArithmetic3Op(inst),
         .ldx => try emit.mirArithmetic3Op(inst),
 
+        .lduba => unreachable,
+        .lduha => unreachable,
+        .lduwa => unreachable,
+        .ldxa => unreachable,
+
         .@"and" => try emit.mirArithmetic3Op(inst),
         .@"or" => try emit.mirArithmetic3Op(inst),
         .xor => try emit.mirArithmetic3Op(inst),
@@ -127,6 +132,11 @@ pub fn emitMir(
         .stw => try emit.mirArithmetic3Op(inst),
         .stx => try emit.mirArithmetic3Op(inst),
 
+        .stba => unreachable,
+        .stha => unreachable,
+        .stwa => unreachable,
+        .stxa => unreachable,
+
         .sub => try emit.mirArithmetic3Op(inst),
         .subcc => try emit.mirArithmetic3Op(inst),
 
diff --git a/src/arch/sparc64/Mir.zig b/src/arch/sparc64/Mir.zig
index f854152a2f..f9a4056705 100644
--- a/src/arch/sparc64/Mir.zig
+++ b/src/arch/sparc64/Mir.zig
@@ -15,6 +15,7 @@ const bits = @import("bits.zig");
 const Air = @import("../../Air.zig");
 
 const Instruction = bits.Instruction;
+const ASI = bits.Instruction.ASI;
 const Register = bits.Register;
 
 instructions: std.MultiArrayList(Inst).Slice,
@@ -70,6 +71,16 @@ pub const Inst = struct {
         lduw,
         ldx,
 
+        /// A.28 Load Integer from Alternate Space
+        /// This uses the mem_asi field.
+        /// Note that the ldda variant of this instruction is deprecated, so do not emit
+        /// it unless specifically requested (e.g. by inline assembly).
+        // TODO add other operations.
+        lduba,
+        lduha,
+        lduwa,
+        ldxa,
+
         /// A.31 Logical Operations
         /// This uses the arithmetic_3op field.
         // TODO add other operations.
@@ -132,6 +143,16 @@ pub const Inst = struct {
         stw,
         stx,
 
+        /// A.55 Store Integer into Alternate Space
+        /// This uses the mem_asi field.
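+        /// The ASI (Address Space Identifier) selects the address space the access
+        /// goes through; #ASI_PRIMARY_LITTLE (#ASI_P_L), for instance, performs the
+        /// access little-endian, which is what the byteswap lowering relies on.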
+        /// Note that the stda variant of this instruction is deprecated, so do not emit
+        /// it unless specifically requested (e.g. by inline assembly).
+        // TODO add other operations.
+        stba,
+        stha,
+        stwa,
+        stxa,
+
         /// A.56 Subtract
         /// This uses the arithmetic_3op field.
         // TODO add other operations.
@@ -241,6 +262,15 @@ pub const Inst = struct {
             inst: Index,
         },
 
+        /// ASI-tagged memory operations.
+        /// Used by e.g. ldxa, stxa
+        mem_asi: struct {
+            rd: Register,
+            rs1: Register,
+            rs2: Register = .g0,
+            asi: ASI,
+        },
+
         /// Membar mask, controls the barrier behavior
         /// Used by e.g. membar
         membar_mask: struct {
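Note: as this diff stands, the new load/store tags are routed to `unreachable` in Emit.zig, so nothing is encoded yet. A handler that consumes the `mem_asi` payload could look roughly like the sketch below; every name in it (`mirMemASI`, the per-mnemonic `Instruction` encoders, `writeInstruction`) is an assumption for illustration, not something this diff introduces.

// Sketch only: hypothetical Emit.zig handler for the mem_asi payload,
// structured like the existing mirArithmetic3Op helper. Assumes
// bits.Instruction gains per-mnemonic encoders that accept an ASI.
fn mirMemASI(emit: *Emit, inst: Mir.Inst.Index) !void {
    const tag = emit.mir.instructions.items(.tag)[inst];
    const data = emit.mir.instructions.items(.data)[inst].mem_asi;

    switch (tag) {
        // Loads: rd receives the ASI-tagged load from [rs1 + rs2].
        .lduba => try emit.writeInstruction(Instruction.lduba(data.asi, data.rs1, data.rs2, data.rd)),
        .lduha => try emit.writeInstruction(Instruction.lduha(data.asi, data.rs1, data.rs2, data.rd)),
        .lduwa => try emit.writeInstruction(Instruction.lduwa(data.asi, data.rs1, data.rs2, data.rd)),
        .ldxa => try emit.writeInstruction(Instruction.ldxa(data.asi, data.rs1, data.rs2, data.rd)),
        // Stores: rd is stored, ASI-tagged, to [rs1 + rs2].
        .stba => try emit.writeInstruction(Instruction.stba(data.asi, data.rs1, data.rs2, data.rd)),
        .stha => try emit.writeInstruction(Instruction.stha(data.asi, data.rs1, data.rs2, data.rd)),
        .stwa => try emit.writeInstruction(Instruction.stwa(data.asi, data.rs1, data.rs2, data.rd)),
        .stxa => try emit.writeInstruction(Instruction.stxa(data.asi, data.rs1, data.rs2, data.rd)),
        else => unreachable,
    }
}

Since `mem_asi.rs2` defaults to `.g0` (reads as zero on SPARC), a plain register-indirect access like the one in airByteSwap's `.memory` path needs no extra offset setup.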