diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 7271f8f7f6..317a50e0ec 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2597,28 +2597,143 @@ fn airGetUnionTag(self: *Self, inst: Air.Inst.Index) !void { fn airClz(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; - const result: MCValue = if (self.liveness.isUnused(inst)) - .dead - else - return self.fail("TODO implement airClz for {}", .{self.target.cpu.arch}); + const result = result: { + if (self.liveness.isUnused(inst)) break :result .dead; + + const dst_ty = self.air.typeOfIndex(inst); + const src_ty = self.air.typeOf(ty_op.operand); + const src_bits = src_ty.bitSize(self.target.*); + + const src_mcv = try self.resolveInst(ty_op.operand); + const mat_src_mcv = switch (src_mcv) { + .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) }, + else => src_mcv, + }; + const mat_src_lock = switch (mat_src_mcv) { + .register => |reg| self.register_manager.lockReg(reg), + else => null, + }; + defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock); + + const dst_reg = try self.register_manager.allocReg(inst, gp); + const dst_mcv = MCValue{ .register = dst_reg }; + const dst_lock = self.register_manager.lockReg(dst_reg); + defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); + + if (Target.x86.featureSetHas(self.target.cpu.features, .lzcnt)) { + try self.genBinOpMir(.lzcnt, src_ty, dst_mcv, mat_src_mcv); + const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*)); + const extra_bits = registerAlias(dst_reg, src_abi_size).bitSize() - src_bits; + if (extra_bits > 0) { + try self.genBinOpMir(.sub, dst_ty, dst_mcv, .{ .immediate = extra_bits }); + } + break :result dst_mcv; + } + + const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits }); + const width_mcv = MCValue{ .register = width_reg }; + try self.genBinOpMir(.bsr, src_ty, 
dst_mcv, mat_src_mcv); + + const dst_abi_size = @intCast(u32, @max(dst_ty.abiSize(self.target.*), 2)); + try self.asmCmovccRegisterRegister( + registerAlias(dst_reg, dst_abi_size), + registerAlias(width_reg, dst_abi_size), + .z, + ); + + try self.genBinOpMir(.sub, dst_ty, width_mcv, dst_mcv); + break :result width_mcv; + }; return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } fn airCtz(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; - const result: MCValue = if (self.liveness.isUnused(inst)) - .dead - else - return self.fail("TODO implement airCtz for {}", .{self.target.cpu.arch}); + const result = result: { + if (self.liveness.isUnused(inst)) break :result .dead; + + const dst_ty = self.air.typeOfIndex(inst); + const src_ty = self.air.typeOf(ty_op.operand); + const src_bits = src_ty.bitSize(self.target.*); + + const src_mcv = try self.resolveInst(ty_op.operand); + const mat_src_mcv = switch (src_mcv) { + .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) }, + else => src_mcv, + }; + const mat_src_lock = switch (mat_src_mcv) { + .register => |reg| self.register_manager.lockReg(reg), + else => null, + }; + defer if (mat_src_lock) |lock| self.register_manager.unlockReg(lock); + + const dst_reg = try self.register_manager.allocReg(inst, gp); + const dst_mcv = MCValue{ .register = dst_reg }; + const dst_lock = self.register_manager.lockReg(dst_reg); + defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); + + if (Target.x86.featureSetHas(self.target.cpu.features, .bmi)) { + const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*)); + const extra_bits = registerAlias(dst_reg, src_abi_size).bitSize() - src_bits; + const masked_mcv = if (extra_bits > 0) masked: { + const mask_mcv = MCValue{ + .immediate = ((@as(u64, 1) << @intCast(u6, extra_bits)) - 1) << @intCast(u6, src_bits), + }; + const tmp_mcv = tmp: { + if (src_mcv.isImmediate() or 
self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) { + break :tmp src_mcv; + } + try self.genSetReg(src_ty, dst_reg, src_mcv); + break :tmp dst_mcv; + }; + try self.genBinOpMir(.@"or", src_ty, tmp_mcv, mask_mcv); + break :masked tmp_mcv; + } else mat_src_mcv; + try self.genBinOpMir(.tzcnt, src_ty, dst_mcv, masked_mcv); + break :result dst_mcv; + } + + const width_reg = try self.copyToTmpRegister(dst_ty, .{ .immediate = src_bits }); + try self.genBinOpMir(.bsf, src_ty, dst_mcv, mat_src_mcv); + + const abi_size = @max(@intCast(u32, dst_ty.abiSize(self.target.*)), 2); + try self.asmCmovccRegisterRegister( + registerAlias(dst_reg, abi_size), + registerAlias(width_reg, abi_size), + .z, + ); + + break :result dst_mcv; + }; return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } fn airPopcount(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; - const result: MCValue = if (self.liveness.isUnused(inst)) - .dead - else - return self.fail("TODO implement airPopcount for {}", .{self.target.cpu.arch}); + const result = result: { + if (self.liveness.isUnused(inst)) break :result .dead; + + const op_ty = self.air.typeOf(ty_op.operand); + + if (Target.x86.featureSetHas(self.target.cpu.features, .popcnt)) { + const op_mcv = try self.resolveInst(ty_op.operand); + const mat_op_mcv = switch (op_mcv) { + .immediate => MCValue{ .register = try self.copyToTmpRegister(op_ty, op_mcv) }, + else => op_mcv, + }; + const mat_op_lock = switch (mat_op_mcv) { + .register => |reg| self.register_manager.lockReg(reg), + else => null, + }; + defer if (mat_op_lock) |lock| self.register_manager.unlockReg(lock); + + const dst_mcv = MCValue{ .register = try self.register_manager.allocReg(inst, gp) }; + try self.genBinOpMir(.popcnt, op_ty, dst_mcv, mat_op_mcv); + break :result dst_mcv; + } + + return self.fail("TODO implement airPopcount for {}", .{op_ty.fmt(self.bin_file.options.module.?)}); + }; return self.finishAir(inst, result, 
.{ ty_op.operand, .none, .none }); } @@ -3491,7 +3606,7 @@ fn genBinOp( if (lhs.isRegister() and self.reuseOperand(inst, lhs_air, 0, lhs)) { break :blk lhs; } - if (rhs.isRegister() and is_commutative and self.reuseOperand(inst, rhs_air, 1, rhs)) { + if (is_commutative and rhs.isRegister() and self.reuseOperand(inst, rhs_air, 1, rhs)) { flipped = true; break :blk rhs; } diff --git a/src/arch/x86_64/Emit.zig b/src/arch/x86_64/Emit.zig index cf948f6023..2da56c08cd 100644 --- a/src/arch/x86_64/Emit.zig +++ b/src/arch/x86_64/Emit.zig @@ -73,6 +73,8 @@ pub fn lowerMir(emit: *Emit) InnerError!void { .adc, .add, .@"and", + .bsf, + .bsr, .call, .cbw, .cwde, @@ -89,12 +91,14 @@ pub fn lowerMir(emit: *Emit) InnerError!void { .int3, .jmp, .lea, + .lzcnt, .mov, .movzx, .mul, .nop, .@"or", .pop, + .popcnt, .push, .ret, .sal, @@ -105,6 +109,7 @@ pub fn lowerMir(emit: *Emit) InnerError!void { .sub, .syscall, .@"test", + .tzcnt, .ud2, .xor, diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 96396640d6..177b996346 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -307,6 +307,7 @@ pub const Mnemonic = enum { // zig fmt: off // General-purpose adc, add, @"and", + bsf, bsr, call, cbw, cdq, cdqe, cmova, cmovae, cmovb, cmovbe, cmovc, cmove, cmovg, cmovge, cmovl, cmovle, cmovna, cmovnae, cmovnb, cmovnbe, cmovnc, cmovne, cmovng, cmovnge, cmovnl, cmovnle, cmovno, @@ -322,12 +323,13 @@ pub const Mnemonic = enum { jmp, lea, lods, lodsb, lodsd, lodsq, lodsw, + lzcnt, mov, movs, movsb, movsd, movsq, movsw, movsx, movsxd, movzx, mul, nop, @"or", - pop, push, + pop, popcnt, push, ret, sal, sar, sbb, scas, scasb, scasd, scasq, scasw, @@ -336,7 +338,7 @@ pub const Mnemonic = enum { setnb, setnbe, setnc, setne, setng, setnge, setnl, setnle, setno, setnp, setns, setnz, seto, setp, setpe, setpo, sets, setz, stos, stosb, stosd, stosq, stosw, - @"test", + @"test", tzcnt, ud2, xor, // SSE diff --git a/src/arch/x86_64/Mir.zig 
b/src/arch/x86_64/Mir.zig index 2f2c424517..befd1c692d 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -38,6 +38,10 @@ pub const Inst = struct { add, /// Logical and @"and", + /// Bit scan forward + bsf, + /// Bit scan reverse + bsr, /// Call call, /// Convert byte to word @@ -70,6 +74,8 @@ pub const Inst = struct { jmp, /// Load effective address lea, + /// Count the number of leading zero bits + lzcnt, /// Move mov, /// Move with sign extension @@ -84,6 +90,8 @@ pub const Inst = struct { @"or", /// Pop pop, + /// Return the count of number of bits set to 1 + popcnt, /// Push push, /// Return @@ -104,6 +112,8 @@ pub const Inst = struct { syscall, /// Test condition @"test", + /// Count the number of trailing zero bits + tzcnt, /// Undefined instruction ud2, /// Logical exclusive-or diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 6f2334ba4a..1d9663d6c4 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -81,6 +81,14 @@ pub const table = &[_]Entry{ .{ .@"and", .rm, .r32, .rm32, .none, .none, &.{ 0x23 }, 0, .none }, .{ .@"and", .rm, .r64, .rm64, .none, .none, &.{ 0x23 }, 0, .long }, + .{ .bsf, .rm, .r16, .rm16, .none, .none, &.{ 0x0f, 0xbc }, 0, .none }, + .{ .bsf, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbc }, 0, .none }, + .{ .bsf, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbc }, 0, .long }, + + .{ .bsr, .rm, .r16, .rm16, .none, .none, &.{ 0x0f, 0xbd }, 0, .none }, + .{ .bsr, .rm, .r32, .rm32, .none, .none, &.{ 0x0f, 0xbd }, 0, .none }, + .{ .bsr, .rm, .r64, .rm64, .none, .none, &.{ 0x0f, 0xbd }, 0, .long }, + // This is M encoding according to Intel, but D makes more sense here. 
.{ .call, .d, .rel32, .none, .none, .none, &.{ 0xe8 }, 0, .none }, .{ .call, .m, .rm64, .none, .none, .none, &.{ 0xff }, 2, .none }, @@ -301,6 +309,10 @@ pub const table = &[_]Entry{ .{ .lodsd, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .none }, .{ .lodsq, .np, .none, .none, .none, .none, &.{ 0xad }, 0, .long }, + .{ .lzcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .none }, + .{ .lzcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .none }, + .{ .lzcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xbd }, 0, .long }, + .{ .mov, .mr, .rm8, .r8, .none, .none, &.{ 0x88 }, 0, .none }, .{ .mov, .mr, .rm8, .r8, .none, .none, &.{ 0x88 }, 0, .rex }, .{ .mov, .mr, .rm16, .r16, .none, .none, &.{ 0x89 }, 0, .none }, @@ -397,6 +409,10 @@ pub const table = &[_]Entry{ .{ .pop, .m, .rm16, .none, .none, .none, &.{ 0x8f }, 0, .none }, .{ .pop, .m, .rm64, .none, .none, .none, &.{ 0x8f }, 0, .none }, + .{ .popcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none }, + .{ .popcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none }, + .{ .popcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long }, + .{ .push, .o, .r16, .none, .none, .none, &.{ 0x50 }, 0, .none }, .{ .push, .o, .r64, .none, .none, .none, &.{ 0x50 }, 0, .none }, .{ .push, .m, .rm16, .none, .none, .none, &.{ 0xff }, 6, .none }, @@ -596,8 +612,8 @@ pub const table = &[_]Entry{ .{ .sub, .rm, .r32, .rm32, .none, .none, &.{ 0x2b }, 0, .none }, .{ .sub, .rm, .r64, .rm64, .none, .none, &.{ 0x2b }, 0, .long }, - .{ .syscall, .np, .none, .none, .none, .none, &.{ 0x0f, 0x05 }, 0, .none }, - + .{ .syscall, .np, .none, .none, .none, .none, &.{ 0x0f, 0x05 }, 0, .none }, + .{ .@"test", .zi, .al, .imm8, .none, .none, &.{ 0xa8 }, 0, .none }, .{ .@"test", .zi, .ax, .imm16, .none, .none, &.{ 0xa9 }, 0, .none }, .{ .@"test", .zi, .eax, .imm32, .none, .none, &.{ 0xa9 }, 0, .none }, @@ -613,6 +629,10 @@ pub const table = &[_]Entry{ .{ .@"test", .mr, .rm32,
.r32, .none, .none, &.{ 0x85 }, 0, .none }, .{ .@"test", .mr, .rm64, .r64, .none, .none, &.{ 0x85 }, 0, .long }, + .{ .tzcnt, .rm, .r16, .rm16, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .none }, + .{ .tzcnt, .rm, .r32, .rm32, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .none }, + .{ .tzcnt, .rm, .r64, .rm64, .none, .none, &.{ 0xf3, 0x0f, 0xbc }, 0, .long }, + .{ .ud2, .np, .none, .none, .none, .none, &.{ 0x0f, 0x0b }, 0, .none }, .{ .xor, .zi, .al, .imm8, .none, .none, &.{ 0x34 }, 0, .none },