From ff7412752690d1690cfaedaa605128a992a6b280 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 17 Feb 2025 05:35:57 -0500 Subject: [PATCH] x86_64: implement prefetch --- src/arch/x86_64/CodeGen.zig | 33 ++++++++++++++++++++++----- src/arch/x86_64/Encoding.zig | 42 ++++++++++++++++++++++------------- src/arch/x86_64/Mir.zig | 14 ++++++++++++ src/arch/x86_64/encodings.zig | 12 ++++++++++ 4 files changed, 80 insertions(+), 21 deletions(-) diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 201a1ccfe6..92504679e1 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2484,7 +2484,6 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { .reduce => try cg.airReduce(inst), .reduce_optimized => try cg.airReduce(inst), .aggregate_init => try cg.airAggregateInit(inst), - .prefetch => try cg.airPrefetch(inst), // zig fmt: on .arg => if (cg.debug_output != .none) { @@ -76418,6 +76417,33 @@ fn genBody(cg: *CodeGen, body: []const Air.Inst.Index) InnerError!void { }, cg); try res.finish(inst, &.{extra.init}, &ops, cg); }, + .prefetch => { + const prefetch = air_datas[@intFromEnum(inst)].prefetch; + var ops = try cg.tempsFromOperands(inst, .{prefetch.ptr}); + switch (prefetch.cache) { + .instruction => {}, // prefetchi requires rip-relative addressing, which is currently non-trivial to emit from an arbitrary ptr value + .data => if (prefetch.rw == .write and prefetch.locality <= 2 and cg.hasFeature(.prefetchwt1)) { + try ops[0].toSlicePtr(cg); + while (try ops[0].toLea(cg)) {} + try cg.asmMemory(.{ ._wt1, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })); + } else if (prefetch.rw == .write and cg.hasFeature(.prfchw)) { + try ops[0].toSlicePtr(cg); + while (try ops[0].toLea(cg)) {} + try cg.asmMemory(.{ ._w, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })); + } else if (cg.hasFeature(.sse) or cg.hasFeature(.prfchw) or cg.hasFeature(.prefetchi) or cg.hasFeature(.prefetchwt1)) { + try ops[0].toSlicePtr(cg); + while (try ops[0].toLea(cg)) {} + switch (prefetch.locality) { + 0 => try cg.asmMemory(.{ ._nta, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })), + 1 => try cg.asmMemory(.{ ._t2, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })), + 2 => try cg.asmMemory(.{ ._t1, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })), + 3 => try cg.asmMemory(.{ ._t0, .prefetch }, try ops[0].tracking(cg).short.deref().mem(cg, .{ .size = .byte })), + } + }, + } + const res = try cg.tempInit(.void, .none); + try res.finish(inst, &.{prefetch.ptr}, &ops, cg); + }, .mul_add => |air_tag| if (use_old) try cg.airMulAdd(inst) else { const pl_op = air_datas[@intFromEnum(inst)].pl_op; const bin_op = cg.air.extraData(Air.Bin, pl_op.payload).data; @@ -94743,11 +94769,6 @@ fn airUnionInit(self: *CodeGen, inst: Air.Inst.Index) !void { return self.finishAir(inst, result, .{ extra.init, .none, .none }); } -fn airPrefetch(self: *CodeGen, inst: Air.Inst.Index) !void { - const prefetch = self.air.instructions.items(.data)[@intFromEnum(inst)].prefetch; - return self.finishAir(inst, .unreach, .{ prefetch.ptr, .none, .none }); -} - fn airMulAdd(self: *CodeGen, inst: Air.Inst.Index) !void { const pt = self.pt; const zcu = pt.zcu; diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 1fa3f5dece..65dd1bedbc 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -78,7 +78,7 @@ pub fn findByMnemonic( ), .x86_64 => false, }, - inline .@"invpcid 64bit", .@"rdpid 64bit" => |tag| switch (target.cpu.arch) { + inline .@"invpcid 64bit", .@"rdpid 64bit", .@"prefetchi 64bit" => |tag| switch (target.cpu.arch) { else => unreachable, .x86 => false, .x86_64 => std.Target.x86.featureSetHas( @@ -86,6 +86,7 @@ pub fn findByMnemonic( @field(std.Target.x86.Feature, @tagName(tag)[0 .. @tagName(tag).len - " 64bit".len]), ), }, + .prefetch => std.Target.x86.featureSetHasAny(target.cpu.features, .{ .sse, .prfchw, .prefetchi, .prefetchwt1 }), inline else => |tag| has_features: { comptime var feature_it = std.mem.splitScalar(u8, @tagName(tag), ' '); comptime var features: []const std.Target.x86.Feature = &.{}; @@ -375,6 +376,7 @@ pub const Mnemonic = enum { orps, pextrw, pinsrw, pmaxsw, pmaxub, pminsw, pminub, pmovmskb, + prefetchit0, prefetchit1, prefetchnta, prefetcht0, prefetcht1, prefetcht2, prefetchw, prefetchwt1, shufps, sqrtps, sqrtss, stmxcsr, @@ -562,8 +564,7 @@ pub const Op = enum { r32_m8, r32_m16, r64_m16, m8, m16, m32, m64, m80, m128, m256, rel8, rel16, rel32, - m, - moffs, + m, moffs, mrip8, sreg, st0, st, mm, mm_m64, xmm0, xmm, xmm_m8, xmm_m16, xmm_m32, xmm_m64, xmm_m128, @@ -617,7 +618,7 @@ pub const Op = enum { .mem => |mem| switch (mem) { .moffs => .moffs, - .sib, .rip => switch (mem.bitSize(target)) { + .sib => switch (mem.bitSize(target)) { 0 => .m, 8 => .m8, 16 => .m16, @@ -628,6 +629,16 @@ pub const Op = enum { 256 => .m256, else => unreachable, }, + .rip => switch (mem.bitSize(target)) { + 0, 8 => .mrip8, + 16 => .m16, + 32 => .m32, + 64 => .m64, + 80 => .m80, + 128 => .m128, + 256 => .m256, + else => unreachable, + }, }, .imm => |imm| switch (imm) { @@ -680,7 +691,7 @@ pub const Op = enum { pub fn immBitSize(op: Op) u64 { return switch (op) { - .none, .moffs, .m, .sreg => unreachable, + .none, .m, .moffs, .mrip8, .sreg => unreachable, .al, .cl, .dx, .rip, .eip, .ip, .r8, .rm8, .r32_m8 => unreachable, .ax, .r16, .rm16 => unreachable, .eax, .r32, .rm32, .r32_m16 => unreachable, @@ -700,7 +711,7 @@ pub const Op = enum { pub fn regBitSize(op: Op) u64 { return switch (op) { - .none, .moffs, .m, .sreg => unreachable, + .none, .m, .moffs, .mrip8, .sreg => unreachable, .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, @@ -716,13 +727,13 @@ pub const Op = enum { pub fn memBitSize(op: Op) u64 { return switch (op) { - .none, .moffs, .m, .sreg => unreachable, + .none, .m, .moffs, .sreg => unreachable, .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, .al, .cl, .r8, .ax, .dx, .ip, .r16, .eax, .eip, .r32, .rax, .rip, .r64 => unreachable, .st0, .st, .mm, .xmm0, .xmm, .ymm => unreachable, .cr, .dr => unreachable, - .m8, .rm8, .r32_m8, .xmm_m8 => 8, + .mrip8, .m8, .rm8, .r32_m8, .xmm_m8 => 8, .m16, .rm16, .r32_m16, .r64_m16, .xmm_m16 => 16, .m32, .rm32, .xmm_m32 => 32, .m64, .rm64, .mm_m64, .xmm_m64 => 64, @@ -783,7 +794,7 @@ pub const Op = enum { .rm8, .rm16, .rm32, .rm64, .r32_m8, .r32_m16, .r64_m16, .m8, .m16, .m32, .m64, .m80, .m128, .m256, - .m, + .m, .moffs, .mrip8, .mm_m64, .xmm_m8, .xmm_m16, .xmm_m32, .xmm_m64, .xmm_m128, .ymm_m256, @@ -821,11 +832,7 @@ pub const Op = enum { /// Given an operand `op` checks if `target` is a subset for the purposes of the encoding. pub fn isSubset(op: Op, target: Op) bool { switch (op) { - .moffs, .sreg => return op == target, - .none => switch (target) { - .none => return true, - else => return false, - }, + .none, .m, .moffs, .sreg => return op == target, else => { if (op.isRegister() and target.isRegister()) { return switch (target.toReg()) { @@ -836,6 +843,7 @@ pub const Op = enum { if (op.isMemory() and target.isMemory()) { switch (target) { .m => return true, + .moffs, .mrip8 => return op == target, else => return op.memBitSize() == target.memBitSize(), } } @@ -962,6 +970,10 @@ pub const Feature = enum { @"pclmul avx", pku, popcnt, + prefetch, + @"prefetchi 64bit", + prefetchwt1, + prfchw, rdrnd, rdseed, @"rdpid 32bit", @@ -1002,7 +1014,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op } const mnemonic_to_encodings_map = init: { - @setEvalBranchQuota(5_700); + @setEvalBranchQuota(5_800); const mnemonic_count = @typeInfo(Mnemonic).@"enum".fields.len; var mnemonic_map: [mnemonic_count][]Data = @splat(&.{}); const encodings = @import("encodings.zig"); diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 2d49bc365f..d7f16695eb 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -34,8 +34,18 @@ pub const Inst = struct { /// ___ 4 _4, + /// ___ With NTA Hint + _nta, /// System Call ___ sys_, + /// ___ With T0 Hint + _t0, + /// ___ With T1 Hint + _t1, + /// ___ With T2 Hint + _t2, + /// ___ With Intent to Write and T1 Hint + _wt1, /// ___ crement Shadow Stack Pointer Doubleword _csspd, @@ -198,6 +208,7 @@ pub const Inst = struct { //_b, /// ___ Word /// ___ For Writing + /// ___ With Intent to Write _w, /// ___ Doubleword //_d, @@ -975,6 +986,9 @@ pub const Inst = struct { /// Move unaligned packed single-precision floating-point values /// Move unaligned packed double-precision floating-point values movu, + /// Prefetch data into caches + /// Prefetch data into caches with intent to write + prefetch, /// Packed interleave shuffle of quadruplets of single-precision floating-point values /// Packed interleave shuffle of pairs of double-precision floating-point values /// Shuffle packed doublewords diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 2c11bb8ff0..2a6c1898af 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -1370,6 +1370,18 @@ pub const table = [_]Entry{ .{ .pmovmskb, .rm, &.{ .r32, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .none, .sse }, .{ .pmovmskb, .rm, &.{ .r64, .xmm }, &.{ 0x66, 0x0f, 0xd7 }, 0, .none, .sse }, + .{ .prefetchit0, .m, &.{ .mrip8 }, &.{ 0x0f, 0x18 }, 7, .none, .@"prefetchi 64bit" }, + .{ .prefetchit1, .m, &.{ .mrip8 }, &.{ 0x0f, 0x18 }, 6, .none, .@"prefetchi 64bit" }, + + .{ .prefetchnta, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 0, .none, .prefetch }, + .{ .prefetcht0, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 1, .none, .prefetch }, + .{ .prefetcht1, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 2, .none, .prefetch }, + .{ .prefetcht2, .m, &.{ .m8 }, &.{ 0x0f, 0x18 }, 3, .none, .prefetch }, + + .{ .prefetchw, .m, &.{ .m8 }, &.{ 0x0f, 0x0d }, 1, .none, .prfchw }, + + .{ .prefetchwt1, .m, &.{ .m8 }, &.{ 0x0f, 0x0d }, 2, .none, .prefetchwt1 }, + .{ .shufps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .none, .sse }, .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse },