x86_64: implement some float and float vector movement

This allows values of these supported types to actually be stored in
registers, instead of restricting them to stack slots.
Jacob Young, 2023-05-06 22:27:39 -04:00
parent 3a5e3c52e0, commit cba195c117
8 changed files with 176 additions and 43 deletions
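
For context, the kind of code this change affects (an illustrative sample, not part of the commit): a float or small float-vector value can now live in an XMM register, or a YMM register when AVX is available, instead of bouncing through a stack slot between operations.

export fn addVec(a: @Vector(4, f32), b: @Vector(4, f32)) @Vector(4, f32) {
    // Operands and result fit in 128-bit XMM registers, so the self-hosted
    // x86_64 backend no longer has to round-trip them through memory.
    return a + b;
}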


@@ -2008,6 +2008,11 @@ fn computeFrameLayout(self: *Self) !FrameLayout {
     };
 }
 
+fn getFrameAddrAlignment(self: *Self, frame_addr: FrameAddr) u32 {
+    const alloc_align = @as(u32, 1) << self.frame_allocs.get(@enumToInt(frame_addr.index)).abi_align;
+    return @min(alloc_align, @bitCast(u32, frame_addr.off) & (alloc_align - 1));
+}
+
 fn allocFrameIndex(self: *Self, alloc: FrameAlloc) !FrameIndex {
     const frame_allocs_slice = self.frame_allocs.slice();
     const frame_size = frame_allocs_slice.items(.abi_size);
@@ -2051,24 +2056,36 @@ fn allocTempRegOrMem(self: *Self, elem_ty: Type, reg_ok: bool) !MCValue {
     return self.allocRegOrMemAdvanced(elem_ty, null, reg_ok);
 }
 
-fn allocRegOrMemAdvanced(self: *Self, elem_ty: Type, inst: ?Air.Inst.Index, reg_ok: bool) !MCValue {
-    const abi_size = math.cast(u32, elem_ty.abiSize(self.target.*)) orelse {
+fn allocRegOrMemAdvanced(self: *Self, ty: Type, inst: ?Air.Inst.Index, reg_ok: bool) !MCValue {
+    const abi_size = math.cast(u32, ty.abiSize(self.target.*)) orelse {
         const mod = self.bin_file.options.module.?;
-        return self.fail("type '{}' too big to fit into stack frame", .{elem_ty.fmt(mod)});
+        return self.fail("type '{}' too big to fit into stack frame", .{ty.fmt(mod)});
     };
 
-    if (reg_ok) {
-        // Make sure the type can fit in a register before we try to allocate one.
-        const ptr_bits = self.target.cpu.arch.ptrBitWidth();
-        const ptr_bytes: u64 = @divExact(ptr_bits, 8);
-        if (abi_size <= ptr_bytes) {
-            if (self.register_manager.tryAllocReg(inst, regClassForType(elem_ty))) |reg| {
+    if (reg_ok) need_mem: {
+        if (abi_size <= @as(u32, switch (ty.zigTypeTag()) {
+            .Float => switch (ty.floatBits(self.target.*)) {
+                16, 32, 64, 128 => 16,
+                80 => break :need_mem,
+                else => unreachable,
+            },
+            .Vector => switch (ty.childType().zigTypeTag()) {
+                .Float => switch (ty.childType().floatBits(self.target.*)) {
+                    16, 32, 64 => if (self.hasFeature(.avx)) 32 else 16,
+                    80, 128 => break :need_mem,
+                    else => unreachable,
+                },
+                else => break :need_mem,
+            },
+            else => 8,
+        })) {
+            if (self.register_manager.tryAllocReg(inst, regClassForType(ty))) |reg| {
                 return MCValue{ .register = registerAlias(reg, abi_size) };
             }
         }
     }
 
-    const frame_index = try self.allocFrameIndex(FrameAlloc.initType(elem_ty, self.target.*));
+    const frame_index = try self.allocFrameIndex(FrameAlloc.initType(ty, self.target.*));
     return .{ .load_frame = .{ .index = frame_index } };
 }
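
The new `need_mem:` block replaces the old pointer-width check with a per-type register size limit: scalar floats of 16/32/64/128 bits fit a 16-byte XMM register, f80 always goes to memory (it uses the 80-bit x87 format), float vectors may use 32 bytes when AVX provides YMM registers, and every other type keeps the 8-byte general-purpose-register limit. A standalone sketch of the same rule (hypothetical helper, not the compiler's API):

/// Largest ABI size of `ty` that may be placed in a register, or null if the
/// value must stay in memory. Mirrors the switch in the hunk above.
fn maxRegSize(comptime ty: type, has_avx: bool) ?u32 {
    return switch (@typeInfo(ty)) {
        .Float => |info| switch (info.bits) {
            16, 32, 64, 128 => 16, // fits an XMM register
            80 => null, // f80 stays in a stack slot
            else => unreachable,
        },
        .Vector => |info| switch (@typeInfo(info.child)) {
            .Float => |child| switch (child.bits) {
                16, 32, 64 => if (has_avx) @as(u32, 32) else 16, // YMM with AVX
                80, 128 => null,
                else => unreachable,
            },
            else => null, // non-float vectors still go to memory here
        },
        else => 8, // general-purpose register width
    };
}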
@@ -4442,12 +4459,19 @@ fn airRound(self: *Self, inst: Air.Inst.Index, mode: Immediate) !void {
         }),
     };
     assert(dst_mcv.isRegister());
+    const abi_size = @intCast(u32, ty.abiSize(self.target.*));
+    const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
     if (src_mcv.isRegister())
-        try self.asmRegisterRegisterImmediate(mir_tag, dst_mcv.getReg().?, src_mcv.getReg().?, mode)
+        try self.asmRegisterRegisterImmediate(
+            mir_tag,
+            dst_reg,
+            registerAlias(src_mcv.getReg().?, abi_size),
+            mode,
+        )
     else
         try self.asmRegisterMemoryImmediate(
             mir_tag,
-            dst_mcv.getReg().?,
+            dst_reg,
             src_mcv.mem(Memory.PtrSize.fromSize(@intCast(u32, ty.abiSize(self.target.*)))),
             mode,
         );
@@ -7847,19 +7871,43 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
     return self.finishAirResult(inst, result);
 }
 
-fn movMirTag(self: *Self, ty: Type) !Mir.Inst.Tag {
-    return switch (ty.zigTypeTag()) {
-        else => .mov,
+fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag {
+    switch (ty.zigTypeTag()) {
+        else => return .mov,
         .Float => switch (ty.floatBits(self.target.*)) {
             16 => unreachable, // needs special handling
-            32 => .movss,
-            64 => .movsd,
-            128 => .movaps,
-            else => return self.fail("TODO movMirTag from {}", .{
-                ty.fmt(self.bin_file.options.module.?),
-            }),
+            32 => return if (self.hasFeature(.avx)) .vmovss else .movss,
+            64 => return if (self.hasFeature(.avx)) .vmovsd else .movsd,
+            128 => return if (self.hasFeature(.avx))
+                if (aligned) .vmovaps else .vmovups
+            else if (aligned) .movaps else .movups,
+            else => {},
         },
-    };
+        .Vector => switch (ty.childType().zigTypeTag()) {
+            .Float => switch (ty.childType().floatBits(self.target.*)) {
+                16 => unreachable, // needs special handling
+                32 => switch (ty.vectorLen()) {
+                    1 => return if (self.hasFeature(.avx)) .vmovss else .movss,
+                    2...4 => return if (self.hasFeature(.avx))
+                        if (aligned) .vmovaps else .vmovups
+                    else if (aligned) .movaps else .movups,
+                    5...8 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups,
+                    else => {},
+                },
+                64 => switch (ty.vectorLen()) {
+                    1 => return if (self.hasFeature(.avx)) .vmovsd else .movsd,
+                    2 => return if (self.hasFeature(.avx))
+                        if (aligned) .vmovaps else .vmovups
+                    else if (aligned) .movaps else .movups,
+                    3...4 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups,
+                    else => {},
+                },
+                else => {},
+            },
+            else => {},
+        },
+    }
+    return self.fail("TODO movMirTag for {}", .{ty.fmt(self.bin_file.options.module.?)});
 }
 
 fn genCopy(self: *Self, ty: Type, dst_mcv: MCValue, src_mcv: MCValue) InnerError!void {
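
`movMirTag` now selects the move mnemonic from both the type and whether the address involved is known to be suitably aligned. Summarized as comments (this block is illustrative, derived from the switch above):

// f16 scalars             -> unreachable here; they need special handling
// f32 scalars             -> movss, or vmovss with AVX
// f64 scalars             -> movsd, or vmovsd with AVX
// f128, 16-byte vectors   -> movaps if aligned, else movups
//                            (vmovaps/vmovups with AVX)
// 32-byte float vectors   -> vmovaps/vmovups, AVX only
// anything unhandled      -> "TODO movMirTag" compile error for now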
@@ -8016,7 +8064,11 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
                0 => return self.genSetReg(dst_reg, ty, .{ .register = reg_off.reg }),
                else => .lea,
            },
-           .indirect, .load_frame => try self.movMirTag(ty),
+           .indirect => try self.movMirTag(ty, false),
+           .load_frame => |frame_addr| try self.movMirTag(
+               ty,
+               self.getFrameAddrAlignment(frame_addr) >= ty.abiAlignment(self.target.*),
+           ),
            .lea_frame => .lea,
            else => unreachable,
        },
@@ -8040,7 +8092,11 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
            )
        else
            self.asmRegisterMemory(
-               try self.movMirTag(ty),
+               try self.movMirTag(ty, mem.isAlignedGeneric(
+                   u32,
+                   @bitCast(u32, small_addr),
+                   ty.abiAlignment(self.target.*),
+               )),
                registerAlias(dst_reg, abi_size),
                src_mem,
            );
@@ -8080,7 +8136,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr
            )
        else
            try self.asmRegisterMemory(
-               try self.movMirTag(ty),
+               try self.movMirTag(ty, false),
                registerAlias(dst_reg, abi_size),
                src_mem,
            );
@@ -8194,7 +8250,24 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal
            )
        else
            try self.asmMemoryRegister(
-               try self.movMirTag(ty),
+               try self.movMirTag(ty, switch (base) {
+                   .none => mem.isAlignedGeneric(
+                       u32,
+                       @bitCast(u32, disp),
+                       ty.abiAlignment(self.target.*),
+                   ),
+                   .reg => |reg| switch (reg) {
+                       .es, .cs, .ss, .ds => mem.isAlignedGeneric(
+                           u32,
+                           @bitCast(u32, disp),
+                           ty.abiAlignment(self.target.*),
+                       ),
+                       else => false,
+                   },
+                   .frame => |frame_index| self.getFrameAddrAlignment(
+                       .{ .index = frame_index, .off = disp },
+                   ) >= ty.abiAlignment(self.target.*),
+               }),
                dst_mem,
                registerAlias(src_reg, abi_size),
            );
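
Whether the aligned form may be used is decided per addressing base: a frame address compares `getFrameAddrAlignment` against the type's ABI alignment, an absolute or segment-register-relative displacement is tested directly with `mem.isAlignedGeneric`, and an address in an arbitrary register is conservatively treated as unaligned. A small standalone check of the displacement case:

const std = @import("std");

test "alignment of a constant displacement" {
    // A copy of a 16-byte-aligned type to [ds:0x1010] may use movaps;
    // a copy to [ds:0x1014] must fall back to movups.
    try std.testing.expect(std.mem.isAlignedGeneric(u32, 0x1010, 16));
    try std.testing.expect(!std.mem.isAlignedGeneric(u32, 0x1014, 16));
}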
@@ -8415,7 +8488,7 @@ fn airBitCast(self: *Self, inst: Air.Inst.Index) !void {
        defer if (operand_lock) |lock| self.register_manager.unlockReg(lock);
 
        const dest = try self.allocRegOrMem(inst, true);
-       try self.genCopy(self.air.typeOfIndex(inst), dest, operand);
+       try self.genCopy(if (!dest.isMemory() or operand.isMemory()) dst_ty else src_ty, dest, operand);
        break :result dest;
    };
    return self.finishAir(inst, result, .{ ty_op.operand, .none, .none });


@@ -206,7 +206,7 @@ pub fn format(
            try writer.print("+{s} ", .{tag});
        },
        .m, .mi, .m1, .mc, .vmi => try writer.print("/{d} ", .{encoding.modRmExt()}),
-       .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi => try writer.writeAll("/r "),
+       .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi, .mvr => try writer.writeAll("/r "),
    }
 
    switch (encoding.data.op_en) {
@@ -230,7 +230,7 @@ pub fn format(
            };
            try writer.print("{s} ", .{tag});
        },
-       .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm => {},
+       .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm, .mvr => {},
    }
 
    try writer.print("{s} ", .{@tagName(encoding.mnemonic)});
@@ -332,7 +332,12 @@ pub const Mnemonic = enum {
    // SSE4.1
    roundsd, roundss,
    // AVX
-   vmovddup, vmovshdup, vmovsldup,
+   vmovapd, vmovaps,
+   vmovddup,
+   vmovsd,
+   vmovshdup, vmovsldup,
+   vmovss,
+   vmovupd, vmovups,
    vpextrw, vpinsrw,
    vpshufhw, vpshuflw,
    vpsrld, vpsrlq, vpsrlw,
@@ -357,7 +362,7 @@ pub const OpEn = enum {
    fd, td,
    m1, mc, mi, mr, rm,
    rmi, mri, mrc,
-   vmi, rvm, rvmi,
+   vmi, rvm, rvmi, mvr,
    // zig fmt: on
 };
@@ -549,9 +554,10 @@ pub const Op = enum {
        return switch (op) {
            .rm8, .rm16, .rm32, .rm64,
            .r32_m16, .r64_m16,
-           .m8, .m16, .m32, .m64, .m80, .m128,
+           .m8, .m16, .m32, .m64, .m80, .m128, .m256,
            .m,
-           .xmm_m32, .xmm_m64, .xmm_m128, .ymm_m256,
+           .xmm_m32, .xmm_m64, .xmm_m128,
+           .ymm_m256,
            => true,
            else => false,
        };


@@ -184,9 +184,15 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
        .roundsd,
        .roundss,
+       .vmovapd,
+       .vmovaps,
        .vmovddup,
+       .vmovsd,
        .vmovshdup,
        .vmovsldup,
+       .vmovss,
+       .vmovupd,
+       .vmovups,
        .vpextrw,
        .vpinsrw,
        .vpshufhw,


@@ -282,12 +282,24 @@ pub const Inst = struct {
        /// Round scalar single-precision floating-point values
        roundss,
+       /// Move aligned packed double-precision floating-point values
+       vmovapd,
+       /// Move aligned packed single-precision floating-point values
+       vmovaps,
        /// Replicate double floating-point values
        vmovddup,
+       /// Move or merge scalar double-precision floating-point value
+       vmovsd,
        /// Replicate single floating-point values
        vmovshdup,
        /// Replicate single floating-point values
        vmovsldup,
+       /// Move or merge scalar single-precision floating-point value
+       vmovss,
+       /// Move unaligned packed double-precision floating-point values
+       vmovupd,
+       /// Move unaligned packed single-precision floating-point values
+       vmovups,
        /// Extract word
        vpextrw,
        /// Insert word


@@ -228,7 +228,7 @@ pub const Instruction = struct {
            .td => try encoder.imm64(inst.ops[0].mem.moffs.offset),
            else => {
                const mem_op = switch (data.op_en) {
-                   .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0],
+                   .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0],
                    .rm, .rmi, .vmi => inst.ops[1],
                    .rvm, .rvmi => inst.ops[2],
                    else => unreachable,
@@ -239,6 +239,7 @@ pub const Instruction = struct {
                    .m, .mi, .m1, .mc, .vmi => enc.modRmExt(),
                    .mr, .mri, .mrc => inst.ops[1].reg.lowEnc(),
                    .rm, .rmi, .rvm, .rvmi => inst.ops[0].reg.lowEnc(),
+                   .mvr => inst.ops[2].reg.lowEnc(),
                    else => unreachable,
                };
                try encoder.modRm_direct(rm, reg.lowEnc());
@@ -248,6 +249,7 @@ pub const Instruction = struct {
                    .m, .mi, .m1, .mc, .vmi => .none,
                    .mr, .mri, .mrc => inst.ops[1],
                    .rm, .rmi, .rvm, .rvmi => inst.ops[0],
+                   .mvr => inst.ops[2],
                    else => unreachable,
                };
                try encodeMemory(enc, mem, op, encoder);
@@ -315,7 +317,7 @@ pub const Instruction = struct {
                }
            else
                null,
-           .vmi, .rvm, .rvmi => unreachable,
+           .vmi, .rvm, .rvmi, .mvr => unreachable,
        };
        if (segment_override) |seg| {
            legacy.setSegmentOverride(seg);
@@ -350,7 +352,7 @@ pub const Instruction = struct {
                rex.b = b_x_op.isBaseExtended();
                rex.x = b_x_op.isIndexExtended();
            },
-           .vmi, .rvm, .rvmi => unreachable,
+           .vmi, .rvm, .rvmi, .mvr => unreachable,
        }
 
        try encoder.rex(rex);
@@ -372,10 +374,11 @@ pub const Instruction = struct {
        switch (op_en) {
            .np, .i, .zi, .fd, .td, .d => {},
            .o, .oi => vex.b = inst.ops[0].reg.isExtended(),
-           .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi => {
+           .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi, .mvr => {
                const r_op = switch (op_en) {
                    .rm, .rmi, .rvm, .rvmi => inst.ops[0],
                    .mr, .mri, .mrc => inst.ops[1],
+                   .mvr => inst.ops[2],
                    .m, .mi, .m1, .mc, .vmi => .none,
                    else => unreachable,
                };
@@ -383,7 +386,7 @@ pub const Instruction = struct {
                const b_x_op = switch (op_en) {
                    .rm, .rmi, .vmi => inst.ops[1],
-                   .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0],
+                   .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0],
                    .rvm, .rvmi => inst.ops[2],
                    else => unreachable,
                };


@@ -974,12 +974,42 @@ pub const table = [_]Entry{
    .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 },
    // AVX
+   .{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128, .avx },
+   .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128, .avx },
+   .{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256, .avx },
+   .{ .vmovapd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_256, .avx },
+   .{ .vmovaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .vex_128, .avx },
+   .{ .vmovaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .vex_128, .avx },
+   .{ .vmovaps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x28 }, 0, .vex_256, .avx },
+   .{ .vmovaps, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x29 }, 0, .vex_256, .avx },
    .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128, .avx },
+   .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_128, .avx },
+   .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_128, .avx },
+   .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_128, .avx },
+   .{ .vmovsd, .mr, &.{ .m64, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_128, .avx },
    .{ .vmovshdup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .vex_128, .avx },
    .{ .vmovsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .vex_128, .avx },
+   .{ .vmovss, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_128, .avx },
+   .{ .vmovss, .rm, &.{ .xmm, .m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_128, .avx },
+   .{ .vmovss, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_128, .avx },
+   .{ .vmovss, .mr, &.{ .m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_128, .avx },
+   .{ .vmovupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_128, .avx },
+   .{ .vmovupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_128, .avx },
+   .{ .vmovupd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_256, .avx },
+   .{ .vmovupd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_256, .avx },
+   .{ .vmovups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .vex_128, .avx },
+   .{ .vmovups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .vex_128, .avx },
+   .{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256, .avx },
+   .{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256, .avx },
    .{ .vpextrw, .mri, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128, .avx },
    .{ .vpextrw, .mri, &.{ .r64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_long, .avx },
    .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128, .avx },
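
The register-to-register forms of `vmovsd`/`vmovss` take three operands: with opcode 0x11 the destination is the ModRM r/m field, VEX.vvvv names the second operand, and ModRM.reg names the third, which is exactly what the new `.mvr` operand encoding expresses. A hand-assembled example for illustration (bytes worked out from the `.mvr` entry above, not produced by this commit):

// vmovsd xmm1, xmm2, xmm3 via the .mvr entry (F2 0F 11 /r):
//   0xc5  two-byte VEX prefix
//   0xeb  R=1, vvvv=~2 (xmm2), L=0 (128-bit), pp=0b11 (F2)
//   0x11  opcode
//   0xd9  ModRM: mod=0b11, reg=3 (xmm3), rm=1 (xmm1)
const vmovsd_xmm1_xmm2_xmm3 = [_]u8{ 0xc5, 0xeb, 0x11, 0xd9 };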


@@ -399,7 +399,8 @@ fn testBinaryNot128(comptime Type: type, x: Type) !void {
 test "division" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_x86_64 and
+        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO


@@ -2,9 +2,11 @@ const std = @import("std");
 const builtin = @import("builtin");
 const expect = std.testing.expect;
 
+const stage2_x86_64_without_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and
+    !std.Target.x86.featureSetHas(builtin.cpu.features, .fma);
+
 test "@mulAdd" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .fma)) return error.SkipZigTest; // TODO
+    if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -118,7 +120,7 @@ fn vector32() !void {
 test "vector f32" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -141,7 +143,7 @@ fn vector64() !void {
 test "vector f64" {
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
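
The behavior-test changes all follow one pattern: rather than skipping unconditionally on the self-hosted x86_64 backend, a test is now skipped only when the CPU lacks the feature the backend relies on (sse4_1 for division, fma for @mulAdd and the vector tests). The same guard works for any feature; an illustrative standalone test (the .avx guard here is an assumption, not part of the commit):

const std = @import("std");
const builtin = @import("builtin");

// Resolve the feature-dependent skip once, as muladd.zig now does for fma.
const stage2_x86_64_without_avx = builtin.zig_backend == .stage2_x86_64 and
    !std.Target.x86.featureSetHas(builtin.cpu.features, .avx);

test "vector add, skipped without AVX on the self-hosted backend" {
    if (stage2_x86_64_without_avx) return error.SkipZigTest;
    const a: @Vector(8, f32) = [_]f32{ 1, 2, 3, 4, 5, 6, 7, 8 };
    const b = a + a;
    try std.testing.expect(b[7] == 16);
}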