x86_64: implement f16 conversions when supported

This commit is contained in:
Jacob Young 2023-05-05 01:32:39 -04:00
parent 1a261917ce
commit 32ab930f1d
7 changed files with 1081 additions and 857 deletions

View File

@ -2172,12 +2172,9 @@ fn airRetPtr(self: *Self, inst: Air.Inst.Index) !void {
fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
const ty_op = self.air.instructions.items(.data)[inst].ty_op;
const dst_ty = self.air.typeOfIndex(inst);
const dst_bits = dst_ty.floatBits(self.target.*);
const src_ty = self.air.typeOf(ty_op.operand);
if (dst_ty.floatBits(self.target.*) != 32 or src_ty.floatBits(self.target.*) != 64 or
!Target.x86.featureSetHas(self.target.cpu.features, .sse2))
return self.fail("TODO implement airFptrunc from {} to {}", .{
src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
});
const src_bits = src_ty.floatBits(self.target.*);
const src_mcv = try self.resolveInst(ty_op.operand);
const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
@ -2187,19 +2184,32 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
const dst_lock = self.register_manager.lockReg(dst_mcv.register);
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
try self.genBinOpMir(.cvtsd2ss, src_ty, dst_mcv, src_mcv);
if (src_bits == 32 and dst_bits == 16 and self.hasFeature(.f16c))
try self.asmRegisterRegisterImmediate(
.vcvtps2ph,
dst_mcv.register,
if (src_mcv.isRegister()) src_mcv.getReg().? else src_reg: {
const src_reg = dst_mcv.register;
try self.genSetReg(src_reg, src_ty, src_mcv);
break :src_reg src_reg;
},
Immediate.u(0b1_00),
)
else if (src_bits == 64 and dst_bits == 32)
try self.genBinOpMir(.cvtsd2ss, src_ty, dst_mcv, src_mcv)
else
return self.fail("TODO implement airFptrunc from {} to {}", .{
src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
});
return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
}
fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
const ty_op = self.air.instructions.items(.data)[inst].ty_op;
const dst_ty = self.air.typeOfIndex(inst);
const dst_bits = dst_ty.floatBits(self.target.*);
const src_ty = self.air.typeOf(ty_op.operand);
if (dst_ty.floatBits(self.target.*) != 64 or src_ty.floatBits(self.target.*) != 32 or
!Target.x86.featureSetHas(self.target.cpu.features, .sse2))
return self.fail("TODO implement airFpext from {} to {}", .{
src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
});
const src_bits = src_ty.floatBits(self.target.*);
const src_mcv = try self.resolveInst(ty_op.operand);
const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv))
@ -2209,7 +2219,19 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void {
const dst_lock = self.register_manager.lockReg(dst_mcv.register);
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
try self.genBinOpMir(.cvtss2sd, src_ty, dst_mcv, src_mcv);
try self.genBinOpMir(
if (src_bits == 16 and dst_bits == 32 and self.hasFeature(.f16c))
.vcvtph2ps
else if (src_bits == 32 and dst_bits == 64)
.cvtss2sd
else
return self.fail("TODO implement airFpext from {} to {}", .{
src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?),
}),
src_ty,
dst_mcv,
src_mcv,
);
return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none });
}
@ -3802,7 +3824,7 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void {
defer self.register_manager.unlockReg(dst_lock);
const src_bits = src_ty.bitSize(self.target.*);
if (Target.x86.featureSetHas(self.target.cpu.features, .lzcnt)) {
if (self.hasFeature(.lzcnt)) {
if (src_bits <= 64) {
try self.genBinOpMir(.lzcnt, src_ty, dst_mcv, mat_src_mcv);
@ -3888,7 +3910,7 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void {
const dst_lock = self.register_manager.lockReg(dst_reg);
defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
if (Target.x86.featureSetHas(self.target.cpu.features, .bmi)) {
if (self.hasFeature(.bmi)) {
if (src_bits <= 64) {
const extra_bits = self.regExtraBits(src_ty);
const masked_mcv = if (extra_bits > 0) masked: {
@ -3956,7 +3978,7 @@ fn airPopcount(self: *Self, inst: Air.Inst.Index) !void {
const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*));
const src_mcv = try self.resolveInst(ty_op.operand);
if (Target.x86.featureSetHas(self.target.cpu.features, .popcnt)) {
if (self.hasFeature(.popcnt)) {
const mat_src_mcv = switch (src_mcv) {
.immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) },
else => src_mcv,
@ -4309,7 +4331,7 @@ fn airRound(self: *Self, inst: Air.Inst.Index, mode: Immediate) !void {
const un_op = self.air.instructions.items(.data)[inst].un_op;
const ty = self.air.typeOf(un_op);
if (!Target.x86.featureSetHas(self.target.cpu.features, .sse4_1))
if (!self.hasFeature(.sse4_1))
return self.fail("TODO implement airRound without sse4_1 feature", .{});
const src_mcv = try self.resolveInst(un_op);
@ -5712,7 +5734,7 @@ fn genBinOp(
=> {},
.div_trunc,
.div_floor,
=> if (Target.x86.featureSetHas(self.target.cpu.features, .sse4_1)) {
=> if (self.hasFeature(.sse4_1)) {
const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*));
const dst_alias = registerAlias(dst_mcv.register, abi_size);
try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) {
@ -9593,3 +9615,13 @@ fn regBitSize(self: *Self, ty: Type) u64 {
/// Returns the difference between `regBitSize(ty)` and the bit size of `ty`
/// on the current target.
fn regExtraBits(self: *Self, ty: Type) u64 {
    const reg_bits = self.regBitSize(ty);
    const ty_bits = ty.bitSize(self.target.*);
    return reg_bits - ty_bits;
}
/// Reports whether the target CPU's feature set contains `feature`.
fn hasFeature(self: *Self, feature: Target.x86.Feature) bool {
    const cpu_features = self.target.cpu.features;
    return Target.x86.featureSetHas(cpu_features, feature);
}
/// Forwards `features` to `Target.x86.featureSetHasAny` together with the
/// target CPU's feature set and returns its result.
fn hasAnyFeatures(self: *Self, features: anytype) bool {
    const cpu_features = self.target.cpu.features;
    return Target.x86.featureSetHasAny(cpu_features, features);
}
/// Forwards `features` to `Target.x86.featureSetHasAll` together with the
/// target CPU's feature set and returns its result.
fn hasAllFeatures(self: *Self, features: anytype) bool {
    const cpu_features = self.target.cpu.features;
    return Target.x86.featureSetHasAll(cpu_features, features);
}

View File

@ -23,6 +23,7 @@ const Data = struct {
opc: [7]u8,
modrm_ext: u3,
mode: Mode,
feature: Feature,
};
pub fn findByMnemonic(
@ -58,7 +59,7 @@ pub fn findByMnemonic(
next: for (mnemonic_to_encodings_map[@enumToInt(mnemonic)]) |data| {
switch (data.mode) {
.rex => if (!rex_required) continue,
.long, .sse_long, .sse2_long => {},
.long => {},
else => if (rex_required) continue,
}
for (input_ops, data.ops) |input_op, data_op|
@ -136,22 +137,20 @@ pub fn modRmExt(encoding: Encoding) u3 {
}
pub fn operandBitSize(encoding: Encoding) u64 {
switch (encoding.data.mode) {
.short => return 16,
.long, .sse_long, .sse2_long => return 64,
else => {},
}
const bit_size: u64 = switch (encoding.data.op_en) {
.np => switch (encoding.data.ops[0]) {
.o16 => 16,
.o32 => 32,
.o64 => 64,
else => 32,
return switch (encoding.data.mode) {
.short => 16,
.long => 64,
else => switch (encoding.data.op_en) {
.np => switch (encoding.data.ops[0]) {
.o16 => 16,
.o32 => 32,
.o64 => 64,
else => 32,
},
.td => encoding.data.ops[1].bitSize(),
else => encoding.data.ops[0].bitSize(),
},
.td => encoding.data.ops[1].bitSize(),
else => encoding.data.ops[0].bitSize(),
};
return bit_size;
}
pub fn format(
@ -162,12 +161,50 @@ pub fn format(
) !void {
_ = options;
_ = fmt;
var opc = encoding.opcode();
switch (encoding.data.mode) {
.long, .sse_long, .sse2_long => try writer.writeAll("REX.W + "),
else => {},
.long => try writer.writeAll("REX.W + "),
.vex_128, .vex_128_long, .vex_256, .vex_256_long => {
try writer.writeAll("VEX.");
switch (encoding.data.mode) {
.vex_128, .vex_128_long => try writer.writeAll("128"),
.vex_256, .vex_256_long => try writer.writeAll("256"),
else => unreachable,
}
switch (opc[0]) {
else => {},
0x66, 0xf3, 0xf2 => {
try writer.print(".{X:0>2}", .{opc[0]});
opc = opc[1..];
},
}
try writer.print(".{X:0>2}", .{opc[0]});
opc = opc[1..];
switch (opc[0]) {
else => {},
0x38, 0x3A => {
try writer.print("{X:0>2}", .{opc[0]});
opc = opc[1..];
},
}
try writer.writeByte('.');
try writer.writeAll(switch (encoding.data.mode) {
.vex_128, .vex_256 => "W0",
.vex_128_long, .vex_256_long => "W1",
else => unreachable,
});
try writer.writeByte(' ');
},
}
for (encoding.opcode()) |byte| {
for (opc) |byte| {
try writer.print("{x:0>2} ", .{byte});
}
@ -184,15 +221,16 @@ pub fn format(
try writer.print("+{s} ", .{tag});
},
.m, .mi, .m1, .mc => try writer.print("/{d} ", .{encoding.modRmExt()}),
.mr, .rm, .rmi, .mri, .mrc => try writer.writeAll("/r "),
.mr, .rm, .rmi, .mri, .mrc, .rrm, .rrmi => try writer.writeAll("/r "),
}
switch (encoding.data.op_en) {
.i, .d, .zi, .oi, .mi, .rmi, .mri => {
.i, .d, .zi, .oi, .mi, .rmi, .mri, .rrmi => {
const op = switch (encoding.data.op_en) {
.i, .d => encoding.data.ops[0],
.zi, .oi, .mi => encoding.data.ops[1],
.rmi, .mri => encoding.data.ops[2],
.rrmi => encoding.data.ops[3],
else => unreachable,
};
const tag = switch (op) {
@ -207,7 +245,7 @@ pub fn format(
};
try writer.print("{s} ", .{tag});
},
.np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc => {},
.np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rrm => {},
}
try writer.print("{s} ", .{@tagName(encoding.mnemonic)});
@ -305,6 +343,8 @@ pub const Mnemonic = enum {
// SSE4.1
roundss,
roundsd,
// F16C
vcvtph2ps, vcvtps2ph,
// zig fmt: on
};
@ -317,6 +357,7 @@ pub const OpEn = enum {
fd, td,
m1, mc, mi, mr, rm,
rmi, mri, mrc,
rrm, rrmi,
// zig fmt: on
};
@ -549,14 +590,21 @@ pub const Op = enum {
pub const Mode = enum {
none,
short,
fpu,
rex,
long,
vex_128,
vex_128_long,
vex_256,
vex_256_long,
};
pub const Feature = enum {
none,
f16c,
sse,
sse_long,
sse2,
sse2_long,
sse4_1,
x87,
};
fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Operand) usize {
@ -593,6 +641,7 @@ const mnemonic_to_encodings_map = init: {
.opc = undefined,
.modrm_ext = entry[4],
.mode = entry[5],
.feature = entry[6],
};
// TODO: use `@memcpy` for these. When I did that, I got a false positive
// compile error for this copy happening at compile time.

View File

@ -133,6 +133,9 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction {
.subsd,
.ucomisd,
.xorpd,
.vcvtph2ps,
.vcvtps2ph,
=> try lower.mirGeneric(inst),
.cmps,

View File

@ -247,6 +247,11 @@ pub const Inst = struct {
/// Bitwise logical xor of packed double precision floating-point values
xorpd,
/// Convert 16-bit floating-point values to single-precision floating-point values
vcvtph2ps,
/// Convert single-precision floating-point values to 16-bit floating-point values
vcvtps2ph,
/// Compare string operands
cmps,
/// Load string

View File

@ -209,10 +209,19 @@ pub const Instruction = struct {
const enc = inst.encoding;
const data = enc.data;
try inst.encodeLegacyPrefixes(encoder);
try inst.encodeMandatoryPrefix(encoder);
try inst.encodeRexPrefix(encoder);
try inst.encodeOpcode(encoder);
switch (data.mode) {
.none, .short, .rex, .long => {
try inst.encodeLegacyPrefixes(encoder);
try inst.encodeMandatoryPrefix(encoder);
try inst.encodeRexPrefix(encoder);
try inst.encodeOpcode(encoder);
},
.vex_128, .vex_128_long, .vex_256, .vex_256_long => {
try inst.encodeVexPrefix(encoder);
const opc = inst.encoding.opcode();
try encoder.opcode_1byte(opc[opc.len - 1]);
},
}
switch (data.op_en) {
.np, .o => {},
@ -309,6 +318,7 @@ pub const Instruction = struct {
}
else
null,
.rrm, .rrmi => unreachable,
};
if (segment_override) |seg| {
legacy.setSegmentOverride(seg);
@ -322,10 +332,7 @@ pub const Instruction = struct {
var rex = Rex{};
rex.present = inst.encoding.data.mode == .rex;
switch (inst.encoding.data.mode) {
.long, .sse_long, .sse2_long => rex.w = true,
else => {},
}
rex.w = inst.encoding.data.mode == .long;
switch (op_en) {
.np, .i, .zi, .fd, .td, .d => {},
@ -346,11 +353,76 @@ pub const Instruction = struct {
rex.b = b_x_op.isBaseExtended();
rex.x = b_x_op.isIndexExtended();
},
.rrm, .rrmi => unreachable,
}
try encoder.rex(rex);
}
/// Fills in a `Vex` value from the instruction's encoding (mode, operand
/// encoding, mandatory prefix, opcode bytes) and its operands, then emits it
/// through `encoder.vex`.
///
/// Must only be called for encodings whose mode is one of the `vex_*` modes;
/// any other mode hits `unreachable`.
fn encodeVexPrefix(inst: Instruction, encoder: anytype) !void {
    const enc_data = inst.encoding.data;
    const op_en = enc_data.op_en;
    const opcode_bytes = inst.encoding.opcode();
    const mandatory = inst.encoding.mandatoryPrefix();

    var fields = Vex{};

    // The mode determines both W (the `_long` variants) and L (the 256 variants).
    switch (enc_data.mode) {
        .vex_128 => {
            fields.w = false;
            fields.l = false;
        },
        .vex_128_long => {
            fields.w = true;
            fields.l = false;
        },
        .vex_256 => {
            fields.w = false;
            fields.l = true;
        },
        .vex_256_long => {
            fields.w = true;
            fields.l = true;
        },
        else => unreachable,
    }

    // Extension bits come from whichever operands the operand encoding
    // places in the reg and r/m positions.
    switch (op_en) {
        .np, .i, .zi, .fd, .td, .d => {},
        .o, .oi => fields.b = inst.ops[0].reg.isExtended(),
        .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rrm, .rrmi => {
            const reg_operand = switch (op_en) {
                .rm, .rmi, .rrm, .rrmi => inst.ops[0],
                .mr, .mri, .mrc => inst.ops[1],
                else => .none,
            };
            const rm_operand = switch (op_en) {
                .rm, .rmi => inst.ops[1],
                .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0],
                .rrm, .rrmi => inst.ops[2],
                else => unreachable,
            };
            fields.r = reg_operand.isBaseExtended();
            fields.b = rm_operand.isBaseExtended();
            fields.x = rm_operand.isIndexExtended();
        },
    }

    // The mandatory prefix byte, when present, is carried in the `p` field.
    fields.p = if (mandatory) |prefix_byte| switch (prefix_byte) {
        0x66 => .@"66",
        0xf3 => .f3,
        0xf2 => .f2,
        else => unreachable,
    } else .none;

    // The byte after the optional mandatory prefix must be 0x0f; the byte
    // following it selects the `m` field (0x38 / 0x3a, otherwise plain 0f).
    const escape_index: usize = @boolToInt(mandatory != null);
    assert(opcode_bytes[escape_index] == 0x0f);
    fields.m = switch (opcode_bytes[escape_index + 1]) {
        0x38 => .@"0f38",
        0x3a => .@"0f3a",
        else => .@"0f",
    };

    // Only the three-register forms supply a register for the `v` field.
    if (op_en == .rrm or op_en == .rrmi) fields.v = inst.ops[1].reg;

    try encoder.vex(fields);
}
fn encodeMandatoryPrefix(inst: Instruction, encoder: anytype) !void {
const prefix = inst.encoding.mandatoryPrefix() orelse return;
try encoder.opcode_1byte(prefix);
@ -562,17 +634,48 @@ fn Encoder(comptime T: type, comptime opts: Options) type {
/// or one of reg, index, r/m, base, or opcode-reg might be extended.
///
/// See struct `Rex` for a description of each field.
pub fn rex(self: Self, byte: Rex) !void {
if (!byte.present and !byte.isSet()) return;
pub fn rex(self: Self, fields: Rex) !void {
if (!fields.present and !fields.isSet()) return;
var value: u8 = 0b0100_0000;
var byte: u8 = 0b0100_0000;
if (byte.w) value |= 0b1000;
if (byte.r) value |= 0b0100;
if (byte.x) value |= 0b0010;
if (byte.b) value |= 0b0001;
if (fields.w) byte |= 0b1000;
if (fields.r) byte |= 0b0100;
if (fields.x) byte |= 0b0010;
if (fields.b) byte |= 0b0001;
try self.writer.writeByte(value);
try self.writer.writeByte(byte);
}
/// Writes a VEX prefix assembled from `fields`.
///
/// When `fields.is3Byte()` is true the three-byte form (first byte
/// 0b1100_0100) is written; otherwise the two-byte form (first byte
/// 0b1100_0101) is written. The `r`, `x`, `b` and `v` fields are stored
/// bit-inverted.
pub fn vex(self: Self, fields: Vex) !void {
    // Bit groups that appear in both forms.
    const inverted_r = @as(u8, ~@boolToInt(fields.r)) << 7;
    const vvvv_l_pp = @as(u8, ~fields.v.enc()) << 3 |
        @as(u8, @boolToInt(fields.l)) << 2 |
        @as(u8, @enumToInt(fields.p)) << 0;

    if (!fields.is3Byte()) {
        try self.writer.writeByte(0b1100_0101);
        try self.writer.writeByte(inverted_r | vvvv_l_pp);
        return;
    }

    const inverted_x = @as(u8, ~@boolToInt(fields.x)) << 6;
    const inverted_b = @as(u8, ~@boolToInt(fields.b)) << 5;
    const map_select = @as(u8, @enumToInt(fields.m)) << 0;
    const w_bit = @as(u8, @boolToInt(fields.w)) << 7;

    try self.writer.writeByte(0b1100_0100);
    try self.writer.writeByte(inverted_r | inverted_x | inverted_b | map_select);
    try self.writer.writeByte(w_bit | vvvv_l_pp);
}
// ------
@ -848,6 +951,31 @@ pub const Rex = struct {
}
};
/// Field values from which the encoder's `vex` method builds a VEX prefix.
pub const Vex = struct {
    /// Written as-is into bit 7 of the last byte of the three-byte form.
    w: bool = false,
    /// Written inverted into bit 7 of the second byte.
    r: bool = false,
    /// Written inverted into bit 6 of the second byte (three-byte form only).
    x: bool = false,
    /// Written inverted into bit 5 of the second byte (three-byte form only).
    b: bool = false,
    /// Written as-is into bit 2 of the last byte.
    l: bool = false,
    /// Two-bit selector written into the low bits of the last byte.
    p: enum(u2) {
        none = 0b00,
        @"66" = 0b01,
        f3 = 0b10,
        f2 = 0b11,
    } = .none,
    /// Five-bit selector written into the low bits of the second byte of the
    /// three-byte form.
    m: enum(u5) {
        @"0f" = 0b0_0001,
        @"0f38" = 0b0_0010,
        @"0f3a" = 0b0_0011,
        _,
    } = .@"0f",
    /// Register whose encoding is written inverted into bits 6..3 of the
    /// last byte.
    v: Register = .ymm0,

    /// Returns true when any of `w`, `x` or `b` is set, or when `m` is
    /// anything other than `.@"0f"`.
    pub fn is3Byte(vex: Vex) bool {
        if (vex.w or vex.x or vex.b) return true;
        return vex.m != .@"0f";
    }
};
// Tests
fn expectEqualHexStrings(expected: []const u8, given: []const u8, assembly: []const u8) !void {
assert(expected.len > 0);

File diff suppressed because it is too large Load Diff

View File

@ -168,7 +168,8 @@ test "array to vector" {
test "array to vector with element type coercion" {
if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_x86_64 and
!comptime std.Target.x86.featureSetHas(builtin.cpu.features, .f16c)) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO