diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 9d5f877e14..b791ec5ecc 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -6520,6 +6520,57 @@ fn genBinOp( }, .Vector => switch (lhs_ty.childType().zigTypeTag()) { else => null, + .Int => switch (lhs_ty.childType().intInfo(self.target.*).bits) { + 8 => switch (lhs_ty.vectorLen()) { + 1...16 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_b, .add } else .{ .p_b, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_b, .sub } else .{ .p_b, .sub }, + else => null, + }, + else => null, + }, + 16 => switch (lhs_ty.vectorLen()) { + 1...8 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_w, .add } else .{ .p_w, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_w, .sub } else .{ .p_w, .sub }, + else => null, + }, + else => null, + }, + 32 => switch (lhs_ty.vectorLen()) { + 1...4 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_d, .add } else .{ .p_d, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_d, .sub } else .{ .p_d, .sub }, + else => null, + }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...2 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_q, .add } else .{ .p_q, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_q, .sub } else .{ .p_q, .sub }, + else => null, + }, + else => null, + }, + else => null, + }, .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { 16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) { 1 => { @@ -6812,7 +6863,7 @@ fn genBinOp( ); } switch (air_tag) { - .add, .sub, .mul, .div_float, .div_exact => {}, + .add, .addwrap, .sub, .subwrap, .mul, .mulwrap, .div_float, .div_exact => {}, .div_trunc, .div_floor => try self.genRound( lhs_ty, dst_reg, @@ -9043,14 +9094,33 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr .{ .register = try self.copyToTmpRegister(ty, src_mcv) }, ), .sse => try self.asmRegisterRegister( - switch (ty.scalarType().zigTypeTag()) { - else => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + if (@as(?Mir.Inst.FixedTag, switch (ty.scalarType().zigTypeTag()) { + else => switch (abi_size) { + 1...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, + 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, + 9...16 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + 17...32 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else null, + else => null, + }, .Float => switch (ty.floatBits(self.target.*)) { - else => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + 16, 128 => switch (abi_size) { + 2...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, + 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, + 9...16 => if (self.hasFeature(.avx)) + .{ .v_, .movdqa } + else + .{ ._, .movdqa }, + 17...32 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else null, + else => null, + }, 32 => if (self.hasFeature(.avx)) .{ .v_ps, .mova } else .{ ._ps, .mova }, 64 => if (self.hasFeature(.avx)) .{ .v_pd, .mova } else .{ ._pd, .mova }, + 80 => null, + else => unreachable, }, - }, + })) |tag| tag else return self.fail("TODO implement genSetReg for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), registerAlias(dst_reg, abi_size), registerAlias(src_reg, abi_size), ), diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 4014947673..c8919d062d 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -262,7 +262,9 @@ pub const Mnemonic = enum { fisttp, fld, // MMX movd, movq, + paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw, pand, pandn, por, pxor, + psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw, // SSE addps, addss, andps, @@ -341,12 +343,14 @@ pub const Mnemonic = enum { vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, vorpd, vorps, + vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw, vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, vpinsrb, vpinsrd, vpinsrq, vpinsrw, vpor, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, + vpsubb, vpsubd, vpsubq, vpsubsb, vpsubsw, vpsubusb, vpsubusw, vpsubw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, vpxor, @@ -746,7 +750,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op } const mnemonic_to_encodings_map = init: { - @setEvalBranchQuota(25_000); + @setEvalBranchQuota(30_000); const encodings = @import("encodings.zig"); var entries = encodings.table; std.sort.sort(encodings.Entry, &entries, {}, struct { diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 4d1f59e454..58eab29958 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -288,6 +288,7 @@ pub const Inst = struct { /// Add with carry adc, /// Add + /// Add packed integers /// Add packed single-precision floating-point values /// Add scalar single-precision floating-point values /// Add packed double-precision floating-point values @@ -420,6 +421,7 @@ pub const Inst = struct { /// Double precision shift right sh, /// Subtract + /// Subtract packed integers /// Subtract packed single-precision floating-point values /// Subtract scalar single-precision floating-point values /// Subtract packed double-precision floating-point values @@ -444,9 +446,18 @@ pub const Inst = struct { /// Bitwise logical xor of packed double-precision floating-point values xor, + /// Add packed signed integers with signed saturation + adds, + /// Add packed unsigned integers with unsigned saturation + addus, /// Bitwise logical and not of packed single-precision floating-point values /// Bitwise logical and not of packed double-precision floating-point values andn, + /// Subtract packed signed integers with signed saturation + subs, + /// Subtract packed unsigned integers with unsigned saturation + subus, + /// Convert packed doubleword integers to packed single-precision floating-point values /// Convert packed doubleword integers to packed double-precision floating-point values cvtpi2, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 3e57be61ea..820fd715ba 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -992,6 +992,17 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, + .{ .paddb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .none, .sse2 }, + .{ .paddw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .none, .sse2 }, + .{ .paddd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .none, .sse2 }, + .{ .paddq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd4 }, 0, .none, .sse2 }, + + .{ .paddsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xec }, 0, .none, .sse2 }, + .{ .paddsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xed }, 0, .none, .sse2 }, + + .{ .paddusb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdc }, 0, .none, .sse2 }, + .{ .paddusw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdd }, 0, .none, .sse2 }, + .{ .pand, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .none, .sse2 }, .{ .pandn, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .none, .sse2 }, @@ -1013,6 +1024,18 @@ pub const table = [_]Entry{ .{ .psrlq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .none, .sse2 }, .{ .psrlq, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .none, .sse2 }, + .{ .psubb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .none, .sse2 }, + .{ .psubw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .none, .sse2 }, + .{ .psubd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfa }, 0, .none, .sse2 }, + + .{ .psubsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe8 }, 0, .none, .sse2 }, + .{ .psubsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe9 }, 0, .none, .sse2 }, + + .{ .psubq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfb }, 0, .none, .sse2 }, + + .{ .psubusb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .none, .sse2 }, + .{ .psubusw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .none, .sse2 }, + .{ .punpckhbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .none, .sse2 }, .{ .punpckhwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .none, .sse2 }, .{ .punpckhdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .none, .sse2 }, @@ -1261,6 +1284,17 @@ pub const table = [_]Entry{ .{ .vorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .vex_128_wig, .avx }, .{ .vorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x56 }, 0, .vex_256_wig, .avx }, + .{ .vpaddb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_128_wig, .avx }, + .{ .vpaddw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_128_wig, .avx }, + .{ .vpaddd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_128_wig, .avx }, + .{ .vpaddq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd4 }, 0, .vex_128_wig, .avx }, + + .{ .vpaddsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xec }, 0, .vex_128_wig, .avx }, + .{ .vpaddsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xed }, 0, .vex_128_wig, .avx }, + + .{ .vpaddusb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdc }, 0, .vex_128_wig, .avx }, + .{ .vpaddusw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdd }, 0, .vex_128_wig, .avx }, + .{ .vpand, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_128_wig, .avx }, .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx }, @@ -1287,6 +1321,18 @@ pub const table = [_]Entry{ .{ .vpsrlq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_128_wig, .avx }, .{ .vpsrlq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_128_wig, .avx }, + .{ .vpsubb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_128_wig, .avx }, + .{ .vpsubw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_128_wig, .avx }, + .{ .vpsubd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfa }, 0, .vex_128_wig, .avx }, + + .{ .vpsubsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe8 }, 0, .vex_128_wig, .avx }, + .{ .vpsubsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe9 }, 0, .vex_128_wig, .avx }, + + .{ .vpsubq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfb }, 0, .vex_128_wig, .avx }, + + .{ .vpsubusb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .vex_128_wig, .avx }, + .{ .vpsubusw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .vex_128_wig, .avx }, + .{ .vpunpckhbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_128_wig, .avx }, .{ .vpunpckhwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_128_wig, .avx }, .{ .vpunpckhdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_128_wig, .avx }, @@ -1376,6 +1422,17 @@ pub const table = [_]Entry{ .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vpaddb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd4 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpaddsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xec }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xed }, 0, .vex_256_wig, .avx2 }, + + .{ .vpaddusb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdc }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddusw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdd }, 0, .vex_256_wig, .avx2 }, + .{ .vpand, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_256_wig, .avx2 }, .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, @@ -1389,6 +1446,18 @@ pub const table = [_]Entry{ .{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 }, .{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 }, + .{ .vpsubb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfa }, 0, .vex_256_wig, .avx2 }, + + .{ .vpsubsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe8 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe9 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpsubq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfb }, 0, .vex_256_wig, .avx2 }, + + .{ .vpsubusb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubusw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpckhbw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_256_wig, .avx2 }, .{ .vpunpckhwd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_256_wig, .avx2 }, .{ .vpunpckhdq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_256_wig, .avx2 },