mirror of
https://github.com/ziglang/zig.git
synced 2026-01-20 22:35:24 +00:00
crypto.sha2: Use intrinsics for SHA-256 on x86-64 and AArch64
There's probably plenty of room to optimize these further in the future, but for the moment this gives ~3x improvement on Intel x86-64 processors, ~5x on AMD, and ~10x on M1 Macs. These extensions are very new - Most processors prior to 2020 do not support them. AVX-512 is a slightly older alternative that we could use on Intel for a much bigger performance bump, but it's been fused off on Intel's latest hybrid architectures and it relies on computing independent SHA hashes in parallel. In contrast, these SHA intrinsics provide the usual single-threaded, single-stream interface, and should continue working on new processors. AArch64 also has SHA-512 intrinsics that we could take advantage of in the future
This commit is contained in:
parent
c616141241
commit
10edb6d352
@ -1,4 +1,5 @@
|
||||
const std = @import("../std.zig");
|
||||
const builtin = @import("builtin");
|
||||
const mem = std.mem;
|
||||
const math = std.math;
|
||||
const htest = @import("test.zig");
|
||||
@ -16,10 +17,9 @@ const RoundParam256 = struct {
|
||||
g: usize,
|
||||
h: usize,
|
||||
i: usize,
|
||||
k: u32,
|
||||
};
|
||||
|
||||
fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize, h: usize, i: usize, k: u32) RoundParam256 {
|
||||
fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize, h: usize, i: usize) RoundParam256 {
|
||||
return RoundParam256{
|
||||
.a = a,
|
||||
.b = b,
|
||||
@ -30,7 +30,6 @@ fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g:
|
||||
.g = g,
|
||||
.h = h,
|
||||
.i = i,
|
||||
.k = k,
|
||||
};
|
||||
}
|
||||
|
||||
@ -70,6 +69,8 @@ const Sha256Params = Sha2Params32{
|
||||
.digest_bits = 256,
|
||||
};
|
||||
|
||||
const v4u32 = @Vector(4, u32);
|
||||
|
||||
/// SHA-224
|
||||
pub const Sha224 = Sha2x32(Sha224Params);
|
||||
|
||||
@ -83,7 +84,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
|
||||
pub const digest_length = params.digest_bits / 8;
|
||||
pub const Options = struct {};
|
||||
|
||||
s: [8]u32,
|
||||
s: [8]u32 align(16),
|
||||
// Streaming Cache
|
||||
buf: [64]u8 = undefined,
|
||||
buf_len: u8 = 0,
|
||||
@ -168,8 +169,19 @@ fn Sha2x32(comptime params: Sha2Params32) type {
|
||||
}
|
||||
}
|
||||
|
||||
const W = [64]u32{
|
||||
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
|
||||
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
|
||||
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
|
||||
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
|
||||
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
|
||||
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
|
||||
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
|
||||
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
|
||||
};
|
||||
|
||||
fn round(d: *Self, b: *const [64]u8) void {
|
||||
var s: [64]u32 = undefined;
|
||||
var s: [64]u32 align(16) = undefined;
|
||||
|
||||
var i: usize = 0;
|
||||
while (i < 16) : (i += 1) {
|
||||
@ -179,6 +191,88 @@ fn Sha2x32(comptime params: Sha2Params32) type {
|
||||
s[i] |= @as(u32, b[i * 4 + 2]) << 8;
|
||||
s[i] |= @as(u32, b[i * 4 + 3]) << 0;
|
||||
}
|
||||
|
||||
if (builtin.cpu.arch == .aarch64 and builtin.cpu.features.isEnabled(@enumToInt(std.Target.aarch64.Feature.sha2))) {
|
||||
var x: v4u32 = d.s[0..4].*;
|
||||
var y: v4u32 = d.s[4..8].*;
|
||||
const s_v = @ptrCast(*[16]v4u32, &s);
|
||||
|
||||
comptime var k: u8 = 0;
|
||||
inline while (k < 16) : (k += 1) {
|
||||
if (k > 3) {
|
||||
s_v[k] = asm (
|
||||
\\sha256su0.4s %[w0_3], %[w4_7]
|
||||
\\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
|
||||
: [w0_3] "=w" (-> v4u32),
|
||||
: [_] "0" (s_v[k - 4]),
|
||||
[w4_7] "w" (s_v[k - 3]),
|
||||
[w8_11] "w" (s_v[k - 2]),
|
||||
[w12_15] "w" (s_v[k - 1]),
|
||||
);
|
||||
}
|
||||
|
||||
const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
|
||||
asm volatile (
|
||||
\\mov.4s v0, %[x]
|
||||
\\sha256h.4s %[x], %[y], %[w]
|
||||
\\sha256h2.4s %[y], v0, %[w]
|
||||
: [x] "=w" (x),
|
||||
[y] "=w" (y),
|
||||
: [_] "0" (x),
|
||||
[_] "1" (y),
|
||||
[w] "w" (w),
|
||||
: "v0"
|
||||
);
|
||||
}
|
||||
|
||||
d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
|
||||
d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
|
||||
return;
|
||||
} else if (builtin.cpu.arch == .x86_64 and builtin.cpu.features.isEnabled(@enumToInt(std.Target.x86.Feature.sha))) {
|
||||
var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
|
||||
var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
|
||||
const s_v = @ptrCast(*[16]v4u32, &s);
|
||||
|
||||
comptime var k: u8 = 0;
|
||||
inline while (k < 16) : (k += 1) {
|
||||
if (k < 12) {
|
||||
const r = asm ("sha256msg1 %[w4_7], %[w0_3]"
|
||||
: [w0_3] "=x" (-> v4u32),
|
||||
: [_] "0" (s_v[k]),
|
||||
[w4_7] "x" (s_v[k + 1]),
|
||||
);
|
||||
const t = @shuffle(u32, s_v[k + 2], s_v[k + 3], [_]i32{ 1, 2, 3, -1 });
|
||||
s_v[k + 4] = asm ("sha256msg2 %[w12_15], %[t]"
|
||||
: [t] "=x" (-> v4u32),
|
||||
: [_] "0" (r +% t),
|
||||
[w12_15] "x" (s_v[k + 3]),
|
||||
);
|
||||
}
|
||||
|
||||
const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
|
||||
asm volatile (
|
||||
\\sha256rnds2 %[x], %[y]
|
||||
\\pshufd $0xe, %%xmm0, %%xmm0
|
||||
\\sha256rnds2 %[y], %[x]
|
||||
: [y] "=x" (y),
|
||||
[x] "=x" (x),
|
||||
: [_] "0" (y),
|
||||
[_] "1" (x),
|
||||
[_] "{xmm0}" (w),
|
||||
);
|
||||
}
|
||||
|
||||
d.s[0] +%= x[3];
|
||||
d.s[1] +%= x[2];
|
||||
d.s[4] +%= x[1];
|
||||
d.s[5] +%= x[0];
|
||||
d.s[2] +%= y[3];
|
||||
d.s[3] +%= y[2];
|
||||
d.s[6] +%= y[1];
|
||||
d.s[7] +%= y[0];
|
||||
return;
|
||||
}
|
||||
|
||||
while (i < 64) : (i += 1) {
|
||||
s[i] = s[i - 16] +% s[i - 7] +% (math.rotr(u32, s[i - 15], @as(u32, 7)) ^ math.rotr(u32, s[i - 15], @as(u32, 18)) ^ (s[i - 15] >> 3)) +% (math.rotr(u32, s[i - 2], @as(u32, 17)) ^ math.rotr(u32, s[i - 2], @as(u32, 19)) ^ (s[i - 2] >> 10));
|
||||
}
|
||||
@ -195,73 +289,73 @@ fn Sha2x32(comptime params: Sha2Params32) type {
|
||||
};
|
||||
|
||||
const round0 = comptime [_]RoundParam256{
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 0, 0x428A2F98),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 1, 0x71374491),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 2, 0xB5C0FBCF),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 3, 0xE9B5DBA5),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 4, 0x3956C25B),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 5, 0x59F111F1),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 6, 0x923F82A4),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 7, 0xAB1C5ED5),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 8, 0xD807AA98),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 9, 0x12835B01),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 10, 0x243185BE),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 11, 0x550C7DC3),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 12, 0x72BE5D74),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 13, 0x80DEB1FE),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 14, 0x9BDC06A7),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 15, 0xC19BF174),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 16, 0xE49B69C1),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 17, 0xEFBE4786),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 18, 0x0FC19DC6),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 19, 0x240CA1CC),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 20, 0x2DE92C6F),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 21, 0x4A7484AA),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 22, 0x5CB0A9DC),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 23, 0x76F988DA),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 24, 0x983E5152),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 25, 0xA831C66D),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 26, 0xB00327C8),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 27, 0xBF597FC7),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 28, 0xC6E00BF3),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 29, 0xD5A79147),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 30, 0x06CA6351),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 31, 0x14292967),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 32, 0x27B70A85),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 33, 0x2E1B2138),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 34, 0x4D2C6DFC),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 35, 0x53380D13),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 36, 0x650A7354),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 37, 0x766A0ABB),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 38, 0x81C2C92E),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 39, 0x92722C85),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 40, 0xA2BFE8A1),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 41, 0xA81A664B),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 42, 0xC24B8B70),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 43, 0xC76C51A3),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 44, 0xD192E819),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 45, 0xD6990624),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 46, 0xF40E3585),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 47, 0x106AA070),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 48, 0x19A4C116),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 49, 0x1E376C08),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 50, 0x2748774C),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 51, 0x34B0BCB5),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 52, 0x391C0CB3),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 53, 0x4ED8AA4A),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 54, 0x5B9CCA4F),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 55, 0x682E6FF3),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 56, 0x748F82EE),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 57, 0x78A5636F),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 58, 0x84C87814),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 59, 0x8CC70208),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 60, 0x90BEFFFA),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 61, 0xA4506CEB),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 62, 0xBEF9A3F7),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 63, 0xC67178F2),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 0),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 1),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 2),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 3),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 4),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 5),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 6),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 7),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 8),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 9),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 10),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 11),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 12),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 13),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 14),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 15),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 16),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 17),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 18),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 19),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 20),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 21),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 22),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 23),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 24),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 25),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 26),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 27),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 28),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 29),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 30),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 31),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 32),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 33),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 34),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 35),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 36),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 37),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 38),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 39),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 40),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 41),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 42),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 43),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 44),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 45),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 46),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 47),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 48),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 49),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 50),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 51),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 52),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 53),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 54),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 55),
|
||||
roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 56),
|
||||
roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 57),
|
||||
roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 58),
|
||||
roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 59),
|
||||
roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 60),
|
||||
roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 61),
|
||||
roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 62),
|
||||
roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 63),
|
||||
};
|
||||
inline for (round0) |r| {
|
||||
v[r.h] = v[r.h] +% (math.rotr(u32, v[r.e], @as(u32, 6)) ^ math.rotr(u32, v[r.e], @as(u32, 11)) ^ math.rotr(u32, v[r.e], @as(u32, 25))) +% (v[r.g] ^ (v[r.e] & (v[r.f] ^ v[r.g]))) +% r.k +% s[r.i];
|
||||
v[r.h] = v[r.h] +% (math.rotr(u32, v[r.e], @as(u32, 6)) ^ math.rotr(u32, v[r.e], @as(u32, 11)) ^ math.rotr(u32, v[r.e], @as(u32, 25))) +% (v[r.g] ^ (v[r.e] & (v[r.f] ^ v[r.g]))) +% W[r.i] +% s[r.i];
|
||||
|
||||
v[r.d] = v[r.d] +% v[r.h];
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user