From 4c1f71e866088a1a2e943331256115ed7e3daf98 Mon Sep 17 00:00:00 2001
From: Cody Tapscott
Date: Mon, 24 Oct 2022 09:47:31 -0700
Subject: [PATCH] std.crypto: Optimize SHA-256 intrinsics for AMD x86-64

This gets us most of the way back to the performance I had when I was
using the LLVM intrinsics:
 - Intel(R) Core(TM) i7-1068NG7 CPU @ 2.30GHz: 190.67 MB/s (w/o intrinsics) -> 1285.08 MB/s
 - AMD EPYC 7763 (VM) @ 2.45 GHz: 240.09 MB/s (w/o intrinsics) -> 1360.78 MB/s
 - Apple M1: 216.96 MB/s (w/o intrinsics) -> 2133.69 MB/s

Minor changes to this source can swing performance from 400 MB/s to
1400 MB/s or... 20 MB/s, depending on how the code interacts with the
optimizer.

I have a sneaking suspicion that despite LLVM inheriting GCC's
extremely strict inline assembly semantics, its passes are rather
skittish around inline assembly (and almost certainly, its instruction
cost models can assume nothing about it).
---
 lib/std/crypto/sha2.zig | 46 ++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index 5b2c8a89e1..5380f32f0b 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -182,14 +182,8 @@ fn Sha2x32(comptime params: Sha2Params32) type {
 
         fn round(d: *Self, b: *const [64]u8) void {
             var s: [64]u32 align(16) = undefined;
-
-            var i: usize = 0;
-            while (i < 16) : (i += 1) {
-                s[i] = 0;
-                s[i] |= @as(u32, b[i * 4 + 0]) << 24;
-                s[i] |= @as(u32, b[i * 4 + 1]) << 16;
-                s[i] |= @as(u32, b[i * 4 + 2]) << 8;
-                s[i] |= @as(u32, b[i * 4 + 3]) << 0;
+            for (@ptrCast(*align(1) const [16]u32, b)) |*elem, i| {
+                s[i] = mem.readIntBig(u32, mem.asBytes(elem));
             }
 
             switch (builtin.cpu.arch) {
@@ -238,30 +232,35 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                     comptime var k: u8 = 0;
                     inline while (k < 16) : (k += 1) {
                         if (k < 12) {
-                            const r = asm ("sha256msg1 %[w4_7], %[w0_3]"
-                                : [w0_3] "=x" (-> v4u32),
-                                : [_] "0" (s_v[k]),
+                            var tmp = s_v[k];
+                            s_v[k + 4] = asm (
+                                \\ sha256msg1 %[w4_7], %[tmp]
+                                \\ vpalignr $0x4, %[w8_11], %[w12_15], %[result]
+                                \\ paddd %[tmp], %[result]
+                                \\ sha256msg2 %[w12_15], %[result]
+                                : [tmp] "=&x" (tmp),
+                                  [result] "=&x" (-> v4u32),
+                                : [_] "0" (tmp),
                                   [w4_7] "x" (s_v[k + 1]),
-                            );
-                            const t = @shuffle(u32, s_v[k + 2], s_v[k + 3], [_]i32{ 1, 2, 3, -1 });
-                            s_v[k + 4] = asm ("sha256msg2 %[w12_15], %[t]"
-                                : [t] "=x" (-> v4u32),
-                                : [_] "0" (r +% t),
+                                  [w8_11] "x" (s_v[k + 2]),
                                   [w12_15] "x" (s_v[k + 3]),
                             );
                         }
 
                         const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
-                        asm volatile (
-                            \\sha256rnds2 %[x], %[y]
-                            \\pshufd $0xe, %%xmm0, %%xmm0
-                            \\sha256rnds2 %[y], %[x]
-                            : [y] "=x" (y),
-                              [x] "=x" (x),
+                        y = asm ("sha256rnds2 %[x], %[y]"
+                            : [y] "=x" (-> v4u32),
                             : [_] "0" (y),
-                              [_] "1" (x),
+                              [x] "x" (x),
                               [_] "{xmm0}" (w),
                         );
+
+                        x = asm ("sha256rnds2 %[y], %[x]"
+                            : [x] "=x" (-> v4u32),
+                            : [_] "0" (x),
+                              [y] "x" (y),
+                              [_] "{xmm0}" (@bitCast(v4u32, @bitCast(u128, w) >> 64)),
+                        );
                     }
 
                     d.s[0] +%= x[3];
@@ -277,6 +276,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
                 else => {},
             }
 
+            var i: usize = 16;
             while (i < 64) : (i += 1) {
                 s[i] = s[i - 16] +% s[i - 7] +% (math.rotr(u32, s[i - 15], @as(u32, 7)) ^ math.rotr(u32, s[i - 15], @as(u32, 18)) ^ (s[i - 15] >> 3)) +% (math.rotr(u32, s[i - 2], @as(u32, 17)) ^ math.rotr(u32, s[i - 2], @as(u32, 19)) ^ (s[i - 2] >> 10));
             }
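
Note (not part of the patch): the throughput figures quoted above can be sanity-checked
with a small loop over Sha256.update. Below is a minimal, self-contained sketch against
the public std.crypto.hash.sha2.Sha256 API, written with Zig 0.10-era builtins such as
@intToFloat to match the code in this diff; the file name, buffer size, and iteration
count are arbitrary choices, not anything used by this commit.

const std = @import("std");

pub fn main() !void {
    const Sha256 = std.crypto.hash.sha2.Sha256;

    // 64 KiB of random input, hashed repeatedly so the measurement is
    // dominated by the compression rounds rather than setup cost.
    var block: [64 * 1024]u8 = undefined;
    std.crypto.random.bytes(&block);

    const iterations: usize = 16 * 1024; // ~1 GiB of input in total
    var h = Sha256.init(.{});

    var timer = try std.time.Timer.start();
    var i: usize = 0;
    while (i < iterations) : (i += 1) {
        h.update(&block);
    }
    const elapsed_ns = timer.read();

    var digest: [Sha256.digest_length]u8 = undefined;
    h.final(&digest);

    // Report throughput in MB/s, plus the digest so the work cannot be
    // optimized away.
    const total_bytes = @intToFloat(f64, block.len * iterations);
    const seconds = @intToFloat(f64, elapsed_ns) / std.time.ns_per_s;
    const throughput = total_bytes / (1024.0 * 1024.0) / seconds;
    std.debug.print("SHA-256: {d:.2} MB/s (digest {})\n", .{
        throughput,
        std.fmt.fmtSliceHexLower(&digest),
    });
}

Built in a release mode (e.g. zig run sha256_bench.zig -OReleaseFast), this exercises the
same round() path the diff touches; numbers will vary with CPU and buffer size.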