From a0b35249a29c6869650ba7e3515999fcced8dcf2 Mon Sep 17 00:00:00 2001 From: Frank Denis <124872+jedisct1@users.noreply.github.com> Date: Tue, 18 Jul 2023 00:40:31 +0200 Subject: [PATCH] Replace hand-written endian-specific loads with std.mem.readInt*() (#16431) And when we have the choice, favor little-endian because it's 2023. Gives a slight performance improvement: md5: 552 -> 555 MiB/s sha1: 768 -> 786 MiB/s sha512: 211 -> 217 MiB/s --- lib/std/crypto/aes/soft.zig | 34 +++++++++++++++++----------------- lib/std/crypto/md5.zig | 7 +------ lib/std/crypto/sha1.zig | 2 +- lib/std/crypto/sha2.zig | 10 +--------- 4 files changed, 20 insertions(+), 33 deletions(-) diff --git a/lib/std/crypto/aes/soft.zig b/lib/std/crypto/aes/soft.zig index 0b15555ad0..68a661a93a 100644 --- a/lib/std/crypto/aes/soft.zig +++ b/lib/std/crypto/aes/soft.zig @@ -122,14 +122,14 @@ pub const Block = struct { // Last round uses s-box directly and XORs to produce output. var x: [4]u8 = undefined; - x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s3 >> 24)), @as(u8, @truncate(s2 >> 16)), @as(u8, @truncate(s1 >> 8)), @as(u8, @truncate(s0))); - var t0 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); - x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s0 >> 24)), @as(u8, @truncate(s3 >> 16)), @as(u8, @truncate(s2 >> 8)), @as(u8, @truncate(s1))); - var t1 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); - x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s1 >> 24)), @as(u8, @truncate(s0 >> 16)), @as(u8, @truncate(s3 >> 8)), @as(u8, @truncate(s2))); - var t2 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); - x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s2 >> 24)), @as(u8, @truncate(s1 >> 16)), @as(u8, @truncate(s0 >> 8)), @as(u8, @truncate(s3))); - var t3 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); + x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s0)), @as(u8, @truncate(s1 >> 8)), @as(u8, @truncate(s2 >> 16)), @as(u8, @truncate(s3 >> 24))); + var t0 = mem.readIntLittle(u32, &x); + x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s1)), @as(u8, @truncate(s2 >> 8)), @as(u8, @truncate(s3 >> 16)), @as(u8, @truncate(s0 >> 24))); + var t1 = mem.readIntLittle(u32, &x); + x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s2)), @as(u8, @truncate(s3 >> 8)), @as(u8, @truncate(s0 >> 16)), @as(u8, @truncate(s1 >> 24))); + var t2 = mem.readIntLittle(u32, &x); + x = sbox_lookup(&sbox_encrypt, @as(u8, @truncate(s3)), @as(u8, @truncate(s0 >> 8)), @as(u8, @truncate(s1 >> 16)), @as(u8, @truncate(s2 >> 24))); + var t3 = mem.readIntLittle(u32, &x); t0 ^= round_key.repr[0]; t1 ^= round_key.repr[1]; @@ -218,14 +218,14 @@ pub const Block = struct { // Last round uses s-box directly and XORs to produce output. var x: [4]u8 = undefined; - x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s1 >> 24)), @as(u8, @truncate(s2 >> 16)), @as(u8, @truncate(s3 >> 8)), @as(u8, @truncate(s0))); - var t0 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); - x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s2 >> 24)), @as(u8, @truncate(s3 >> 16)), @as(u8, @truncate(s0 >> 8)), @as(u8, @truncate(s1))); - var t1 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); - x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s3 >> 24)), @as(u8, @truncate(s0 >> 16)), @as(u8, @truncate(s1 >> 8)), @as(u8, @truncate(s2))); - var t2 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); - x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s0 >> 24)), @as(u8, @truncate(s1 >> 16)), @as(u8, @truncate(s2 >> 8)), @as(u8, @truncate(s3))); - var t3 = @as(u32, x[0]) << 24 | @as(u32, x[1]) << 16 | @as(u32, x[2]) << 8 | @as(u32, x[3]); + x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s0)), @as(u8, @truncate(s3 >> 8)), @as(u8, @truncate(s2 >> 16)), @as(u8, @truncate(s1 >> 24))); + var t0 = mem.readIntLittle(u32, &x); + x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s1)), @as(u8, @truncate(s0 >> 8)), @as(u8, @truncate(s3 >> 16)), @as(u8, @truncate(s2 >> 24))); + var t1 = mem.readIntLittle(u32, &x); + x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s2)), @as(u8, @truncate(s1 >> 8)), @as(u8, @truncate(s0 >> 16)), @as(u8, @truncate(s3 >> 24))); + var t2 = mem.readIntLittle(u32, &x); + x = sbox_lookup(&sbox_decrypt, @as(u8, @truncate(s3)), @as(u8, @truncate(s2 >> 8)), @as(u8, @truncate(s1 >> 16)), @as(u8, @truncate(s0 >> 24))); + var t3 = mem.readIntLittle(u32, &x); t0 ^= round_key.repr[0]; t1 ^= round_key.repr[1]; @@ -349,7 +349,7 @@ fn KeySchedule(comptime Aes: type) type { // Apply sbox_encrypt to each byte in w. fn func(w: u32) u32 { const x = sbox_lookup(&sbox_key_schedule, @as(u8, @truncate(w)), @as(u8, @truncate(w >> 8)), @as(u8, @truncate(w >> 16)), @as(u8, @truncate(w >> 24))); - return @as(u32, x[3]) << 24 | @as(u32, x[2]) << 16 | @as(u32, x[1]) << 8 | @as(u32, x[0]); + return mem.readIntLittle(u32, &x); } }.func; diff --git a/lib/std/crypto/md5.zig b/lib/std/crypto/md5.zig index b480cbcd8e..839c7b5ed8 100644 --- a/lib/std/crypto/md5.zig +++ b/lib/std/crypto/md5.zig @@ -121,12 +121,7 @@ pub const Md5 = struct { var i: usize = 0; while (i < 16) : (i += 1) { - // NOTE: Performing or's separately improves perf by ~10% - s[i] = 0; - s[i] |= @as(u32, b[i * 4 + 0]); - s[i] |= @as(u32, b[i * 4 + 1]) << 8; - s[i] |= @as(u32, b[i * 4 + 2]) << 16; - s[i] |= @as(u32, b[i * 4 + 3]) << 24; + s[i] = mem.readIntLittle(u32, b[i * 4 ..][0..4]); } var v: [4]u32 = [_]u32{ diff --git a/lib/std/crypto/sha1.zig b/lib/std/crypto/sha1.zig index 429b2e64f2..ee892237a7 100644 --- a/lib/std/crypto/sha1.zig +++ b/lib/std/crypto/sha1.zig @@ -151,7 +151,7 @@ pub const Sha1 = struct { roundParam(0, 1, 2, 3, 4, 15), }; inline for (round0a) |r| { - s[r.i] = (@as(u32, b[r.i * 4 + 0]) << 24) | (@as(u32, b[r.i * 4 + 1]) << 16) | (@as(u32, b[r.i * 4 + 2]) << 8) | (@as(u32, b[r.i * 4 + 3]) << 0); + s[r.i] = mem.readIntBig(u32, b[r.i * 4 ..][0..4]); v[r.e] = v[r.e] +% math.rotl(u32, v[r.a], @as(u32, 5)) +% 0x5A827999 +% s[r.i & 0xf] +% ((v[r.b] & v[r.c]) | (~v[r.b] & v[r.d])); v[r.b] = math.rotl(u32, v[r.b], @as(u32, 30)); diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig index 122ec6cd80..f87ea90d92 100644 --- a/lib/std/crypto/sha2.zig +++ b/lib/std/crypto/sha2.zig @@ -678,15 +678,7 @@ fn Sha2x64(comptime params: Sha2Params64) type { var i: usize = 0; while (i < 16) : (i += 1) { - s[i] = 0; - s[i] |= @as(u64, b[i * 8 + 0]) << 56; - s[i] |= @as(u64, b[i * 8 + 1]) << 48; - s[i] |= @as(u64, b[i * 8 + 2]) << 40; - s[i] |= @as(u64, b[i * 8 + 3]) << 32; - s[i] |= @as(u64, b[i * 8 + 4]) << 24; - s[i] |= @as(u64, b[i * 8 + 5]) << 16; - s[i] |= @as(u64, b[i * 8 + 6]) << 8; - s[i] |= @as(u64, b[i * 8 + 7]) << 0; + s[i] = mem.readIntBig(u64, b[i * 8 ..][0..8]); } while (i < 80) : (i += 1) { s[i] = s[i - 16] +% s[i - 7] +%