From 58873ed3f90325cee5c442169d609d02c71fd05a Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Wed, 30 Sep 2020 01:12:37 +0200
Subject: [PATCH 1/4] std/crypto: add GHASH implementation

GHASH is required to implement AES-GCM.
Optimized implementations for CPUs with instructions for carry-less
multiplication will be added next.
---
 lib/std/crypto.zig           |   1 +
 lib/std/crypto/benchmark.zig |   1 +
 lib/std/crypto/ghash.zig     | 195 +++++++++++++++++++++++++++++++++++
 3 files changed, 197 insertions(+)
 create mode 100644 lib/std/crypto/ghash.zig

diff --git a/lib/std/crypto.zig b/lib/std/crypto.zig
index fa69d51d4d..5070305633 100644
--- a/lib/std/crypto.zig
+++ b/lib/std/crypto.zig
@@ -35,6 +35,7 @@ pub const aead = struct {
 /// MAC functions requiring single-use secret keys.
 pub const onetimeauth = struct {
     pub const Poly1305 = @import("crypto/poly1305.zig").Poly1305;
+    pub const Ghash = @import("crypto/ghash.zig").Ghash;
 };
 
 /// A password hashing function derives a uniform key from low-entropy input material such as passwords.
diff --git a/lib/std/crypto/benchmark.zig b/lib/std/crypto/benchmark.zig
index 3c7e3445a2..d0ff29e896 100644
--- a/lib/std/crypto/benchmark.zig
+++ b/lib/std/crypto/benchmark.zig
@@ -57,6 +57,7 @@ pub fn benchmarkHash(comptime Hash: anytype, comptime bytes: comptime_int) !u64
 }
 
 const macs = [_]Crypto{
+    Crypto{ .ty = crypto.onetimeauth.Ghash, .name = "ghash" },
     Crypto{ .ty = crypto.onetimeauth.Poly1305, .name = "poly1305" },
     Crypto{ .ty = crypto.auth.hmac.HmacMd5, .name = "hmac-md5" },
     Crypto{ .ty = crypto.auth.hmac.HmacSha1, .name = "hmac-sha1" },
diff --git a/lib/std/crypto/ghash.zig b/lib/std/crypto/ghash.zig
new file mode 100644
index 0000000000..f05b5d7139
--- /dev/null
+++ b/lib/std/crypto/ghash.zig
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2015-2020 Zig Contributors
+// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
+// The MIT license requires this copyright notice to be included in all copies
+// and substantial portions of the software.
+//
+// Adapted from BearSSL's ctmul64 implementation originally written by Thomas Pornin
+
+const std = @import("../std.zig");
+const assert = std.debug.assert;
+const math = std.math;
+const mem = std.mem;
+
+/// GHASH is a universal hash function that features multiplication
+/// by a fixed parameter within a Galois field.
+///
+/// It is not a general purpose hash function - the key must be secret, unpredictable and never reused.
+///
+/// GHASH is typically used to compute the authentication tag in the AES-GCM construction.
+pub const Ghash = struct {
+    pub const block_size: usize = 16;
+    pub const mac_length = 16;
+    pub const minimum_key_length = 16;
+
+    y0: u64 = 0,
+    y1: u64 = 0,
+    h0: u64,
+    h1: u64,
+    h2: u64,
+    h0r: u64,
+    h1r: u64,
+    h2r: u64,
+
+    leftover: usize = 0,
+    buf: [block_size]u8 align(16) = undefined,
+
+    pub fn init(key: []const u8) Ghash {
+        assert(key.len >= minimum_key_length);
+        const h1 = mem.readIntBig(u64, key[0..8]);
+        const h0 = mem.readIntBig(u64, key[8..16]);
+        const h1r = @bitReverse(u64, h1);
+        const h0r = @bitReverse(u64, h0);
+        const h2 = h0 ^ h1;
+        const h2r = h0r ^ h1r;
+
+        return Ghash{
+            .h0 = h0,
+            .h1 = h1,
+            .h2 = h2,
+            .h0r = h0r,
+            .h1r = h1r,
+            .h2r = h2r,
+        };
+    }
+
+    fn bmul(x: u64, y: u64) u64 {
+        const x0 = x & 0x1111111111111111;
+        const x1 = x & 0x2222222222222222;
+        const x2 = x & 0x4444444444444444;
+        const x3 = x & 0x8888888888888888;
+        const y0 = y & 0x1111111111111111;
+        const y1 = y & 0x2222222222222222;
+        const y2 = y & 0x4444444444444444;
+        const y3 = y & 0x8888888888888888;
+        var z0 = (x0 *% y0) ^ (x1 *% y3) ^ (x2 *% y2) ^ (x3 *% y1);
+        var z1 = (x0 *% y1) ^ (x1 *% y0) ^ (x2 *% y3) ^ (x3 *% y2);
+        var z2 = (x0 *% y2) ^ (x1 *% y1) ^ (x2 *% y0) ^ (x3 *% y3);
+        var z3 = (x0 *% y3) ^ (x1 *% y2) ^ (x2 *% y1) ^ (x3 *% y0);
+        z0 &= 0x1111111111111111;
+        z1 &= 0x2222222222222222;
+        z2 &= 0x4444444444444444;
+        z3 &= 0x8888888888888888;
+
+        return z0 | z1 | z2 | z3;
+    }
+
+    fn blocks(st: *Ghash, msg: []const u8) void {
+        assert(msg.len % 16 == 0); // GHASH blocks() expects full blocks
+        var y1 = st.y1;
+        var y0 = st.y0;
+
+        var i: usize = 0;
+        while (i + 16 <= msg.len) : (i += 16) {
+            y1 ^= mem.readIntBig(u64, msg[i..][0..8]);
+            y0 ^= mem.readIntBig(u64, msg[i..][8..16]);
+
+            const y1r = @bitReverse(u64, y1);
+            const y0r = @bitReverse(u64, y0);
+            const y2 = y0 ^ y1;
+            const y2r = y0r ^ y1r;
+
+            const z0 = bmul(y0, st.h0);
+            const z1 = bmul(y1, st.h1);
+            var z2 = bmul(y2, st.h2);
+            var z0h = bmul(y0r, st.h0r);
+            var z1h = bmul(y1r, st.h1r);
+            var z2h = bmul(y2r, st.h2r);
+            z2 ^= z0 ^ z1;
+            z2h ^= z0h ^ z1h;
+            z0h = @bitReverse(u64, z0h) >> 1;
+            z1h = @bitReverse(u64, z1h) >> 1;
+            z2h = @bitReverse(u64, z2h) >> 1;
+
+            var v3 = z1h;
+            var v2 = z1 ^ z2h;
+            var v1 = z0h ^ z2;
+            var v0 = z0;
+
+            v3 = (v3 << 1) | (v2 >> 63);
+            v2 = (v2 << 1) | (v1 >> 63);
+            v1 = (v1 << 1) | (v0 >> 63);
+            v0 = (v0 << 1);
+
+            v2 ^= v0 ^ (v0 >> 1) ^ (v0 >> 2) ^ (v0 >> 7);
+            v1 ^= (v0 << 63) ^ (v0 << 62) ^ (v0 << 57);
+            y1 = v3 ^ v1 ^ (v1 >> 1) ^ (v1 >> 2) ^ (v1 >> 7);
+            y0 = v2 ^ (v1 << 63) ^ (v1 << 62) ^ (v1 << 57);
+        }
+        st.y1 = y1;
+        st.y0 = y0;
+    }
+
+    pub fn update(st: *Ghash, m: []const u8) void {
+        var mb = m;
+
+        if (st.leftover > 0) {
+            const want = math.min(block_size - st.leftover, mb.len);
+            const mc = mb[0..want];
+            for (mc) |x, i| {
+                st.buf[st.leftover + i] = x;
+            }
+            mb = mb[want..];
+            st.leftover += want;
+            if (st.leftover < block_size) {
+                return;
+            }
+            st.blocks(&st.buf);
+            st.leftover = 0;
+        }
+        if (mb.len >= block_size) {
+            const want = mb.len & ~(block_size - 1);
+            st.blocks(mb[0..want]);
+            mb = mb[want..];
+        }
+        if (mb.len > 0) {
+            for (mb) |x, i| {
+                st.buf[st.leftover + i] = x;
+            }
+            st.leftover += mb.len;
+        }
+    }
+
+    pub fn final(st: *Ghash, out: []u8) void {
+        assert(out.len >= mac_length);
+        if (st.leftover > 0) {
+            var i = st.leftover;
+            while (i < block_size) : (i += 1) {
+                st.buf[i] = 0;
+            }
+            st.blocks(&st.buf);
+        }
+        mem.writeIntBig(u64, out[0..8], st.y1);
+        mem.writeIntBig(u64, out[8..16], st.y0);
+
+        mem.secureZero(u8, @ptrCast([*]u8, 
st)[0..@sizeOf(Ghash)]);
+    }
+
+    pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
+        std.debug.assert(out.len >= mac_length);
+        std.debug.assert(key.len >= minimum_key_length);
+
+        var st = Ghash.init(key);
+        st.update(msg);
+        st.final(out);
+    }
+};
+
+const htest = @import("test.zig");
+
+test "ghash" {
+    const key = [_]u8{0x42} ** 16;
+    const m = [_]u8{0x69} ** 256;
+
+    var st = Ghash.init(&key);
+    st.update(&m);
+    var out: [16]u8 = undefined;
+    st.final(&out);
+    htest.assertEqual("889295fa746e8b174bf4ec80a65dea41", &out);
+
+    st = Ghash.init(&key);
+    st.update(m[0..100]);
+    st.update(m[100..]);
+    st.final(&out);
+    htest.assertEqual("889295fa746e8b174bf4ec80a65dea41", &out);
+}

From f1ad94437baaae40109f388a7d44d698c10a56d3 Mon Sep 17 00:00:00 2001
From: Frank Denis
Date: Wed, 30 Sep 2020 18:36:31 +0200
Subject: [PATCH 2/4] ghash & poly1305: use pointers to fixed-size arrays for
 keys and output

---
 lib/std/crypto/ghash.zig    | 11 +++--------
 lib/std/crypto/poly1305.zig | 11 +++--------
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/lib/std/crypto/ghash.zig b/lib/std/crypto/ghash.zig
index f05b5d7139..a7a6be9722 100644
--- a/lib/std/crypto/ghash.zig
+++ b/lib/std/crypto/ghash.zig
@@ -34,8 +34,7 @@ pub const Ghash = struct {
     leftover: usize = 0,
     buf: [block_size]u8 align(16) = undefined,
 
-    pub fn init(key: []const u8) Ghash {
-        assert(key.len >= minimum_key_length);
+    pub fn init(key: *const [minimum_key_length]u8) Ghash {
         const h1 = mem.readIntBig(u64, key[0..8]);
         const h0 = mem.readIntBig(u64, key[8..16]);
         const h1r = @bitReverse(u64, h1);
@@ -150,8 +149,7 @@ pub const Ghash = struct {
         }
     }
 
-    pub fn final(st: *Ghash, out: []u8) void {
-        assert(out.len >= mac_length);
+    pub fn final(st: *Ghash, out: *[mac_length]u8) void {
         if (st.leftover > 0) {
             var i = st.leftover;
             while (i < block_size) : (i += 1) {
@@ -165,10 +163,7 @@ pub const Ghash = struct {
         mem.secureZero(u8, @ptrCast([*]u8, st)[0..@sizeOf(Ghash)]);
     }
 
-    pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
-        std.debug.assert(out.len >= mac_length);
-        std.debug.assert(key.len >= minimum_key_length);
-
+    pub fn create(out: *[mac_length]u8, msg: []const u8, key: *const [minimum_key_length]u8) void {
         var st = Ghash.init(key);
         st.update(msg);
         st.final(out);
diff --git a/lib/std/crypto/poly1305.zig b/lib/std/crypto/poly1305.zig
index a95b9d7cb3..31d1d6ba5a 100644
--- a/lib/std/crypto/poly1305.zig
+++ b/lib/std/crypto/poly1305.zig
@@ -22,8 +22,7 @@ pub const Poly1305 = struct {
     // partial block buffer
     buf: [block_size]u8 align(16) = undefined,
 
-    pub fn init(key: []const u8) Poly1305 {
-        std.debug.assert(key.len >= minimum_key_length);
+    pub fn init(key: *const [minimum_key_length]u8) Poly1305 {
         const t0 = mem.readIntLittle(u64, key[0..8]);
         const t1 = mem.readIntLittle(u64, key[8..16]);
         return Poly1305{
@@ -115,8 +114,7 @@ pub const Poly1305 = struct {
         }
     }
 
-    pub fn final(st: *Poly1305, out: []u8) void {
-        std.debug.assert(out.len >= mac_length);
+    pub fn final(st: *Poly1305, out: *[mac_length]u8) void {
         if (st.leftover > 0) {
             var i = st.leftover;
             st.buf[i] = 1;
@@ -187,10 +185,7 @@ pub const Poly1305 = struct {
         std.mem.secureZero(u8, @ptrCast([*]u8, st)[0..@sizeOf(Poly1305)]);
     }
 
-    pub fn create(out: []u8, msg: []const u8, key: []const u8) void {
-        std.debug.assert(out.len >= mac_length);
-        std.debug.assert(key.len >= minimum_key_length);
-
+    pub fn create(out: *[mac_length]u8, msg: []const u8, key: *const [minimum_key_length]u8) void {
         var st = Poly1305.init(key);
         st.update(msg);
         st.final(out);

From 
8161de7fa44393c08cf96c9f9aa5379f98925472 Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Wed, 30 Sep 2020 22:34:17 +0200 Subject: [PATCH 3/4] Implement ghash aggregated reduction Performance increases from ~400 MiB/s to 450 MiB/s at the expense of extra code. Thus, aggregation is disabled on ReleaseSmall. Since the multiplication cost is significant compared to the reduction, aggregating more than 2 blocks is probably not worth it. --- lib/std/crypto/ghash.zig | 130 +++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 12 deletions(-) diff --git a/lib/std/crypto/ghash.zig b/lib/std/crypto/ghash.zig index a7a6be9722..803547099f 100644 --- a/lib/std/crypto/ghash.zig +++ b/lib/std/crypto/ghash.zig @@ -31,6 +31,13 @@ pub const Ghash = struct { h1r: u64, h2r: u64, + hh0: u64 = undefined, + hh1: u64 = undefined, + hh2: u64 = undefined, + hh0r: u64 = undefined, + hh1r: u64 = undefined, + hh2r: u64 = undefined, + leftover: usize = 0, buf: [block_size]u8 align(16) = undefined, @@ -42,14 +49,49 @@ pub const Ghash = struct { const h2 = h0 ^ h1; const h2r = h0r ^ h1r; - return Ghash{ - .h0 = h0, - .h1 = h1, - .h2 = h2, - .h0r = h0r, - .h1r = h1r, - .h2r = h2r, - }; + if (std.builtin.mode == .ReleaseSmall) { + return Ghash{ + .h0 = h0, + .h1 = h1, + .h2 = h2, + .h0r = h0r, + .h1r = h1r, + .h2r = h2r, + }; + } else { + // Precompute H^2 + var hh = Ghash{ + .h0 = h0, + .h1 = h1, + .h2 = h2, + .h0r = h0r, + .h1r = h1r, + .h2r = h2r, + }; + hh.update(key); + const hh1 = hh.y1; + const hh0 = hh.y0; + const hh1r = @bitReverse(u64, hh1); + const hh0r = @bitReverse(u64, hh0); + const hh2 = hh0 ^ hh1; + const hh2r = hh0r ^ hh1r; + + return Ghash{ + .h0 = h0, + .h1 = h1, + .h2 = h2, + .h0r = h0r, + .h1r = h1r, + .h2r = h2r, + + .hh0 = hh0, + .hh1 = hh1, + .hh2 = hh2, + .hh0r = hh0r, + .hh1r = hh1r, + .hh2r = hh2r, + }; + } } fn bmul(x: u64, y: u64) u64 { @@ -79,6 +121,71 @@ pub const Ghash = struct { var y0 = st.y0; var i: usize = 0; + + // 2-blocks aggregated reduction + if (std.builtin.mode != .ReleaseSmall) { + while (i + 32 <= msg.len) : (i += 32) { + // B0 * H^2 unreduced + y1 ^= mem.readIntBig(u64, msg[i..][0..8]); + y0 ^= mem.readIntBig(u64, msg[i..][8..16]); + + const y1r = @bitReverse(u64, y1); + const y0r = @bitReverse(u64, y0); + const y2 = y0 ^ y1; + const y2r = y0r ^ y1r; + + var z0 = bmul(y0, st.hh0); + var z1 = bmul(y1, st.hh1); + var z2 = bmul(y2, st.hh2) ^ z0 ^ z1; + var z0h = bmul(y0r, st.hh0r); + var z1h = bmul(y1r, st.hh1r); + var z2h = bmul(y2r, st.hh2r) ^ z0h ^ z1h; + + // B1 * H unreduced + const sy1 = mem.readIntBig(u64, msg[i..][16..24]); + const sy0 = mem.readIntBig(u64, msg[i..][24..32]); + + const sy1r = @bitReverse(u64, sy1); + const sy0r = @bitReverse(u64, sy0); + const sy2 = sy0 ^ sy1; + const sy2r = sy0r ^ sy1r; + + const sz0 = bmul(sy0, st.h0); + const sz1 = bmul(sy1, st.h1); + const sz2 = bmul(sy2, st.h2) ^ sz0 ^ sz1; + const sz0h = bmul(sy0r, st.h0r); + const sz1h = bmul(sy1r, st.h1r); + const sz2h = bmul(sy2r, st.h2r) ^ sz0h ^ sz1h; + + // ((B0 * H^2) + B1 * H) (mod M) + z0 ^= sz0; + z1 ^= sz1; + z2 ^= sz2; + z0h ^= sz0h; + z1h ^= sz1h; + z2h ^= sz2h; + z0h = @bitReverse(u64, z0h) >> 1; + z1h = @bitReverse(u64, z1h) >> 1; + z2h = @bitReverse(u64, z2h) >> 1; + + var v3 = z1h; + var v2 = z1 ^ z2h; + var v1 = z0h ^ z2; + var v0 = z0; + + v3 = (v3 << 1) | (v2 >> 63); + v2 = (v2 << 1) | (v1 >> 63); + v1 = (v1 << 1) | (v0 >> 63); + v0 = (v0 << 1); + + v2 ^= v0 ^ (v0 >> 1) ^ (v0 >> 2) ^ (v0 >> 7); + v1 ^= (v0 << 63) ^ (v0 << 62) ^ (v0 << 57); + y1 = v3 ^ v1 ^ 
(v1 >> 1) ^ (v1 >> 2) ^ (v1 >> 7); + y0 = v2 ^ (v1 << 63) ^ (v1 << 62) ^ (v1 << 57); + } + } + + // single block while (i + 16 <= msg.len) : (i += 16) { y1 ^= mem.readIntBig(u64, msg[i..][0..8]); y0 ^= mem.readIntBig(u64, msg[i..][8..16]); @@ -90,16 +197,15 @@ pub const Ghash = struct { const z0 = bmul(y0, st.h0); const z1 = bmul(y1, st.h1); - var z2 = bmul(y2, st.h2); + var z2 = bmul(y2, st.h2) ^ z0 ^ z1; var z0h = bmul(y0r, st.h0r); var z1h = bmul(y1r, st.h1r); - var z2h = bmul(y2r, st.h2r); - z2 ^= z0 ^ z1; - z2h ^= z0h ^ z1h; + var z2h = bmul(y2r, st.h2r) ^ z0h ^ z1h; z0h = @bitReverse(u64, z0h) >> 1; z1h = @bitReverse(u64, z1h) >> 1; z2h = @bitReverse(u64, z2h) >> 1; + // shift & reduce var v3 = z1h; var v2 = z1 ^ z2h; var v1 = z0h ^ z2; From 97fd0974b9372f65c661908fb1ffc01d68255dae Mon Sep 17 00:00:00 2001 From: Frank Denis Date: Thu, 1 Oct 2020 01:34:23 +0200 Subject: [PATCH 4/4] ghash: add pclmul support on x86_64 --- lib/std/crypto/ghash.zig | 54 +++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/lib/std/crypto/ghash.zig b/lib/std/crypto/ghash.zig index 803547099f..6a1bf7c186 100644 --- a/lib/std/crypto/ghash.zig +++ b/lib/std/crypto/ghash.zig @@ -94,7 +94,18 @@ pub const Ghash = struct { } } - fn bmul(x: u64, y: u64) u64 { + inline fn clmul_pclmul(x: u64, y: u64) u64 { + const Vector = std.meta.Vector; + const product = asm ( + \\ vpclmulqdq $0x00, %[x], %[y], %[out] + : [out] "=x" (-> Vector(2, u64)) + : [x] "x" (@bitCast(Vector(2, u64), @as(u128, x))), + [y] "x" (@bitCast(Vector(2, u64), @as(u128, y))) + ); + return product[0]; + } + + fn clmul_soft(x: u64, y: u64) u64 { const x0 = x & 0x1111111111111111; const x1 = x & 0x2222222222222222; const x2 = x & 0x4444444444444444; @@ -111,10 +122,13 @@ pub const Ghash = struct { z1 &= 0x2222222222222222; z2 &= 0x4444444444444444; z3 &= 0x8888888888888888; - return z0 | z1 | z2 | z3; } + const has_pclmul = comptime std.Target.x86.featureSetHas(std.Target.current.cpu.features, .pclmul); + const has_avx = comptime std.Target.x86.featureSetHas(std.Target.current.cpu.features, .avx); + const clmul = if (std.Target.current.cpu.arch == .x86_64 and has_pclmul and has_avx) clmul_pclmul else clmul_soft; + fn blocks(st: *Ghash, msg: []const u8) void { assert(msg.len % 16 == 0); // GHASH blocks() expects full blocks var y1 = st.y1; @@ -134,12 +148,12 @@ pub const Ghash = struct { const y2 = y0 ^ y1; const y2r = y0r ^ y1r; - var z0 = bmul(y0, st.hh0); - var z1 = bmul(y1, st.hh1); - var z2 = bmul(y2, st.hh2) ^ z0 ^ z1; - var z0h = bmul(y0r, st.hh0r); - var z1h = bmul(y1r, st.hh1r); - var z2h = bmul(y2r, st.hh2r) ^ z0h ^ z1h; + var z0 = clmul(y0, st.hh0); + var z1 = clmul(y1, st.hh1); + var z2 = clmul(y2, st.hh2) ^ z0 ^ z1; + var z0h = clmul(y0r, st.hh0r); + var z1h = clmul(y1r, st.hh1r); + var z2h = clmul(y2r, st.hh2r) ^ z0h ^ z1h; // B1 * H unreduced const sy1 = mem.readIntBig(u64, msg[i..][16..24]); @@ -150,12 +164,12 @@ pub const Ghash = struct { const sy2 = sy0 ^ sy1; const sy2r = sy0r ^ sy1r; - const sz0 = bmul(sy0, st.h0); - const sz1 = bmul(sy1, st.h1); - const sz2 = bmul(sy2, st.h2) ^ sz0 ^ sz1; - const sz0h = bmul(sy0r, st.h0r); - const sz1h = bmul(sy1r, st.h1r); - const sz2h = bmul(sy2r, st.h2r) ^ sz0h ^ sz1h; + const sz0 = clmul(sy0, st.h0); + const sz1 = clmul(sy1, st.h1); + const sz2 = clmul(sy2, st.h2) ^ sz0 ^ sz1; + const sz0h = clmul(sy0r, st.h0r); + const sz1h = clmul(sy1r, st.h1r); + const sz2h = clmul(sy2r, st.h2r) ^ sz0h ^ sz1h; // ((B0 * H^2) + B1 * H) (mod M) z0 ^= sz0; @@ 
-195,12 +209,12 @@ pub const Ghash = struct { const y2 = y0 ^ y1; const y2r = y0r ^ y1r; - const z0 = bmul(y0, st.h0); - const z1 = bmul(y1, st.h1); - var z2 = bmul(y2, st.h2) ^ z0 ^ z1; - var z0h = bmul(y0r, st.h0r); - var z1h = bmul(y1r, st.h1r); - var z2h = bmul(y2r, st.h2r) ^ z0h ^ z1h; + const z0 = clmul(y0, st.h0); + const z1 = clmul(y1, st.h1); + var z2 = clmul(y2, st.h2) ^ z0 ^ z1; + var z0h = clmul(y0r, st.h0r); + var z1h = clmul(y1r, st.h1r); + var z2h = clmul(y2r, st.h2r) ^ z0h ^ z1h; z0h = @bitReverse(u64, z0h) >> 1; z1h = @bitReverse(u64, z1h) >> 1; z2h = @bitReverse(u64, z2h) >> 1;
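
A note on the bit-reversed operands in PATCH 1 (h0r/h1r/h2r and the z*h words),
for reviewers unfamiliar with the BearSSL-style constant-time GHASH:
bmul()/clmul_soft() only returns the low 64 bits of a 128-bit carry-less
product. Because carry-less multiplication has no carries to break the
symmetry,

    bitrev64(x) * bitrev64(y) == bitrev127(x * y)   (low 64 bits)

so the low half of the product of the reversed operands, bit-reversed again and
shifted right by one, is exactly the high half of x * y. That is what the
`@bitReverse(u64, z0h) >> 1` lines recover.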
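The 2-block aggregation in PATCH 3 unrolls the per-block recurrence
Y <- (Y ^ B) * H (all products in GF(2^128)) two steps at a time:

    ((Y ^ B0) * H ^ B1) * H == (Y ^ B0) * H^2 ^ B1 * H

which is why init() precomputes H^2 (the hh* fields): the unreduced products
of (Y ^ B0) * H^2 and B1 * H can be XORed together and folded with a single
shift-and-reduce step, halving the number of reductions per 32 bytes of input.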
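Finally, a minimal usage sketch of the API these patches converge on (not part
of the patches themselves; assumes a Zig toolchain contemporary with them, and
`msg`/`tag` are illustrative local names):

    const std = @import("std");
    const Ghash = std.crypto.onetimeauth.Ghash;

    pub fn main() void {
        // GHASH keys must be secret, unpredictable and never reused;
        // a fixed key is used here purely for illustration.
        const key = [_]u8{0x42} ** Ghash.minimum_key_length;
        const msg = "sample message to authenticate";

        // One-shot: after PATCH 2, key and tag are fixed-size array pointers.
        var tag: [Ghash.mac_length]u8 = undefined;
        Ghash.create(&tag, msg, &key);

        // Streaming: update() accepts arbitrary chunk sizes; partial blocks
        // are buffered internally until 16 bytes accumulate.
        var st = Ghash.init(&key);
        st.update(msg[0..10]);
        st.update(msg[10..]);
        var tag2: [Ghash.mac_length]u8 = undefined;
        st.final(&tag2);

        std.debug.assert(std.mem.eql(u8, &tag, &tag2));
    }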