diff --git a/lib/std/crypto.zig b/lib/std/crypto.zig
index f9fa50c692..bf3080038c 100644
--- a/lib/std/crypto.zig
+++ b/lib/std/crypto.zig
@@ -87,7 +87,8 @@ pub const kdf = struct {
 /// MAC functions requiring single-use secret keys.
 pub const onetimeauth = struct {
-    pub const Ghash = @import("crypto/ghash.zig").Ghash;
+    pub const Ghash = @import("crypto/ghash_polyval.zig").Ghash;
+    pub const Polyval = @import("crypto/ghash_polyval.zig").Polyval;
     pub const Poly1305 = @import("crypto/poly1305.zig").Poly1305;
 };
diff --git a/lib/std/crypto/benchmark.zig b/lib/std/crypto/benchmark.zig
index d4bf3d2633..a479906259 100644
--- a/lib/std/crypto/benchmark.zig
+++ b/lib/std/crypto/benchmark.zig
@@ -54,6 +54,7 @@ pub fn benchmarkHash(comptime Hash: anytype, comptime bytes: comptime_int) !u64
 const macs = [_]Crypto{
     Crypto{ .ty = crypto.onetimeauth.Ghash, .name = "ghash" },
+    Crypto{ .ty = crypto.onetimeauth.Polyval, .name = "polyval" },
     Crypto{ .ty = crypto.onetimeauth.Poly1305, .name = "poly1305" },
     Crypto{ .ty = crypto.auth.hmac.HmacMd5, .name = "hmac-md5" },
     Crypto{ .ty = crypto.auth.hmac.HmacSha1, .name = "hmac-sha1" },
diff --git a/lib/std/crypto/ghash.zig b/lib/std/crypto/ghash.zig
deleted file mode 100644
index c60710c4ba..0000000000
--- a/lib/std/crypto/ghash.zig
+++ /dev/null
@@ -1,415 +0,0 @@
-const std = @import("../std.zig");
-const builtin = @import("builtin");
-const assert = std.debug.assert;
-const math = std.math;
-const mem = std.mem;
-const utils = std.crypto.utils;
-
-const Precomp = u128;
-
-/// GHASH is a universal hash function that features multiplication
-/// by a fixed parameter within a Galois field.
-///
-/// It is not a general purpose hash function - The key must be secret, unpredictable and never reused.
-///
-/// GHASH is typically used to compute the authentication tag in the AES-GCM construction.
-pub const Ghash = struct {
-    pub const block_length: usize = 16;
-    pub const mac_length = 16;
-    pub const key_length = 16;
-
-    const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 2;
-    const agg_4_treshold = 22;
-    const agg_8_treshold = 84;
-    const agg_16_treshold = 328;
-
-    // Before the Haswell architecture, the carryless multiplication instruction was
-    // extremely slow. Even with 128-bit operands, using Karatsuba multiplication was
-    // thus faster than a schoolbook multiplication.
-    // This is no longer the case -- Modern CPUs, including ARM-based ones, have a fast
-    // carryless multiplication instruction; using 4 multiplications is now faster than
-    // 3 multiplications with extra shifts and additions.
-    const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .schoolbook;
-
-    hx: [pc_count]Precomp,
-    acc: u128 = 0,
-
-    leftover: usize = 0,
-    buf: [block_length]u8 align(16) = undefined,
-
-    /// Initialize the GHASH state with a key, and a minimum number of block count.
-    pub fn initForBlockCount(key: *const [key_length]u8, block_count: usize) Ghash {
-        const h0 = mem.readIntBig(u128, key[0..16]);
-
-        // We keep the values encoded as in GCM, not Polyval, i.e. without reversing the bits.
-        // This is fine, but the reversed result would be shifted by 1 bit. So, we shift h
-        // to compensate.
-        const carry = ((@as(u128, 0xc2) << 120) | 1) & (@as(u128, 0) -% (h0 >> 127));
-        const h = (h0 << 1) ^ carry;
-
-        var hx: [pc_count]Precomp = undefined;
-        hx[0] = h;
-        hx[1] = gcmReduce(clsq128(hx[0])); // h^2
-
-        if (builtin.mode != .ReleaseSmall) {
-            hx[2] = gcmReduce(clmul128(hx[1], h)); // h^3
-            hx[3] = gcmReduce(clsq128(hx[1])); // h^4 = h^2^2
-            if (block_count >= agg_8_treshold) {
-                hx[4] = gcmReduce(clmul128(hx[3], h)); // h^5
-                hx[5] = gcmReduce(clsq128(hx[2])); // h^6 = h^3^2
-                hx[6] = gcmReduce(clmul128(hx[5], h)); // h^7
-                hx[7] = gcmReduce(clsq128(hx[3])); // h^8 = h^4^2
-            }
-            if (block_count >= agg_16_treshold) {
-                var i: usize = 8;
-                while (i < 16) : (i += 2) {
-                    hx[i] = gcmReduce(clmul128(hx[i - 1], h));
-                    hx[i + 1] = gcmReduce(clsq128(hx[i / 2]));
-                }
-            }
-        }
-        return Ghash{ .hx = hx };
-    }
-
-    /// Initialize the GHASH state with a key.
-    pub fn init(key: *const [key_length]u8) Ghash {
-        return Ghash.initForBlockCount(key, math.maxInt(usize));
-    }
-
-    const Selector = enum { lo, hi, hi_lo };
-
-    // Carryless multiplication of two 64-bit integers for x86_64.
-    inline fn clmulPclmul(x: u128, y: u128, comptime half: Selector) u128 {
-        switch (half) {
-            .hi => {
-                const product = asm (
-                    \\ vpclmulqdq $0x11, %[x], %[y], %[out]
-                    : [out] "=x" (-> @Vector(2, u64)),
-                    : [x] "x" (@bitCast(@Vector(2, u64), x)),
-                      [y] "x" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .lo => {
-                const product = asm (
-                    \\ vpclmulqdq $0x00, %[x], %[y], %[out]
-                    : [out] "=x" (-> @Vector(2, u64)),
-                    : [x] "x" (@bitCast(@Vector(2, u64), x)),
-                      [y] "x" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .hi_lo => {
-                const product = asm (
-                    \\ vpclmulqdq $0x10, %[x], %[y], %[out]
-                    : [out] "=x" (-> @Vector(2, u64)),
-                    : [x] "x" (@bitCast(@Vector(2, u64), x)),
-                      [y] "x" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-        }
-    }
-
-    // Carryless multiplication of two 64-bit integers for ARM crypto.
-    inline fn clmulPmull(x: u128, y: u128, comptime half: Selector) u128 {
-        switch (half) {
-            .hi => {
-                const product = asm (
-                    \\ pmull2 %[out].1q, %[x].2d, %[y].2d
-                    : [out] "=w" (-> @Vector(2, u64)),
-                    : [x] "w" (@bitCast(@Vector(2, u64), x)),
-                      [y] "w" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .lo => {
-                const product = asm (
-                    \\ pmull %[out].1q, %[x].1d, %[y].1d
-                    : [out] "=w" (-> @Vector(2, u64)),
-                    : [x] "w" (@bitCast(@Vector(2, u64), x)),
-                      [y] "w" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-            .hi_lo => {
-                const product = asm (
-                    \\ pmull %[out].1q, %[x].1d, %[y].1d
-                    : [out] "=w" (-> @Vector(2, u64)),
-                    : [x] "w" (@bitCast(@Vector(2, u64), x >> 64)),
-                      [y] "w" (@bitCast(@Vector(2, u64), y)),
-                );
-                return @bitCast(u128, product);
-            },
-        }
-    }
-
-    // Software carryless multiplication of two 64-bit integers.
-    fn clmulSoft(x_: u128, y_: u128, comptime half: Selector) u128 {
-        const x = @truncate(u64, if (half == .hi or half == .hi_lo) x_ >> 64 else x_);
-        const y = @truncate(u64, if (half == .hi) y_ >> 64 else y_);
-
-        const x0 = x & 0x1111111111111110;
-        const x1 = x & 0x2222222222222220;
-        const x2 = x & 0x4444444444444440;
-        const x3 = x & 0x8888888888888880;
-        const y0 = y & 0x1111111111111111;
-        const y1 = y & 0x2222222222222222;
-        const y2 = y & 0x4444444444444444;
-        const y3 = y & 0x8888888888888888;
-        const z0 = (x0 * @as(u128, y0)) ^ (x1 * @as(u128, y3)) ^ (x2 * @as(u128, y2)) ^ (x3 * @as(u128, y1));
-        const z1 = (x0 * @as(u128, y1)) ^ (x1 * @as(u128, y0)) ^ (x2 * @as(u128, y3)) ^ (x3 * @as(u128, y2));
-        const z2 = (x0 * @as(u128, y2)) ^ (x1 * @as(u128, y1)) ^ (x2 * @as(u128, y0)) ^ (x3 * @as(u128, y3));
-        const z3 = (x0 * @as(u128, y3)) ^ (x1 * @as(u128, y2)) ^ (x2 * @as(u128, y1)) ^ (x3 * @as(u128, y0));
-
-        const x0_mask = @as(u64, 0) -% (x & 1);
-        const x1_mask = @as(u64, 0) -% ((x >> 1) & 1);
-        const x2_mask = @as(u64, 0) -% ((x >> 2) & 1);
-        const x3_mask = @as(u64, 0) -% ((x >> 3) & 1);
-        const extra = (x0_mask & y) ^ (@as(u128, x1_mask & y) << 1) ^
-            (@as(u128, x2_mask & y) << 2) ^ (@as(u128, x3_mask & y) << 3);
-
-        return (z0 & 0x11111111111111111111111111111111) ^
-            (z1 & 0x22222222222222222222222222222222) ^
-            (z2 & 0x44444444444444444444444444444444) ^
-            (z3 & 0x88888888888888888888888888888888) ^ extra;
-    }
-
-    const I256 = struct {
-        hi: u128,
-        lo: u128,
-        mid: u128,
-    };
-
-    inline fn xor256(x: *I256, y: I256) void {
-        x.* = I256{
-            .hi = x.hi ^ y.hi,
-            .lo = x.lo ^ y.lo,
-            .mid = x.mid ^ y.mid,
-        };
-    }
-
-    // Square a 128-bit integer in GF(2^128).
-    fn clsq128(x: u128) I256 {
-        return .{
-            .hi = clmul(x, x, .hi),
-            .lo = clmul(x, x, .lo),
-            .mid = 0,
-        };
-    }
-
-    // Multiply two 128-bit integers in GF(2^128).
-    inline fn clmul128(x: u128, y: u128) I256 {
-        if (mul_algorithm == .karatsuba) {
-            const x_hi = @truncate(u64, x >> 64);
-            const y_hi = @truncate(u64, y >> 64);
-            const r_lo = clmul(x, y, .lo);
-            const r_hi = clmul(x, y, .hi);
-            const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
-            return .{
-                .hi = r_hi,
-                .lo = r_lo,
-                .mid = r_mid,
-            };
-        } else {
-            return .{
-                .hi = clmul(x, y, .hi),
-                .lo = clmul(x, y, .lo),
-                .mid = clmul(x, y, .hi_lo) ^ clmul(y, x, .hi_lo),
-            };
-        }
-    }
-
-    // Reduce a 256-bit representative of a polynomial modulo the irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1.
-    // This is done *without reversing the bits*, using Shay Gueron's black magic demysticated here:
-    // https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
-    inline fn gcmReduce(x: I256) u128 {
-        const hi = x.hi ^ (x.mid >> 64);
-        const lo = x.lo ^ (x.mid << 64);
-        const p64 = (((1 << 121) | (1 << 126) | (1 << 127)) >> 64);
-        const a = clmul(lo, p64, .lo);
-        const b = ((lo << 64) | (lo >> 64)) ^ a;
-        const c = clmul(b, p64, .lo);
-        const d = ((b << 64) | (b >> 64)) ^ c;
-        return d ^ hi;
-    }
-
-    const has_pclmul = std.Target.x86.featureSetHas(builtin.cpu.features, .pclmul);
-    const has_avx = std.Target.x86.featureSetHas(builtin.cpu.features, .avx);
-    const has_armaes = std.Target.aarch64.featureSetHas(builtin.cpu.features, .aes);
-    const clmul = if (builtin.cpu.arch == .x86_64 and has_pclmul and has_avx) impl: {
-        break :impl clmulPclmul;
-    } else if (builtin.cpu.arch == .aarch64 and has_armaes) impl: {
-        break :impl clmulPmull;
-    } else impl: {
-        break :impl clmulSoft;
-    };
-
-    // Process 16 byte blocks.
-    fn blocks(st: *Ghash, msg: []const u8) void {
-        assert(msg.len % 16 == 0); // GHASH blocks() expects full blocks
-        var acc = st.acc;
-
-        var i: usize = 0;
-
-        if (builtin.mode != .ReleaseSmall and msg.len >= agg_16_treshold * block_length) {
-            // 16-blocks aggregated reduction
-            while (i + 256 <= msg.len) : (i += 256) {
-                var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[15 - 0]);
-                comptime var j = 1;
-                inline while (j < 16) : (j += 1) {
-                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[15 - j]));
-                }
-                acc = gcmReduce(u);
-            }
-        } else if (builtin.mode != .ReleaseSmall and msg.len >= agg_8_treshold * block_length) {
-            // 8-blocks aggregated reduction
-            while (i + 128 <= msg.len) : (i += 128) {
-                var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[7 - 0]);
-                comptime var j = 1;
-                inline while (j < 8) : (j += 1) {
-                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[7 - j]));
-                }
-                acc = gcmReduce(u);
-            }
-        } else if (builtin.mode != .ReleaseSmall and msg.len >= agg_4_treshold * block_length) {
-            // 4-blocks aggregated reduction
-            while (i + 64 <= msg.len) : (i += 64) {
-                var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[3 - 0]);
-                comptime var j = 1;
-                inline while (j < 4) : (j += 1) {
-                    xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[3 - j]));
-                }
-                acc = gcmReduce(u);
-            }
-        }
-        // 2-blocks aggregated reduction
-        while (i + 32 <= msg.len) : (i += 32) {
-            var u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[1 - 0]);
-            comptime var j = 1;
-            inline while (j < 2) : (j += 1) {
-                xor256(&u, clmul128(mem.readIntBig(u128, msg[i..][j * 16 ..][0..16]), st.hx[1 - j]));
-            }
-            acc = gcmReduce(u);
-        }
-        // remaining blocks
-        if (i < msg.len) {
-            const u = clmul128(acc ^ mem.readIntBig(u128, msg[i..][0..16]), st.hx[0]);
-            acc = gcmReduce(u);
-            i += 16;
-        }
-        assert(i == msg.len);
-        st.acc = acc;
-    }
-
-    /// Absorb a message into the GHASH state.
-    pub fn update(st: *Ghash, m: []const u8) void {
-        var mb = m;
-
-        if (st.leftover > 0) {
-            const want = math.min(block_length - st.leftover, mb.len);
-            const mc = mb[0..want];
-            for (mc) |x, i| {
-                st.buf[st.leftover + i] = x;
-            }
-            mb = mb[want..];
-            st.leftover += want;
-            if (st.leftover < block_length) {
-                return;
-            }
-            st.blocks(&st.buf);
-            st.leftover = 0;
-        }
-        if (mb.len >= block_length) {
-            const want = mb.len & ~(block_length - 1);
-            st.blocks(mb[0..want]);
-            mb = mb[want..];
-        }
-        if (mb.len > 0) {
-            for (mb) |x, i| {
-                st.buf[st.leftover + i] = x;
-            }
-            st.leftover += mb.len;
-        }
-    }
-
-    /// Zero-pad to align the next input to the first byte of a block
-    pub fn pad(st: *Ghash) void {
-        if (st.leftover == 0) {
-            return;
-        }
-        var i = st.leftover;
-        while (i < block_length) : (i += 1) {
-            st.buf[i] = 0;
-        }
-        st.blocks(&st.buf);
-        st.leftover = 0;
-    }
-
-    /// Compute the GHASH of the entire input.
-    pub fn final(st: *Ghash, out: *[mac_length]u8) void {
-        st.pad();
-        mem.writeIntBig(u128, out[0..16], st.acc);
-
-        utils.secureZero(u8, @ptrCast([*]u8, st)[0..@sizeOf(Ghash)]);
-    }
-
-    /// Compute the GHASH of a message.
-    pub fn create(out: *[mac_length]u8, msg: []const u8, key: *const [key_length]u8) void {
-        var st = Ghash.init(key);
-        st.update(msg);
-        st.final(out);
-    }
-};
-
-const htest = @import("test.zig");
-
-test "ghash" {
-    const key = [_]u8{0x42} ** 16;
-    const m = [_]u8{0x69} ** 256;
-
-    var st = Ghash.init(&key);
-    st.update(&m);
-    var out: [16]u8 = undefined;
-    st.final(&out);
-    try htest.assertEqual("889295fa746e8b174bf4ec80a65dea41", &out);
-
-    st = Ghash.init(&key);
-    st.update(m[0..100]);
-    st.update(m[100..]);
-    st.final(&out);
-    try htest.assertEqual("889295fa746e8b174bf4ec80a65dea41", &out);
-}
-
-test "ghash2" {
-    var key: [16]u8 = undefined;
-    var i: usize = 0;
-    while (i < key.len) : (i += 1) {
-        key[i] = @intCast(u8, i * 15 + 1);
-    }
-    const tvs = [_]struct { len: usize, hash: [:0]const u8 }{
-        .{ .len = 5263, .hash = "b9395f37c131cd403a327ccf82ec016a" },
-        .{ .len = 1361, .hash = "8c24cb3664e9a36e32ddef0c8178ab33" },
-        .{ .len = 1344, .hash = "015d7243b52d62eee8be33a66a9658cc" },
-        .{ .len = 1000, .hash = "56e148799944193f351f2014ef9dec9d" },
-        .{ .len = 512, .hash = "ca4882ce40d37546185c57709d17d1ca" },
-        .{ .len = 128, .hash = "d36dc3aac16cfe21a75cd5562d598c1c" },
-        .{ .len = 111, .hash = "6e2bea99700fd19cf1694e7b56543320" },
-        .{ .len = 80, .hash = "aa28f4092a7cca155f3de279cf21aa17" },
-        .{ .len = 16, .hash = "9d7eb5ed121a52a4b0996e4ec9b98911" },
-        .{ .len = 1, .hash = "968a203e5c7a98b6d4f3112f4d6b89a7" },
-        .{ .len = 0, .hash = "00000000000000000000000000000000" },
-    };
-    inline for (tvs) |tv| {
-        var m: [tv.len]u8 = undefined;
-        i = 0;
-        while (i < m.len) : (i += 1) {
-            m[i] = @truncate(u8, i % 254 + 1);
-        }
-        var st = Ghash.init(&key);
-        st.update(&m);
-        var out: [16]u8 = undefined;
-        st.final(&out);
-        try htest.assertEqual(tv.hash, &out);
-    }
-}
diff --git a/lib/std/crypto/ghash_polyval.zig b/lib/std/crypto/ghash_polyval.zig
new file mode 100644
index 0000000000..77916fdca1
--- /dev/null
+++ b/lib/std/crypto/ghash_polyval.zig
@@ -0,0 +1,444 @@
+const std = @import("../std.zig");
+const builtin = @import("builtin");
+const assert = std.debug.assert;
+const math = std.math;
+const mem = std.mem;
+const utils = std.crypto.utils;
+
+const Precomp = u128;
+
+/// GHASH is a universal hash function that uses multiplication by a fixed
+/// parameter within a Galois field.
+///
+/// It is not a general-purpose hash function: the key must be secret, unpredictable and never reused.
+///
+/// GHASH is typically used to compute the authentication tag in the AES-GCM construction.
+pub const Ghash = Hash(.Big, true);
+
+/// POLYVAL is a universal hash function that uses multiplication by a fixed
+/// parameter within a Galois field.
+///
+/// It is not a general-purpose hash function: the key must be secret, unpredictable and never reused.
+///
+/// POLYVAL is typically used to compute the authentication tag in the AES-GCM-SIV construction.
+pub const Polyval = Hash(.Little, false);
+
+fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
+    return struct {
+        const Self = @This();
+
+        pub const block_length: usize = 16;
+        pub const mac_length = 16;
+        pub const key_length = 16;
+
+        const pc_count = if (builtin.mode != .ReleaseSmall) 16 else 2;
+        const agg_4_treshold = 22;
+        const agg_8_treshold = 84;
+        const agg_16_treshold = 328;
+
+        // Before the Haswell architecture, the carryless multiplication instruction was
+        // extremely slow. Even with 128-bit operands, using Karatsuba multiplication was
+        // thus faster than a schoolbook multiplication.
+        // This is no longer the case -- modern CPUs, including ARM-based ones, have a fast
+        // carryless multiplication instruction; using 4 multiplications is now faster than
+        // 3 multiplications with extra shifts and additions.
+        const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .schoolbook;
+
+        hx: [pc_count]Precomp,
+        acc: u128 = 0,
+
+        leftover: usize = 0,
+        buf: [block_length]u8 align(16) = undefined,
+
+        /// Initialize the state with a key, and the minimum number of blocks that will be hashed.
+        pub fn initForBlockCount(key: *const [key_length]u8, block_count: usize) Self {
+            var h = mem.readInt(u128, key[0..16], endian);
+            if (shift_key) {
+                // Shift the key by 1 bit to the left & reduce for GCM.
+                const carry = ((@as(u128, 0xc2) << 120) | 1) & (@as(u128, 0) -% (h >> 127));
+                h = (h << 1) ^ carry;
+            }
+            var hx: [pc_count]Precomp = undefined;
+            hx[0] = h;
+            hx[1] = reduce(clsq128(hx[0])); // h^2
+
+            if (builtin.mode != .ReleaseSmall) {
+                hx[2] = reduce(clmul128(hx[1], h)); // h^3
+                hx[3] = reduce(clsq128(hx[1])); // h^4 = h^2^2
+                if (block_count >= agg_8_treshold) {
+                    hx[4] = reduce(clmul128(hx[3], h)); // h^5
+                    hx[5] = reduce(clsq128(hx[2])); // h^6 = h^3^2
+                    hx[6] = reduce(clmul128(hx[5], h)); // h^7
+                    hx[7] = reduce(clsq128(hx[3])); // h^8 = h^4^2
+                }
+                if (block_count >= agg_16_treshold) {
+                    var i: usize = 8;
+                    while (i < 16) : (i += 2) {
+                        hx[i] = reduce(clmul128(hx[i - 1], h));
+                        hx[i + 1] = reduce(clsq128(hx[i / 2]));
+                    }
+                }
+            }
+            return Self{ .hx = hx };
+        }
+
+        /// Initialize the state with a key.
+        pub fn init(key: *const [key_length]u8) Self {
+            return Self.initForBlockCount(key, math.maxInt(usize));
+        }
+
+        const Selector = enum { lo, hi, hi_lo };
+
+        // Carryless multiplication of two 64-bit integers for x86_64.
+        inline fn clmulPclmul(x: u128, y: u128, comptime half: Selector) u128 {
+            switch (half) {
+                .hi => {
+                    const product = asm (
+                        \\ vpclmulqdq $0x11, %[x], %[y], %[out]
+                        : [out] "=x" (-> @Vector(2, u64)),
+                        : [x] "x" (@bitCast(@Vector(2, u64), x)),
+                          [y] "x" (@bitCast(@Vector(2, u64), y)),
+                    );
+                    return @bitCast(u128, product);
+                },
+                .lo => {
+                    const product = asm (
+                        \\ vpclmulqdq $0x00, %[x], %[y], %[out]
+                        : [out] "=x" (-> @Vector(2, u64)),
+                        : [x] "x" (@bitCast(@Vector(2, u64), x)),
+                          [y] "x" (@bitCast(@Vector(2, u64), y)),
+                    );
+                    return @bitCast(u128, product);
+                },
+                .hi_lo => {
+                    const product = asm (
+                        \\ vpclmulqdq $0x10, %[x], %[y], %[out]
+                        : [out] "=x" (-> @Vector(2, u64)),
+                        : [x] "x" (@bitCast(@Vector(2, u64), x)),
+                          [y] "x" (@bitCast(@Vector(2, u64), y)),
+                    );
+                    return @bitCast(u128, product);
+                },
+            }
+        }
+
+        // Carryless multiplication of two 64-bit integers for ARM crypto.
+        inline fn clmulPmull(x: u128, y: u128, comptime half: Selector) u128 {
+            switch (half) {
+                .hi => {
+                    const product = asm (
+                        \\ pmull2 %[out].1q, %[x].2d, %[y].2d
+                        : [out] "=w" (-> @Vector(2, u64)),
+                        : [x] "w" (@bitCast(@Vector(2, u64), x)),
+                          [y] "w" (@bitCast(@Vector(2, u64), y)),
+                    );
+                    return @bitCast(u128, product);
+                },
+                .lo => {
+                    const product = asm (
+                        \\ pmull %[out].1q, %[x].1d, %[y].1d
+                        : [out] "=w" (-> @Vector(2, u64)),
+                        : [x] "w" (@bitCast(@Vector(2, u64), x)),
+                          [y] "w" (@bitCast(@Vector(2, u64), y)),
+                    );
+                    return @bitCast(u128, product);
+                },
+                .hi_lo => {
+                    const product = asm (
+                        \\ pmull %[out].1q, %[x].1d, %[y].1d
+                        : [out] "=w" (-> @Vector(2, u64)),
+                        : [x] "w" (@bitCast(@Vector(2, u64), x >> 64)),
+                          [y] "w" (@bitCast(@Vector(2, u64), y)),
+                    );
+                    return @bitCast(u128, product);
+                },
+            }
+        }
+
+        // Software carryless multiplication of two 64-bit integers.
+        fn clmulSoft(x_: u128, y_: u128, comptime half: Selector) u128 {
+            const x = @truncate(u64, if (half == .hi or half == .hi_lo) x_ >> 64 else x_);
+            const y = @truncate(u64, if (half == .hi) y_ >> 64 else y_);
+
+            const x0 = x & 0x1111111111111110;
+            const x1 = x & 0x2222222222222220;
+            const x2 = x & 0x4444444444444440;
+            const x3 = x & 0x8888888888888880;
+            const y0 = y & 0x1111111111111111;
+            const y1 = y & 0x2222222222222222;
+            const y2 = y & 0x4444444444444444;
+            const y3 = y & 0x8888888888888888;
+            const z0 = (x0 * @as(u128, y0)) ^ (x1 * @as(u128, y3)) ^ (x2 * @as(u128, y2)) ^ (x3 * @as(u128, y1));
+            const z1 = (x0 * @as(u128, y1)) ^ (x1 * @as(u128, y0)) ^ (x2 * @as(u128, y3)) ^ (x3 * @as(u128, y2));
+            const z2 = (x0 * @as(u128, y2)) ^ (x1 * @as(u128, y1)) ^ (x2 * @as(u128, y0)) ^ (x3 * @as(u128, y3));
+            const z3 = (x0 * @as(u128, y3)) ^ (x1 * @as(u128, y2)) ^ (x2 * @as(u128, y1)) ^ (x3 * @as(u128, y0));
+
+            const x0_mask = @as(u64, 0) -% (x & 1);
+            const x1_mask = @as(u64, 0) -% ((x >> 1) & 1);
+            const x2_mask = @as(u64, 0) -% ((x >> 2) & 1);
+            const x3_mask = @as(u64, 0) -% ((x >> 3) & 1);
+            const extra = (x0_mask & y) ^ (@as(u128, x1_mask & y) << 1) ^
+                (@as(u128, x2_mask & y) << 2) ^ (@as(u128, x3_mask & y) << 3);
+
+            return (z0 & 0x11111111111111111111111111111111) ^
+                (z1 & 0x22222222222222222222222222222222) ^
+                (z2 & 0x44444444444444444444444444444444) ^
+                (z3 & 0x88888888888888888888888888888888) ^ extra;
+        }
+
+        const I256 = struct {
+            hi: u128,
+            lo: u128,
+            mid: u128,
+        };
+
+        inline fn xor256(x: *I256, y: I256) void {
+            x.* = I256{
+                .hi = x.hi ^ y.hi,
+                .lo = x.lo ^ y.lo,
+                .mid = x.mid ^ y.mid,
+            };
+        }
+
+        // Square a 128-bit integer in GF(2^128).
+        fn clsq128(x: u128) I256 {
+            return .{
+                .hi = clmul(x, x, .hi),
+                .lo = clmul(x, x, .lo),
+                .mid = 0,
+            };
+        }
+
+        // Multiply two 128-bit integers in GF(2^128).
+        inline fn clmul128(x: u128, y: u128) I256 {
+            if (mul_algorithm == .karatsuba) {
+                const x_hi = @truncate(u64, x >> 64);
+                const y_hi = @truncate(u64, y >> 64);
+                const r_lo = clmul(x, y, .lo);
+                const r_hi = clmul(x, y, .hi);
+                const r_mid = clmul(x ^ x_hi, y ^ y_hi, .lo) ^ r_lo ^ r_hi;
+                return .{
+                    .hi = r_hi,
+                    .lo = r_lo,
+                    .mid = r_mid,
+                };
+            } else {
+                return .{
+                    .hi = clmul(x, y, .hi),
+                    .lo = clmul(x, y, .lo),
+                    .mid = clmul(x, y, .hi_lo) ^ clmul(y, x, .hi_lo),
+                };
+            }
+        }
+
+        // Reduce a 256-bit representative of a polynomial modulo the irreducible polynomial x^128 + x^127 + x^126 + x^121 + 1.
+        // This is done using Shay Gueron's black magic demystified here:
+        // https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
+        inline fn reduce(x: I256) u128 {
+            const hi = x.hi ^ (x.mid >> 64);
+            const lo = x.lo ^ (x.mid << 64);
+            const p64 = (((1 << 121) | (1 << 126) | (1 << 127)) >> 64);
+            const a = clmul(lo, p64, .lo);
+            const b = ((lo << 64) | (lo >> 64)) ^ a;
+            const c = clmul(b, p64, .lo);
+            const d = ((b << 64) | (b >> 64)) ^ c;
+            return d ^ hi;
+        }
+
+        const has_pclmul = std.Target.x86.featureSetHas(builtin.cpu.features, .pclmul);
+        const has_avx = std.Target.x86.featureSetHas(builtin.cpu.features, .avx);
+        const has_armaes = std.Target.aarch64.featureSetHas(builtin.cpu.features, .aes);
+        const clmul = if (builtin.cpu.arch == .x86_64 and has_pclmul and has_avx) impl: {
+            break :impl clmulPclmul;
+        } else if (builtin.cpu.arch == .aarch64 and has_armaes) impl: {
+            break :impl clmulPmull;
+        } else impl: {
+            break :impl clmulSoft;
+        };
+
+        // Process 16-byte blocks.
+        fn blocks(st: *Self, msg: []const u8) void {
+            assert(msg.len % 16 == 0); // blocks() expects full blocks
+            var acc = st.acc;
+
+            var i: usize = 0;
+
+            if (builtin.mode != .ReleaseSmall and msg.len >= agg_16_treshold * block_length) {
+                // 16-blocks aggregated reduction
+                while (i + 256 <= msg.len) : (i += 256) {
+                    var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[15 - 0]);
+                    comptime var j = 1;
+                    inline while (j < 16) : (j += 1) {
+                        xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[15 - j]));
+                    }
+                    acc = reduce(u);
+                }
+            } else if (builtin.mode != .ReleaseSmall and msg.len >= agg_8_treshold * block_length) {
+                // 8-blocks aggregated reduction
+                while (i + 128 <= msg.len) : (i += 128) {
+                    var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[7 - 0]);
+                    comptime var j = 1;
+                    inline while (j < 8) : (j += 1) {
+                        xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[7 - j]));
+                    }
+                    acc = reduce(u);
+                }
+            } else if (builtin.mode != .ReleaseSmall and msg.len >= agg_4_treshold * block_length) {
+                // 4-blocks aggregated reduction
+                while (i + 64 <= msg.len) : (i += 64) {
+                    var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[3 - 0]);
+                    comptime var j = 1;
+                    inline while (j < 4) : (j += 1) {
+                        xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[3 - j]));
+                    }
+                    acc = reduce(u);
+                }
+            }
+            // 2-blocks aggregated reduction
+            while (i + 32 <= msg.len) : (i += 32) {
+                var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[1 - 0]);
+                comptime var j = 1;
+                inline while (j < 2) : (j += 1) {
+                    xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[1 - j]));
+                }
+                acc = reduce(u);
+            }
+            // remaining blocks
+            if (i < msg.len) {
+                const u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[0]);
+                acc = reduce(u);
+                i += 16;
+            }
+            assert(i == msg.len);
+            st.acc = acc;
+        }
+
+        /// Absorb a message into the state.
+        pub fn update(st: *Self, m: []const u8) void {
+            var mb = m;
+
+            if (st.leftover > 0) {
+                const want = math.min(block_length - st.leftover, mb.len);
+                const mc = mb[0..want];
+                for (mc) |x, i| {
+                    st.buf[st.leftover + i] = x;
+                }
+                mb = mb[want..];
+                st.leftover += want;
+                if (st.leftover < block_length) {
+                    return;
+                }
+                st.blocks(&st.buf);
+                st.leftover = 0;
+            }
+            if (mb.len >= block_length) {
+                const want = mb.len & ~(block_length - 1);
+                st.blocks(mb[0..want]);
+                mb = mb[want..];
+            }
+            if (mb.len > 0) {
+                for (mb) |x, i| {
+                    st.buf[st.leftover + i] = x;
+                }
+                st.leftover += mb.len;
+            }
+        }
+
+        /// Zero-pad to align the next input to the first byte of a block.
+        pub fn pad(st: *Self) void {
+            if (st.leftover == 0) {
+                return;
+            }
+            var i = st.leftover;
+            while (i < block_length) : (i += 1) {
+                st.buf[i] = 0;
+            }
+            st.blocks(&st.buf);
+            st.leftover = 0;
+        }
+
+        /// Compute the hash of the entire input.
+        pub fn final(st: *Self, out: *[mac_length]u8) void {
+            st.pad();
+            mem.writeInt(u128, out[0..16], st.acc, endian);
+
+            utils.secureZero(u8, @ptrCast([*]u8, st)[0..@sizeOf(Self)]);
+        }
+
+        /// Compute the hash of a message.
+        pub fn create(out: *[mac_length]u8, msg: []const u8, key: *const [key_length]u8) void {
+            var st = Self.init(key);
+            st.update(msg);
+            st.final(out);
+        }
+    };
+}
+
+const htest = @import("test.zig");
+
+test "ghash" {
+    const key = [_]u8{0x42} ** 16;
+    const m = [_]u8{0x69} ** 256;
+
+    var st = Ghash.init(&key);
+    st.update(&m);
+    var out: [16]u8 = undefined;
+    st.final(&out);
+    try htest.assertEqual("889295fa746e8b174bf4ec80a65dea41", &out);
+
+    st = Ghash.init(&key);
+    st.update(m[0..100]);
+    st.update(m[100..]);
+    st.final(&out);
+    try htest.assertEqual("889295fa746e8b174bf4ec80a65dea41", &out);
+}
+
+test "ghash2" {
+    var key: [16]u8 = undefined;
+    var i: usize = 0;
+    while (i < key.len) : (i += 1) {
+        key[i] = @intCast(u8, i * 15 + 1);
+    }
+    const tvs = [_]struct { len: usize, hash: [:0]const u8 }{
+        .{ .len = 5263, .hash = "b9395f37c131cd403a327ccf82ec016a" },
+        .{ .len = 1361, .hash = "8c24cb3664e9a36e32ddef0c8178ab33" },
+        .{ .len = 1344, .hash = "015d7243b52d62eee8be33a66a9658cc" },
+        .{ .len = 1000, .hash = "56e148799944193f351f2014ef9dec9d" },
+        .{ .len = 512, .hash = "ca4882ce40d37546185c57709d17d1ca" },
+        .{ .len = 128, .hash = "d36dc3aac16cfe21a75cd5562d598c1c" },
+        .{ .len = 111, .hash = "6e2bea99700fd19cf1694e7b56543320" },
+        .{ .len = 80, .hash = "aa28f4092a7cca155f3de279cf21aa17" },
+        .{ .len = 16, .hash = "9d7eb5ed121a52a4b0996e4ec9b98911" },
+        .{ .len = 1, .hash = "968a203e5c7a98b6d4f3112f4d6b89a7" },
+        .{ .len = 0, .hash = "00000000000000000000000000000000" },
+    };
+    inline for (tvs) |tv| {
+        var m: [tv.len]u8 = undefined;
+        i = 0;
+        while (i < m.len) : (i += 1) {
+            m[i] = @truncate(u8, i % 254 + 1);
+        }
+        var st = Ghash.init(&key);
+        st.update(&m);
+        var out: [16]u8 = undefined;
+        st.final(&out);
+        try htest.assertEqual(tv.hash, &out);
+    }
+}
+
+test "polyval" {
+    const key = [_]u8{0x42} ** 16;
+    const m = [_]u8{0x69} ** 256;
+
+    var st = Polyval.init(&key);
+    st.update(&m);
+    var out: [16]u8 = undefined;
+    st.final(&out);
+    try htest.assertEqual("0713c82b170eef25c8955ddf72c85ccb", &out);
+
+    st = Polyval.init(&key);
+    st.update(m[0..100]);
+    st.update(m[100..]);
+    st.final(&out);
+    try htest.assertEqual("0713c82b170eef25c8955ddf72c85ccb", &out);
+}
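
A few usage sketches and implementation notes on the new module follow. First, a minimal sketch of the newly exported Polyval (the Ghash API is unchanged), mirroring the calls exercised by the tests above; the test name and message split points are arbitrary, and std.testing is only used to check that the one-shot and streaming paths agree:

const std = @import("std");
const Polyval = std.crypto.onetimeauth.Polyval;

test "polyval: one-shot and streaming APIs agree" {
    const key = [_]u8{0x42} ** 16;
    const msg = [_]u8{0x69} ** 48;

    // One-shot: key in, message in, 16-byte tag out.
    var tag1: [Polyval.mac_length]u8 = undefined;
    Polyval.create(&tag1, &msg, &key);

    // Streaming: the same input split across two update() calls;
    // partial blocks are buffered internally and final() pads.
    var st = Polyval.init(&key);
    st.update(msg[0..10]);
    st.update(msg[10..]);
    var tag2: [Polyval.mac_length]u8 = undefined;
    st.final(&tag2);

    try std.testing.expectEqualSlices(u8, &tag1, &tag2);
}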
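
When the total input length is known up front, initForBlockCount lets the implementation skip precomputing powers of h that will never be used; passing the actual block count appears safe because blocks() selects its aggregation level using the same thresholds. A sketch under that assumption:

test "polyval: size-hinted initialization" {
    const key = [_]u8{0x42} ** 16;
    const msg = [_]u8{0x69} ** 256;

    // Hint the expected number of 16-byte blocks (here, 16) so that only
    // the required entries of the hx table are precomputed.
    var st = Polyval.initForBlockCount(&key, msg.len / Polyval.block_length);
    st.update(&msg);
    var tag: [Polyval.mac_length]u8 = undefined;
    st.final(&tag);
}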
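
Besides byte order, the shift_key parameter is what separates the two constructions: GHASH keeps the key in GCM's bit-reflected encoding, so the carryless product comes out off by a factor of x, and doubling the key once in the field compensates. (RFC 8452 describes the same correspondence between GHASH and POLYVAL in terms of a mulX operation.) The branchless doubling step from initForBlockCount, extracted into a standalone sketch with a hypothetical name:

// Multiply h0 by x in GF(2^128) modulo x^128 + x^127 + x^126 + x^121 + 1.
// If the top bit shifts out, the mask selects the constant that folds the
// overflow back into the low bits; no data-dependent branch is taken.
fn mulX(h0: u128) u128 {
    const carry = ((@as(u128, 0xc2) << 120) | 1) & (@as(u128, 0) -% (h0 >> 127));
    return (h0 << 1) ^ carry;
}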
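
blocks() batches the expensive modular reduction. The baseline is Horner's rule, one multiply-and-reduce per block; because the reduction is linear over XOR, several unreduced 256-bit products can instead be accumulated against precomputed powers of h and reduced once. The 4-block path, written out schematically in terms of the module's own helpers (m0..m3 stand for the next four input blocks, already loaded with the configured endianness):

// Per-block Horner step: one reduction per block.
acc = reduce(clmul128(acc ^ m0, hx[0])); // (acc ^ m0) * h

// 4-block aggregated step: one reduction for four blocks, using
// hx = { h, h^2, h^3, h^4 } precomputed in initForBlockCount().
var u = clmul128(acc ^ m0, hx[3]); // (acc ^ m0) * h^4
xor256(&u, clmul128(m1, hx[2])); //           m1 * h^3
xor256(&u, clmul128(m2, hx[1])); //           m2 * h^2
xor256(&u, clmul128(m3, hx[0])); //           m3 * h
acc = reduce(u);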
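
Finally, the clmulSoft fallback deserves a note: it avoids secret-dependent branches. The obvious implementation below branches on the bits of one operand, which leaks key material through timing. The masked version in the patch instead splits each operand into four bit-strided subsets so that the carries of an ordinary integer multiplication land in bit positions that are masked away afterwards, with the low four bits of x folded in separately through the extra term. The naive variant, for contrast only (a hypothetical helper, not part of the patch):

// Shift-and-xor carryless multiply: correct but slow, and variable-time
// because the branch depends on secret bits of y.
fn clmulNaive(x: u64, y: u64) u128 {
    var z: u128 = 0;
    var i: u7 = 0;
    while (i < 64) : (i += 1) {
        if ((y >> @intCast(u6, i)) & 1 != 0) {
            z ^= @as(u128, x) << i;
        }
    }
    return z;
}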