diff --git a/lib/std/crypto/benchmark.zig b/lib/std/crypto/benchmark.zig index 9b6056370c..f295276210 100644 --- a/lib/std/crypto/benchmark.zig +++ b/lib/std/crypto/benchmark.zig @@ -30,6 +30,7 @@ const hashes = [_]Crypto{ Crypto{ .ty = crypto.hash.sha3.Shake256, .name = "shake-256" }, Crypto{ .ty = crypto.hash.sha3.TurboShake128(null), .name = "turboshake-128" }, Crypto{ .ty = crypto.hash.sha3.TurboShake256(null), .name = "turboshake-256" }, + Crypto{ .ty = crypto.hash.sha3.KT128, .name = "kt128" }, Crypto{ .ty = crypto.hash.blake2.Blake2s256, .name = "blake2s" }, Crypto{ .ty = crypto.hash.blake2.Blake2b512, .name = "blake2b" }, Crypto{ .ty = crypto.hash.Blake3, .name = "blake3" }, @@ -37,6 +38,7 @@ const hashes = [_]Crypto{ const parallel_hashes = [_]Crypto{ Crypto{ .ty = crypto.hash.Blake3, .name = "blake3-parallel" }, + Crypto{ .ty = crypto.hash.sha3.KT128, .name = "kt128-parallel" }, }; const block_size: usize = 8 * 8192; diff --git a/lib/std/crypto/kangarootwelve.zig b/lib/std/crypto/kangarootwelve.zig new file mode 100644 index 0000000000..8c784aaa1d --- /dev/null +++ b/lib/std/crypto/kangarootwelve.zig @@ -0,0 +1,1647 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const crypto = std.crypto; +const Allocator = std.mem.Allocator; + +const TurboSHAKE128State = crypto.hash.sha3.TurboShake128(0x06); +const TurboSHAKE256State = crypto.hash.sha3.TurboShake256(0x06); + +const chunk_size: usize = 8192; // Chunk size for tree hashing (8 KiB) +const cache_line_size = std.atomic.cache_line; + +// Optimal SIMD vector length for u64 on this target platform +const optimal_vector_len = std.simd.suggestVectorLength(u64) orelse 1; + +// Round constants for Keccak-p[1600,12] +const RC = [12]u64{ + 0x000000008000808B, + 0x800000000000008B, + 0x8000000000008089, + 0x8000000000008003, + 0x8000000000008002, + 0x8000000000000080, + 0x000000000000800A, + 0x800000008000000A, + 0x8000000080008081, + 0x8000000000008080, + 0x0000000080000001, + 0x8000000080008008, +}; + +/// Generic KangarooTwelve variant builder. +/// Creates a variant type with specific cryptographic parameters. +fn KangarooVariant( + comptime security_level_bits: comptime_int, + comptime rate_bytes: usize, + comptime cv_size_bytes: usize, + comptime StateTypeParam: type, + comptime sep_x: usize, + comptime sep_y: usize, + comptime pad_x: usize, + comptime pad_y: usize, + comptime toBufferFn: fn (*const MultiSliceView, u8, []u8) void, + comptime allocFn: fn (Allocator, *const MultiSliceView, u8, usize) anyerror![]u8, +) type { + return struct { + const security_level = security_level_bits; + const rate = rate_bytes; + const rate_in_lanes = rate_bytes / 8; + const cv_size = cv_size_bytes; + const StateType = StateTypeParam; + const separation_byte_pos = .{ .x = sep_x, .y = sep_y }; + const padding_pos = .{ .x = pad_x, .y = pad_y }; + + inline fn turboShakeToBuffer(view: *const MultiSliceView, separation_byte: u8, output: []u8) void { + toBufferFn(view, separation_byte, output); + } + + inline fn turboShakeMultiSliceAlloc( + allocator: Allocator, + view: *const MultiSliceView, + separation_byte: u8, + output_len: usize, + ) ![]u8 { + return allocFn(allocator, view, separation_byte, output_len); + } + }; +} + +/// KangarooTwelve with 128-bit security parameters +const KT128Variant = KangarooVariant( + 128, // Security level in bits + 168, // TurboSHAKE128 rate in bytes + 32, // Chaining value size in bytes + TurboSHAKE128State, + 1, // separation_byte_pos.x (lane 11: 88 bytes into 168-byte rate) + 3, // separation_byte_pos.y + 0, // padding_pos.x (lane 20: last lane of 168-byte rate) + 4, // padding_pos.y + turboShake128MultiSliceToBuffer, + turboShake128MultiSlice, +); + +/// KangarooTwelve with 256-bit security parameters +const KT256Variant = KangarooVariant( + 256, // Security level in bits + 136, // TurboSHAKE256 rate in bytes + 64, // Chaining value size in bytes + TurboSHAKE256State, + 4, // separation_byte_pos.x (lane 4: 32 bytes into 136-byte rate) + 0, // separation_byte_pos.y + 1, // padding_pos.x (lane 16: last lane of 136-byte rate) + 3, // padding_pos.y + turboShake256MultiSliceToBuffer, + turboShake256MultiSlice, +); + +/// Rotate left for u64 vector +inline fn rol64Vec(comptime N: usize, v: @Vector(N, u64), comptime n: u6) @Vector(N, u64) { + if (n == 0) return v; + const left: @Vector(N, u64) = @splat(n); + const right_shift: u64 = 64 - @as(u64, n); + const right: @Vector(N, u64) = @splat(right_shift); + return (v << left) | (v >> right); +} + +/// Load a 64-bit little-endian value +inline fn load64(bytes: []const u8) u64 { + return std.mem.readInt(u64, bytes[0..8], .little); +} + +/// Store a 64-bit little-endian value +inline fn store64(value: u64, bytes: []u8) void { + std.mem.writeInt(u64, bytes[0..8], value, .little); +} + +/// Right-encode result type (max 9 bytes for 64-bit usize) +const RightEncoded = struct { + bytes: [9]u8, + len: u8, + + fn slice(self: *const RightEncoded) []const u8 { + return self.bytes[0..self.len]; + } +}; + +/// Right-encode: encodes a number as bytes with length suffix (no allocation) +fn rightEncode(x: usize) RightEncoded { + var result: RightEncoded = undefined; + + if (x == 0) { + result.bytes[0] = 0; + result.len = 1; + return result; + } + + var temp: [9]u8 = undefined; + var len: usize = 0; + var val = x; + + while (val > 0) : (val /= 256) { + temp[len] = @intCast(val % 256); + len += 1; + } + + // Reverse bytes (MSB first) + for (0..len) |i| { + result.bytes[i] = temp[len - 1 - i]; + } + result.bytes[len] = @intCast(len); + result.len = @intCast(len + 1); + + return result; +} + +/// Virtual contiguous view over multiple slices (zero-copy) +const MultiSliceView = struct { + slices: [3][]const u8, + offsets: [4]usize, + + fn init(s1: []const u8, s2: []const u8, s3: []const u8) MultiSliceView { + return .{ + .slices = .{ s1, s2, s3 }, + .offsets = .{ + 0, + s1.len, + s1.len + s2.len, + s1.len + s2.len + s3.len, + }, + }; + } + + fn totalLen(self: *const MultiSliceView) usize { + return self.offsets[3]; + } + + /// Get byte at position (zero-copy) + fn getByte(self: *const MultiSliceView, pos: usize) u8 { + for (0..3) |i| { + if (pos >= self.offsets[i] and pos < self.offsets[i + 1]) { + return self.slices[i][pos - self.offsets[i]]; + } + } + unreachable; + } + + /// Try to get a contiguous slice [start..end) - returns null if spans boundaries + fn tryGetSlice(self: *const MultiSliceView, start: usize, end: usize) ?[]const u8 { + for (0..3) |i| { + if (start >= self.offsets[i] and end <= self.offsets[i + 1]) { + const local_start = start - self.offsets[i]; + const local_end = end - self.offsets[i]; + return self.slices[i][local_start..local_end]; + } + } + return null; + } + + /// Copy range [start..end) to buffer (used when slice spans boundaries) + fn copyRange(self: *const MultiSliceView, start: usize, end: usize, buffer: []u8) void { + var pos: usize = 0; + for (start..end) |i| { + buffer[pos] = self.getByte(i); + pos += 1; + } + } +}; + +/// Apply Keccak-p[1600,12] to N states using SIMD +fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void { + @setEvalBranchQuota(10000); + + // Pre-computed rotation offsets for rho-pi step + const rho_offsets = comptime blk: { + var offsets: [24]u6 = undefined; + var px: usize = 1; + var py: usize = 0; + for (0..24) |t| { + const rot_amount = ((t + 1) * (t + 2) / 2) % 64; + offsets[t] = @intCast(rot_amount); + const temp_x = py; + py = (2 * px + 3 * py) % 5; + px = temp_x; + } + break :blk offsets; + }; + + var round: usize = 0; + while (round < 12) : (round += 2) { + inline for (0..2) |i| { + // θ (theta) + var C: [5]@Vector(N, u64) = undefined; + inline for (0..5) |x| { + C[x] = states[x][0] ^ states[x][1] ^ states[x][2] ^ states[x][3] ^ states[x][4]; + } + + var D: [5]@Vector(N, u64) = undefined; + inline for (0..5) |x| { + D[x] = C[(x + 4) % 5] ^ rol64Vec(N, C[(x + 1) % 5], 1); + } + + // Apply D to all lanes + inline for (0..5) |x| { + states[x][0] ^= D[x]; + states[x][1] ^= D[x]; + states[x][2] ^= D[x]; + states[x][3] ^= D[x]; + states[x][4] ^= D[x]; + } + + // ρ (rho) and π (pi) - optimized with pre-computed offsets + var current = states[1][0]; + var px: usize = 1; + var py: usize = 0; + inline for (rho_offsets) |rot| { + const next_y = (2 * px + 3 * py) % 5; + const next = states[py][next_y]; + states[py][next_y] = rol64Vec(N, current, rot); + current = next; + px = py; + py = next_y; + } + + // χ (chi) - optimized with better register usage + inline for (0..5) |y| { + const t0 = states[0][y]; + const t1 = states[1][y]; + const t2 = states[2][y]; + const t3 = states[3][y]; + const t4 = states[4][y]; + + states[0][y] = t0 ^ (~t1 & t2); + states[1][y] = t1 ^ (~t2 & t3); + states[2][y] = t2 ^ (~t3 & t4); + states[3][y] = t3 ^ (~t4 & t0); + states[4][y] = t4 ^ (~t0 & t1); + } + + // ι (iota) + const rc_splat: @Vector(N, u64) = @splat(RC[round + i]); + states[0][0] ^= rc_splat; + } + } +} + +/// Add lanes from data to N states in parallel with stride using SIMD +fn addLanesAll( + comptime N: usize, + states: *[5][5]@Vector(N, u64), + data: []const u8, + lane_count: usize, + lane_offset: usize, +) void { + + // Process lanes (at most 25 lanes in Keccak state) + inline for (0..25) |xy| { + if (xy < lane_count) { + const x = xy % 5; + const y = xy / 5; + + var loaded_data: @Vector(N, u64) = undefined; + inline for (0..N) |i| { + loaded_data[i] = load64(data[8 * (i * lane_offset + xy) ..]); + } + states[x][y] ^= loaded_data; + } + } +} + +/// Apply Keccak-p[1600,12] to a single state (byte representation) +fn keccakP(state: *[200]u8) void { + @setEvalBranchQuota(10000); + var lanes: [5][5]u64 = undefined; + + // Load state into lanes + inline for (0..5) |x| { + inline for (0..5) |y| { + lanes[x][y] = load64(state[8 * (x + 5 * y) ..]); + } + } + + // Apply 12 rounds + var round: usize = 0; + while (round < 12) : (round += 2) { + inline for (0..2) |i| { + // θ + var C: [5]u64 = undefined; + inline for (0..5) |x| { + C[x] = lanes[x][0] ^ lanes[x][1] ^ lanes[x][2] ^ lanes[x][3] ^ lanes[x][4]; + } + var D: [5]u64 = undefined; + inline for (0..5) |x| { + D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1); + } + inline for (0..5) |x| { + inline for (0..5) |y| { + lanes[x][y] ^= D[x]; + } + } + + // ρ and π + var current = lanes[1][0]; + var px: usize = 1; + var py: usize = 0; + inline for (0..24) |t| { + const temp = lanes[py][(2 * px + 3 * py) % 5]; + const rot_amount = ((t + 1) * (t + 2) / 2) % 64; + lanes[py][(2 * px + 3 * py) % 5] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount))); + current = temp; + const temp_x = py; + py = (2 * px + 3 * py) % 5; + px = temp_x; + } + + // χ + inline for (0..5) |y| { + const T = [5]u64{ lanes[0][y], lanes[1][y], lanes[2][y], lanes[3][y], lanes[4][y] }; + inline for (0..5) |x| { + lanes[x][y] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]); + } + } + + // ι + lanes[0][0] ^= RC[round + i]; + } + } + + // Store lanes back to state + inline for (0..5) |x| { + inline for (0..5) |y| { + store64(lanes[x][y], state[8 * (x + 5 * y) ..]); + } + } +} + +/// Apply Keccak-p[1600,12] to a single state (u64 lane representation) +fn keccakPLanes(lanes: *[25]u64) void { + @setEvalBranchQuota(10000); + + // Apply 12 rounds + inline for (RC) |rc| { + // θ + var C: [5]u64 = undefined; + inline for (0..5) |x| { + C[x] = lanes[x] ^ lanes[x + 5] ^ lanes[x + 10] ^ lanes[x + 15] ^ lanes[x + 20]; + } + var D: [5]u64 = undefined; + inline for (0..5) |x| { + D[x] = C[(x + 4) % 5] ^ std.math.rotl(u64, C[(x + 1) % 5], 1); + } + inline for (0..5) |x| { + inline for (0..5) |y| { + lanes[x + 5 * y] ^= D[x]; + } + } + + // ρ and π + var current = lanes[1]; + var px: usize = 1; + var py: usize = 0; + inline for (0..24) |t| { + const next_y = (2 * px + 3 * py) % 5; + const next_idx = py + 5 * next_y; + const temp = lanes[next_idx]; + const rot_amount = ((t + 1) * (t + 2) / 2) % 64; + lanes[next_idx] = std.math.rotl(u64, current, @as(u6, @intCast(rot_amount))); + current = temp; + px = py; + py = next_y; + } + + // χ + inline for (0..5) |y| { + const idx = 5 * y; + const T = [5]u64{ lanes[idx], lanes[idx + 1], lanes[idx + 2], lanes[idx + 3], lanes[idx + 4] }; + inline for (0..5) |x| { + lanes[idx + x] = T[x] ^ (~T[(x + 1) % 5] & T[(x + 2) % 5]); + } + } + + // ι + lanes[0] ^= rc; + } +} + +/// Generic non-allocating TurboSHAKE: write output to provided buffer +fn turboShakeMultiSliceToBuffer( + comptime rate: usize, + view: *const MultiSliceView, + separation_byte: u8, + output: []u8, +) void { + var state: [200]u8 = @splat(0); + var state_pos: usize = 0; + + // Absorb all bytes from the multi-slice view + const total = view.totalLen(); + var pos: usize = 0; + while (pos < total) { + state[state_pos] ^= view.getByte(pos); + state_pos += 1; + pos += 1; + + if (state_pos == rate) { + keccakP(&state); + state_pos = 0; + } + } + + // Add separation byte and padding + state[state_pos] ^= separation_byte; + state[rate - 1] ^= 0x80; + keccakP(&state); + + // Squeeze + var out_offset: usize = 0; + while (out_offset < output.len) { + const chunk = @min(rate, output.len - out_offset); + @memcpy(output[out_offset..][0..chunk], state[0..chunk]); + out_offset += chunk; + if (out_offset < output.len) { + keccakP(&state); + } + } +} + +/// Generic allocating TurboSHAKE +fn turboShakeMultiSlice( + comptime rate: usize, + allocator: Allocator, + view: *const MultiSliceView, + separation_byte: u8, + output_len: usize, +) ![]u8 { + const output = try allocator.alloc(u8, output_len); + turboShakeMultiSliceToBuffer(rate, view, separation_byte, output); + return output; +} + +/// Non-allocating TurboSHAKE128: write output to provided buffer +fn turboShake128MultiSliceToBuffer( + view: *const MultiSliceView, + separation_byte: u8, + output: []u8, +) void { + turboShakeMultiSliceToBuffer(168, view, separation_byte, output); +} + +/// Allocating TurboSHAKE128 +fn turboShake128MultiSlice( + allocator: Allocator, + view: *const MultiSliceView, + separation_byte: u8, + output_len: usize, +) ![]u8 { + return turboShakeMultiSlice(168, allocator, view, separation_byte, output_len); +} + +/// Non-allocating TurboSHAKE256: write output to provided buffer +fn turboShake256MultiSliceToBuffer( + view: *const MultiSliceView, + separation_byte: u8, + output: []u8, +) void { + turboShakeMultiSliceToBuffer(136, view, separation_byte, output); +} + +/// Allocating TurboSHAKE256 +fn turboShake256MultiSlice( + allocator: Allocator, + view: *const MultiSliceView, + separation_byte: u8, + output_len: usize, +) ![]u8 { + return turboShakeMultiSlice(136, allocator, view, separation_byte, output_len); +} + +/// Process N leaves (8KiB chunks) in parallel - generic version +fn processLeaves( + comptime Variant: type, + comptime N: usize, + data: []const u8, + result: *[N * Variant.cv_size]u8, +) void { + const rate_in_lanes: usize = Variant.rate_in_lanes; + const rate_in_bytes: usize = rate_in_lanes * 8; + const cv_size: usize = Variant.cv_size; + + // Initialize N all-zero states with cache alignment + var states: [5][5]@Vector(N, u64) align(cache_line_size) = undefined; + inline for (0..5) |x| { + inline for (0..5) |y| { + states[x][y] = @splat(0); + } + } + + // Process complete blocks + var j: usize = 0; + while (j + rate_in_bytes <= chunk_size) : (j += rate_in_bytes) { + addLanesAll(N, &states, data[j..], rate_in_lanes, chunk_size / 8); + keccakP1600timesN(N, &states); + } + + // Process last incomplete block + const remaining_lanes = (chunk_size - j) / 8; + if (remaining_lanes > 0) { + addLanesAll(N, &states, data[j..], remaining_lanes, chunk_size / 8); + } + + // Add suffix 0x0B and padding + const suffix_pos = Variant.separation_byte_pos; + const padding_pos = Variant.padding_pos; + + const suffix_splat: @Vector(N, u64) = @splat(0x0B); + states[suffix_pos.x][suffix_pos.y] ^= suffix_splat; + const padding_splat: @Vector(N, u64) = @splat(0x8000000000000000); + states[padding_pos.x][padding_pos.y] ^= padding_splat; + + keccakP1600timesN(N, &states); + + // Extract chaining values from each state + const lanes_to_extract = cv_size / 8; + comptime var lane_idx: usize = 0; + inline while (lane_idx < lanes_to_extract) : (lane_idx += 1) { + const x = lane_idx % 5; + const y = lane_idx / 5; + inline for (0..N) |i| { + store64(states[x][y][i], result[i * cv_size + lane_idx * 8 ..]); + } + } +} + +/// Helper function to process N leaves in parallel, reducing code duplication +inline fn processNLeaves( + comptime Variant: type, + comptime N: usize, + view: *const MultiSliceView, + j: usize, + leaf_buffer: []u8, + output: []align(@alignOf(u64)) u8, +) void { + const cv_size = Variant.cv_size; + comptime std.debug.assert(cv_size % @sizeOf(u64) == 0); + + if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| { + var leaf_cvs: [N * cv_size]u8 = undefined; + processLeaves(Variant, N, leaf_data, &leaf_cvs); + @memcpy(output[0..leaf_cvs.len], &leaf_cvs); + } else { + view.copyRange(j, j + N * chunk_size, leaf_buffer[0 .. N * chunk_size]); + var leaf_cvs: [N * cv_size]u8 = undefined; + processLeaves(Variant, N, leaf_buffer[0 .. N * chunk_size], &leaf_cvs); + @memcpy(output[0..leaf_cvs.len], &leaf_cvs); + } +} + +/// Helper to process N leaves in SIMD and absorb CVs into state +inline fn processAndAbsorbNLeaves( + comptime Variant: type, + comptime N: usize, + view: *const MultiSliceView, + j: usize, + leaf_buffer: []u8, + final_state: anytype, +) void { + const cv_size = Variant.cv_size; + if (view.tryGetSlice(j, j + N * chunk_size)) |leaf_data| { + var leaf_cvs: [N * cv_size]u8 align(cache_line_size) = undefined; + processLeaves(Variant, N, leaf_data, &leaf_cvs); + final_state.update(&leaf_cvs); + } else { + view.copyRange(j, j + N * chunk_size, leaf_buffer[0 .. N * chunk_size]); + var leaf_cvs: [N * cv_size]u8 align(cache_line_size) = undefined; + processLeaves(Variant, N, leaf_buffer[0 .. N * chunk_size], &leaf_cvs); + final_state.update(&leaf_cvs); + } +} + +/// Generic single-threaded implementation +fn ktSingleThreaded(comptime Variant: type, view: *const MultiSliceView, total_len: usize, output: []u8) void { + const cv_size = Variant.cv_size; + const StateType = Variant.StateType; + + // Initialize streaming TurboSHAKE state for final node (delimiter 0x06 is set in the type) + var final_state = StateType.init(.{}); + + // Absorb first B bytes from input + var first_b_buffer: [chunk_size]u8 = undefined; + if (view.tryGetSlice(0, chunk_size)) |first_chunk| { + final_state.update(first_chunk); + } else { + view.copyRange(0, chunk_size, &first_b_buffer); + final_state.update(&first_b_buffer); + } + + // Absorb padding bytes (8 bytes: 0x03 followed by 7 zeros) + const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + final_state.update(&padding); + + var j: usize = chunk_size; + var n: usize = 0; + + // Temporary buffers for boundary-spanning leaves and CV computation + var leaf_buffer: [chunk_size * 8]u8 align(cache_line_size) = undefined; + var cv_buffer: [64]u8 = undefined; // Max CV size is 64 bytes + + // Process leaves in SIMD batches (8x, 4x, 2x) + inline for ([_]usize{ 8, 4, 2 }) |batch_size| { + while (optimal_vector_len >= batch_size and j + batch_size * chunk_size <= total_len) { + processAndAbsorbNLeaves(Variant, batch_size, view, j, &leaf_buffer, &final_state); + j += batch_size * chunk_size; + n += batch_size; + } + } + + // Process remaining leaves one at a time + while (j < total_len) { + const chunk_len = @min(chunk_size, total_len - j); + if (view.tryGetSlice(j, j + chunk_len)) |leaf_data| { + const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{}); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + final_state.update(cv_buffer[0..cv_size]); // Absorb CV immediately + } else { + view.copyRange(j, j + chunk_len, leaf_buffer[0..chunk_len]); + const cv_slice = MultiSliceView.init(leaf_buffer[0..chunk_len], &[_]u8{}, &[_]u8{}); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + final_state.update(cv_buffer[0..cv_size]); + } + j += chunk_size; + n += 1; + } + + // Absorb right_encode(n) and terminator + const n_enc = rightEncode(n); + final_state.update(n_enc.slice()); + const terminator = [_]u8{ 0xFF, 0xFF }; + final_state.update(&terminator); + + // Finalize and squeeze output + final_state.final(output); +} + +/// Generic KangarooTwelve hash function builder. +/// Creates a public API type with hash and hashParallel methods for a specific variant. +fn KTHash( + comptime Variant: type, + comptime singleChunkFn: fn (*const MultiSliceView, u8, []u8) void, +) type { + return struct { + const Self = @This(); + const StateType = Variant.StateType; + + /// The recommended output length, in bytes. + pub const digest_length = Variant.security_level / 8 * 2; + /// The block length, or rate, in bytes. + pub const block_length = Variant.rate; + + /// Configuration options for KangarooTwelve hashing. + /// + /// Options include an optional customization string that provides domain separation, + /// ensuring that identical inputs with different customization strings + /// produce completely distinct hash outputs. + /// + /// This prevents hash collisions when the same data is hashed in different contexts. + /// + /// Customization strings can be of any length. + /// + /// Common options for customization:: + /// + /// - Key derivation or MAC: 16-byte secret for KT128, 32-byte secret for KT256 + /// - Context Separation: domain-specific strings (e.g., "email", "password", "session") + /// - Composite Keys: concatenation of secret key + context string + pub const Options = struct { + customization: ?[]const u8 = null, + }; + + // Message buffer (accumulates message data only, not customization) + buffer: [chunk_size]u8, + buffer_len: usize, + message_len: usize, + + // Customization string (fixed at init) + customization: []const u8, + custom_len_enc: RightEncoded, + + // Tree mode state (lazy initialization when buffer overflows first time) + first_chunk: ?[chunk_size]u8, // Saved first chunk for tree mode + final_state: ?StateType, // Running TurboSHAKE state for final node + num_leaves: usize, // Count of leaves processed (after first chunk) + + // SIMD chunk batching + pending_chunks: [8 * chunk_size]u8 align(cache_line_size), // Buffer for up to 8 chunks + pending_count: usize, // Number of complete chunks in pending_chunks + + /// Initialize a KangarooTwelve hashing context. + /// + /// Options include an optional customization string that provides domain separation, + /// ensuring that identical inputs with different customization strings + /// produce completely distinct hash outputs. + /// + /// This prevents hash collisions when the same data is hashed in different contexts. + /// + /// Customization strings can be of any length. + /// + /// Common options for customization:: + /// + /// - Key derivation or MAC: 16-byte secret for KT128, 32-byte secret for KT256 + /// - Context Separation: domain-specific strings (e.g., "email", "password", "session") + /// - Composite Keys: concatenation of secret key + context string + pub fn init(options: Options) Self { + const custom = options.customization orelse &[_]u8{}; + return .{ + .buffer = undefined, + .buffer_len = 0, + .message_len = 0, + .customization = custom, + .custom_len_enc = rightEncode(custom.len), + .first_chunk = null, + .final_state = null, + .num_leaves = 0, + .pending_chunks = undefined, + .pending_count = 0, + }; + } + + /// Flush all pending chunks using SIMD when possible + fn flushPendingChunks(self: *Self) void { + const cv_size = Variant.cv_size; + + // Process all pending chunks using the largest SIMD batch sizes possible + while (self.pending_count > 0) { + // Try SIMD batches in decreasing size order + inline for ([_]usize{ 8, 4, 2 }) |batch_size| { + if (optimal_vector_len >= batch_size and self.pending_count >= batch_size) { + var leaf_cvs: [batch_size * cv_size]u8 align(cache_line_size) = undefined; + processLeaves(Variant, batch_size, self.pending_chunks[0 .. batch_size * chunk_size], &leaf_cvs); + self.final_state.?.update(&leaf_cvs); + self.num_leaves += batch_size; + self.pending_count -= batch_size; + + // Shift remaining chunks to the front + if (self.pending_count > 0) { + const remaining_bytes = self.pending_count * chunk_size; + @memcpy(self.pending_chunks[0..remaining_bytes], self.pending_chunks[batch_size * chunk_size ..][0..remaining_bytes]); + } + break; // Continue outer loop to try next batch + } + } + + // If no SIMD batch was possible, process one chunk with scalar code + if (self.pending_count > 0 and self.pending_count < 2) { + var cv_buffer: [64]u8 = undefined; + const cv_slice = MultiSliceView.init(self.pending_chunks[0..chunk_size], &[_]u8{}, &[_]u8{}); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + self.final_state.?.update(cv_buffer[0..cv_size]); + self.num_leaves += 1; + self.pending_count -= 1; + break; // No more chunks to process + } + } + } + + /// Absorb data into the hash state. + /// Can be called multiple times to incrementally add data. + pub fn update(self: *Self, data: []const u8) void { + if (data.len == 0) return; + + var remaining = data; + + while (remaining.len > 0) { + const space_in_buffer = chunk_size - self.buffer_len; + const to_copy = @min(space_in_buffer, remaining.len); + + // Copy data into buffer + @memcpy(self.buffer[self.buffer_len..][0..to_copy], remaining[0..to_copy]); + self.buffer_len += to_copy; + self.message_len += to_copy; + remaining = remaining[to_copy..]; + + // If buffer is full, process it + if (self.buffer_len == chunk_size) { + if (self.first_chunk == null) { + // First time buffer fills - initialize tree mode + self.first_chunk = self.buffer; + self.final_state = StateType.init(.{}); + + // Absorb first chunk into final state + self.final_state.?.update(&self.buffer); + + // Absorb padding (8 bytes: 0x03 followed by 7 zeros) + const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + self.final_state.?.update(&padding); + } else { + // Add chunk to pending buffer for SIMD batch processing + @memcpy(self.pending_chunks[self.pending_count * chunk_size ..][0..chunk_size], &self.buffer); + self.pending_count += 1; + + // Flush when we have enough chunks for optimal SIMD batch + // Determine best batch size for this architecture + const optimal_batch_size = comptime blk: { + if (optimal_vector_len >= 8) break :blk 8; + if (optimal_vector_len >= 4) break :blk 4; + if (optimal_vector_len >= 2) break :blk 2; + break :blk 1; + }; + if (self.pending_count >= optimal_batch_size) { + self.flushPendingChunks(); + } + } + self.buffer_len = 0; + } + } + } + + /// Finalize the hash and produce output. + /// + /// Unlike traditional hash functions, the output can be of any length. + /// + /// When using as a regular hash function, use the recommended `digest_length` value (32 bytes for KT128, 64 bytes for KT256). + /// + /// After calling this method, the context should not be reused. However, the structure can be cloned before finalizing + /// to compute multiple hashes with the same prefix. + pub fn final(self: *Self, out: []u8) void { + const cv_size = Variant.cv_size; + + // Calculate total length: message + customization + right_encode(customization.len) + const total_len = self.message_len + self.customization.len + self.custom_len_enc.len; + + // Single chunk mode: total data fits in one chunk + if (total_len <= chunk_size) { + // Build the complete input: buffer + customization + encoded length + var single_chunk: [chunk_size]u8 = undefined; + @memcpy(single_chunk[0..self.buffer_len], self.buffer[0..self.buffer_len]); + @memcpy(single_chunk[self.buffer_len..][0..self.customization.len], self.customization); + @memcpy(single_chunk[self.buffer_len + self.customization.len ..][0..self.custom_len_enc.len], self.custom_len_enc.slice()); + + const view = MultiSliceView.init(single_chunk[0..total_len], &[_]u8{}, &[_]u8{}); + singleChunkFn(&view, 0x07, out); + return; + } + + // Flush any pending chunks with SIMD + self.flushPendingChunks(); + + // Build view over remaining data (buffer + customization + encoding) + const remaining_view = MultiSliceView.init( + self.buffer[0..self.buffer_len], + self.customization, + self.custom_len_enc.slice(), + ); + const remaining_len = remaining_view.totalLen(); + + var final_leaves = self.num_leaves; + var leaf_start: usize = 0; + + // Tree mode: initialize if not already done (lazy initialization) + if (self.final_state == null and remaining_len > 0) { + self.final_state = StateType.init(.{}); + + // Absorb first chunk (up to chunk_size bytes from remaining data) + const first_chunk_len = @min(chunk_size, remaining_len); + if (remaining_view.tryGetSlice(0, first_chunk_len)) |first_chunk| { + // Data is contiguous, use it directly + self.final_state.?.update(first_chunk); + } else { + // Data spans boundaries, copy to buffer + var first_chunk_buf: [chunk_size]u8 = undefined; + remaining_view.copyRange(0, first_chunk_len, first_chunk_buf[0..first_chunk_len]); + self.final_state.?.update(first_chunk_buf[0..first_chunk_len]); + } + + // Absorb padding (8 bytes: 0x03 followed by 7 zeros) + const padding = [_]u8{ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + self.final_state.?.update(&padding); + + // Process remaining data as leaves + leaf_start = first_chunk_len; + } + + // Process all remaining data as leaves (starting from leaf_start) + var offset = leaf_start; + while (offset < remaining_len) { + const leaf_end = @min(offset + chunk_size, remaining_len); + const leaf_size = leaf_end - offset; + + var cv_buffer: [64]u8 = undefined; + if (remaining_view.tryGetSlice(offset, leaf_end)) |leaf_data| { + // Data is contiguous, use it directly + const cv_slice = MultiSliceView.init(leaf_data, &[_]u8{}, &[_]u8{}); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + } else { + // Data spans boundaries, copy to buffer + var leaf_buf: [chunk_size]u8 = undefined; + remaining_view.copyRange(offset, leaf_end, leaf_buf[0..leaf_size]); + const cv_slice = MultiSliceView.init(leaf_buf[0..leaf_size], &[_]u8{}, &[_]u8{}); + Variant.turboShakeToBuffer(&cv_slice, 0x0B, cv_buffer[0..cv_size]); + } + self.final_state.?.update(cv_buffer[0..cv_size]); + final_leaves += 1; + offset = leaf_end; + } + + // Absorb right_encode(num_leaves) and terminator + const n_enc = rightEncode(final_leaves); + self.final_state.?.update(n_enc.slice()); + const terminator = [_]u8{ 0xFF, 0xFF }; + self.final_state.?.update(&terminator); + + // Squeeze output + self.final_state.?.final(out); + } + + /// Hash a message using sequential processing with SIMD acceleration. + /// + /// Parameters: + /// - message: Input data to hash (any length) + /// - out: Output buffer (any length, arbitrary output sizes supported, `digest_length` recommended for standard use) + /// - options: Optional settings to include a secret key or a context separation string + pub fn hash(message: []const u8, out: []u8, options: Options) !void { + const custom = options.customization orelse &[_]u8{}; + + // Right-encode customization length + const custom_len_enc = rightEncode(custom.len); + + // Create zero-copy multi-slice view (no concatenation) + const view = MultiSliceView.init(message, custom, custom_len_enc.slice()); + const total_len = view.totalLen(); + + // Single chunk case - zero-copy absorption! + if (total_len <= chunk_size) { + singleChunkFn(&view, 0x07, out); + return; + } + + // Tree mode - single-threaded SIMD processing + ktSingleThreaded(Variant, &view, total_len, out); + } + }; +} + +/// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing +/// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive +/// underlying SHA-3, which has undergone over 15 years of intensive cryptanalysis +/// since the SHA-3 competition (2008-2012) and remains secure. +/// +/// K12 uses Keccak-p[1600,12] with 12 rounds (half of SHA-3's 24 rounds), providing +/// 128-bit security strength equivalent to AES-128 and SHAKE128. While this offers +/// less conservative margin than SHA-3, current cryptanalysis reaches only 6 rounds, +/// leaving a substantial security margin. This deliberate trade-off delivers +/// significantly better performance while maintaining strong practical security. +/// +/// Standardized as RFC 9861 after 8 years of public scrutiny. Supports arbitrary-length +/// output and optional customization strings for domain separation. +pub const KT128 = KTHash(KT128Variant, turboShake128MultiSliceToBuffer); + +/// KangarooTwelve is a fast, secure cryptographic hash function that uses tree-hashing +/// on top of TurboSHAKE. It is built on the Keccak permutation, the same primitive +/// underlying SHA-3, which has undergone over 15 years of intensive cryptanalysis +/// since the SHA-3 competition (2008-2012) and remains secure. +/// +/// KT256 provides 256-bit security strength and achieves NIST post-quantum security +/// level 2 when using at least 256-bit outputs. Like KT128, it uses Keccak-p[1600,12] +/// with 12 rounds, offering a deliberate trade-off between conservative margin and +/// significantly better performance while maintaining strong practical security. +/// +/// Use KT256 when you need extra conservative margins. +/// For most applications, KT128 offers better performance with adequate security. +pub const KT256 = KTHash(KT256Variant, turboShake256MultiSliceToBuffer); + +/// Helper: Generate pattern data where data[i] = (i % 251) +fn generatePattern(allocator: Allocator, len: usize) ![]u8 { + const data = try allocator.alloc(u8, len); + for (data, 0..) |*byte, i| { + byte.* = @intCast(i % 251); + } + return data; +} + +test "KT128: empty message, empty customization, 32 bytes" { + var output: [32]u8 = undefined; + try KT128.hash(&[_]u8{}, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "1AC2D450FC3B4205D19DA7BFCA1B37513C0803577AC7167F06FE2CE1F0EF39E5"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: empty message, empty customization, 64 bytes" { + var output: [64]u8 = undefined; + try KT128.hash(&[_]u8{}, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "1AC2D450FC3B4205D19DA7BFCA1B37513C0803577AC7167F06FE2CE1F0EF39E54269C056B8C82E48276038B6D292966CC07A3D4645272E31FF38508139EB0A71"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: empty message, empty customization, 10032 bytes (last 32)" { + const allocator = std.testing.allocator; + const output = try allocator.alloc(u8, 10032); + defer allocator.free(output); + + try KT128.hash(&[_]u8{}, output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "E8DC563642F7228C84684C898405D3A834799158C079B12880277A1D28E2FF6D"); + try std.testing.expectEqualSlices(u8, &expected, output[10000..]); +} + +test "KT128: pattern message (1 byte), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 1); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "2BDA92450E8B147F8A7CB629E784A058EFCA7CF7D8218E02D345DFAA65244A1F"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (17 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 17); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "6BF75FA2239198DB4772E36478F8E19B0F371205F6A9A93A273F51DF37122888"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (289 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 289); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "0C315EBCDEDBF61426DE7DCF8FB725D1E74675D7F5327A5067F367B108ECB67C"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: 0xFF message (1 byte), pattern customization (1 byte), 32 bytes" { + const allocator = std.testing.allocator; + const customization = try generatePattern(allocator, 1); + defer allocator.free(customization); + + const message = [_]u8{0xFF}; + var output: [32]u8 = undefined; + try KT128.hash(&message, &output, .{ .customization = customization }); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "A20B92B251E3D62443EC286E4B9B470A4E8315C156EEB24878B038ABE20650BE"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (8191 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 8191); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "1B577636F723643E990CC7D6A659837436FD6A103626600EB8301CD1DBE553D6"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (8192 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 8192); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "48F256F6772F9EDFB6A8B661EC92DC93B95EBD05A08A17B39AE3490870C926C3"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: empty message, empty customization, 64 bytes" { + var output: [64]u8 = undefined; + try KT256.hash(&[_]u8{}, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "B23D2E9CEA9F4904E02BEC06817FC10CE38CE8E93EF4C89E6537076AF8646404E3E8B68107B8833A5D30490AA33482353FD4ADC7148ECB782855003AAEBDE4A9"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: empty message, empty customization, 128 bytes" { + var output: [128]u8 = undefined; + try KT256.hash(&[_]u8{}, &output, .{}); + + var expected: [128]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "B23D2E9CEA9F4904E02BEC06817FC10CE38CE8E93EF4C89E6537076AF8646404E3E8B68107B8833A5D30490AA33482353FD4ADC7148ECB782855003AAEBDE4A9B0925319D8EA1E121A609821EC19EFEA89E6D08DAEE1662B69C840289F188BA860F55760B61F82114C030C97E5178449608CCD2CD2D919FC7829FF69931AC4D0"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (1 byte), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 1); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "0D005A194085360217128CF17F91E1F71314EFA5564539D444912E3437EFA17F82DB6F6FFE76E781EAA068BCE01F2BBF81EACB983D7230F2FB02834A21B1DDD0"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (17 bytes), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 17); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "1BA3C02B1FC514474F06C8979978A9056C8483F4A1B63D0DCCEFE3A28A2F323E1CDCCA40EBF006AC76EF0397152346837B1277D3E7FAA9C9653B19075098527B"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (8191 bytes), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 8191); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "3081434D93A4108D8D8A3305B89682CEBEDC7CA4EA8A3CE869FBB73CBE4A58EEF6F24DE38FFC170514C70E7AB2D01F03812616E863D769AFB3753193BA045B20"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (8192 bytes), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 8192); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "C6EE8E2AD3200C018AC87AAA031CDAC22121B412D07DC6E0DCCBB53423747E9A1C18834D99DF596CF0CF4B8DFAFB7BF02D139D0C9035725ADC1A01B7230A41FA"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (8193 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 8193); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "BB66FE72EAEA5179418D5295EE1344854D8AD7F3FA17EFCB467EC152341284CF"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (16384 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 16384); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "82778F7F7234C83352E76837B721FBDBB5270B88010D84FA5AB0B61EC8CE0956"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128: pattern message (16385 bytes), empty customization, 32 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 16385); + defer allocator.free(message); + + var output: [32]u8 = undefined; + try KT128.hash(message, &output, .{}); + + var expected: [32]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "5F8D2B943922B451842B4E82740D02369E2D5F9F33C5123509A53B955FE177B2"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (8193 bytes), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 8193); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "65FF03335900E5197ACBD5F41B797F0E7E36AD4FF7D89C09FA6F28AE58D1E8BC2DF1779B86F988C3B13690172914EA172423B23EF4057255BB0836AB3A99836E"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (16384 bytes), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 16384); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "74604239A14847CB79069B4FF0E51070A93034C9AC4DFF4D45E0F2C5DA81D930DE6055C2134B4DF4E49F27D1B2C66E95491858B182A924BD0504DA5976BC516D"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT256: pattern message (16385 bytes), empty customization, 64 bytes" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 16385); + defer allocator.free(message); + + var output: [64]u8 = undefined; + try KT256.hash(message, &output, .{}); + + var expected: [64]u8 = undefined; + _ = try std.fmt.hexToBytes(&expected, "C814F23132DADBFD55379F18CB988CB39B751F119322823FD982644A897485397B9F40EB11C6E416359B8AE695A5CE0FA79D1ADA1EEC745D82E0A5AB08A9F014"); + try std.testing.expectEqualSlices(u8, &expected, &output); +} + +test "KT128 incremental: empty message matches one-shot" { + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(&[_]u8{}, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: small message matches one-shot" { + const message = "Hello, KangarooTwelve!"; + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: multiple updates match single update" { + const part1 = "Hello, "; + const part2 = "Kangaroo"; + const part3 = "Twelve!"; + + var output_single: [32]u8 = undefined; + var output_multi: [32]u8 = undefined; + + // Single update + var hasher1 = KT128.init(.{}); + hasher1.update(part1 ++ part2 ++ part3); + hasher1.final(&output_single); + + // Multiple updates + var hasher2 = KT128.init(.{}); + hasher2.update(part1); + hasher2.update(part2); + hasher2.update(part3); + hasher2.final(&output_multi); + + try std.testing.expectEqualSlices(u8, &output_single, &output_multi); +} + +test "KT128 incremental: exactly chunk_size matches one-shot" { + const allocator = std.testing.allocator; + const message = try allocator.alloc(u8, 8192); + defer allocator.free(message); + @memset(message, 0xAB); + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: larger than chunk_size matches one-shot" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 16384); + defer allocator.free(message); + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: with customization matches one-shot" { + const message = "Test message"; + const customization = "my custom domain"; + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{ .customization = customization }); + + var hasher = KT128.init(.{ .customization = customization }); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: large message with customization" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 20000); + defer allocator.free(message); + const customization = "test domain"; + + var output_oneshot: [48]u8 = undefined; + var output_incremental: [48]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{ .customization = customization }); + + var hasher = KT128.init(.{ .customization = customization }); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: streaming chunks matches one-shot" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 25000); + defer allocator.free(message); + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + + // Feed in 1KB chunks + var offset: usize = 0; + while (offset < message.len) { + const chunk_size_local = @min(1024, message.len - offset); + hasher.update(message[offset..][0..chunk_size_local]); + offset += chunk_size_local; + } + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT256 incremental: empty message matches one-shot" { + var output_oneshot: [64]u8 = undefined; + var output_incremental: [64]u8 = undefined; + + try KT256.hash(&[_]u8{}, &output_oneshot, .{}); + + var hasher = KT256.init(.{}); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT256 incremental: small message matches one-shot" { + const message = "Hello, KangarooTwelve with 256-bit security!"; + + var output_oneshot: [64]u8 = undefined; + var output_incremental: [64]u8 = undefined; + + try KT256.hash(message, &output_oneshot, .{}); + + var hasher = KT256.init(.{}); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT256 incremental: large message matches one-shot" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 30000); + defer allocator.free(message); + + var output_oneshot: [64]u8 = undefined; + var output_incremental: [64]u8 = undefined; + + try KT256.hash(message, &output_oneshot, .{}); + + var hasher = KT256.init(.{}); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT256 incremental: with customization matches one-shot" { + const allocator = std.testing.allocator; + const message = try generatePattern(allocator, 15000); + defer allocator.free(message); + const customization = "KT256 custom domain"; + + var output_oneshot: [80]u8 = undefined; + var output_incremental: [80]u8 = undefined; + + try KT256.hash(message, &output_oneshot, .{ .customization = customization }); + + var hasher = KT256.init(.{ .customization = customization }); + hasher.update(message); + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: random small message with random chunk sizes" { + const allocator = std.testing.allocator; + + var prng = std.Random.DefaultPrng.init(std.testing.random_seed); + const random = prng.random(); + + const test_sizes = [_]usize{ 100, 500, 2000, 5000, 10000 }; + + for (test_sizes) |total_size| { + const message = try allocator.alloc(u8, total_size); + defer allocator.free(message); + random.bytes(message); + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + var offset: usize = 0; + + while (offset < message.len) { + const remaining = message.len - offset; + const max_chunk = @min(1000, remaining); + const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk); + + hasher.update(message[offset..][0..chunk_size_local]); + offset += chunk_size_local; + } + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); + } +} + +test "KT128 incremental: random large message (1MB) with random chunk sizes" { + const allocator = std.testing.allocator; + + var prng = std.Random.DefaultPrng.init(std.testing.random_seed); + const random = prng.random(); + + const total_size: usize = 1024 * 1024; // 1 MB + const message = try allocator.alloc(u8, total_size); + defer allocator.free(message); + random.bytes(message); + + var output_oneshot: [32]u8 = undefined; + var output_incremental: [32]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{}); + + var hasher = KT128.init(.{}); + var offset: usize = 0; + + while (offset < message.len) { + const remaining = message.len - offset; + const max_chunk = @min(10000, remaining); + const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk); + + hasher.update(message[offset..][0..chunk_size_local]); + offset += chunk_size_local; + } + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT256 incremental: random small message with random chunk sizes" { + const allocator = std.testing.allocator; + + var prng = std.Random.DefaultPrng.init(std.testing.random_seed); + const random = prng.random(); + + const test_sizes = [_]usize{ 100, 500, 2000, 5000, 10000 }; + + for (test_sizes) |total_size| { + // Generate random message + const message = try allocator.alloc(u8, total_size); + defer allocator.free(message); + random.bytes(message); + + var output_oneshot: [64]u8 = undefined; + var output_incremental: [64]u8 = undefined; + + try KT256.hash(message, &output_oneshot, .{}); + + var hasher = KT256.init(.{}); + var offset: usize = 0; + + while (offset < message.len) { + const remaining = message.len - offset; + const max_chunk = @min(1000, remaining); + const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk); + + hasher.update(message[offset..][0..chunk_size_local]); + offset += chunk_size_local; + } + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); + } +} + +test "KT256 incremental: random large message (1MB) with random chunk sizes" { + const allocator = std.testing.allocator; + + var prng = std.Random.DefaultPrng.init(std.testing.random_seed); + const random = prng.random(); + + const total_size: usize = 1024 * 1024; // 1 MB + const message = try allocator.alloc(u8, total_size); + defer allocator.free(message); + random.bytes(message); + + var output_oneshot: [64]u8 = undefined; + var output_incremental: [64]u8 = undefined; + + try KT256.hash(message, &output_oneshot, .{}); + + var hasher = KT256.init(.{}); + var offset: usize = 0; + + while (offset < message.len) { + const remaining = message.len - offset; + const max_chunk = @min(10000, remaining); + const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk); + + hasher.update(message[offset..][0..chunk_size_local]); + offset += chunk_size_local; + } + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} + +test "KT128 incremental: random message with customization and random chunks" { + const allocator = std.testing.allocator; + + var prng = std.Random.DefaultPrng.init(std.testing.random_seed); + const random = prng.random(); + + const total_size: usize = 50000; + const message = try allocator.alloc(u8, total_size); + defer allocator.free(message); + random.bytes(message); + + const customization = "random test domain"; + + var output_oneshot: [48]u8 = undefined; + var output_incremental: [48]u8 = undefined; + + try KT128.hash(message, &output_oneshot, .{ .customization = customization }); + + var hasher = KT128.init(.{ .customization = customization }); + var offset: usize = 0; + + while (offset < message.len) { + const remaining = message.len - offset; + const max_chunk = @min(5000, remaining); + const chunk_size_local = if (max_chunk == 1) 1 else random.intRangeAtMost(usize, 1, max_chunk); + + hasher.update(message[offset..][0..chunk_size_local]); + offset += chunk_size_local; + } + hasher.final(&output_incremental); + + try std.testing.expectEqualSlices(u8, &output_oneshot, &output_incremental); +} diff --git a/lib/std/crypto/sha3.zig b/lib/std/crypto/sha3.zig index 332276ded6..9173beb2cf 100644 --- a/lib/std/crypto/sha3.zig +++ b/lib/std/crypto/sha3.zig @@ -4,6 +4,8 @@ const assert = std.debug.assert; const math = std.math; const mem = std.mem; +const kangarootwelve = @import("kangarootwelve.zig"); + const KeccakState = std.crypto.core.keccak.State; pub const Sha3_224 = Keccak(1600, 224, 0x06, 24); @@ -26,6 +28,9 @@ pub const KMac256 = KMac(256); pub const TupleHash128 = TupleHash(128); pub const TupleHash256 = TupleHash(256); +pub const KT128 = kangarootwelve.KT128; +pub const KT256 = kangarootwelve.KT256; + /// TurboSHAKE128 is a XOF (a secure hash function with a variable output length), with a 128 bit security level. /// It is based on the same permutation as SHA3 and SHAKE128, but which much higher performance. /// The delimiter is 0x1f by default, but can be changed for context-separation. @@ -481,6 +486,10 @@ pub const NistLengthEncoding = enum { const htest = @import("test.zig"); +test { + _ = kangarootwelve; +} + test "sha3-224 single" { try htest.assertEqualHash(Sha3_224, "6b4e03423667dbb73b6e15454f0eb1abd4597f9a1b078e3f5b5a6bc7", ""); try htest.assertEqualHash(Sha3_224, "e642824c3f8cf24ad09234ee7d3c766fc9a3a5168d0c94ad73b46fdf", "abc");