diff --git a/lib/std/hash/benchmark.zig b/lib/std/hash/benchmark.zig
index 699de5ceb4..322adeb61c 100644
--- a/lib/std/hash/benchmark.zig
+++ b/lib/std/hash/benchmark.zig
@@ -18,6 +18,7 @@ const Hash = struct {
     name: []const u8,
     has_iterative_api: bool = true,
     has_crypto_api: bool = false,
+    has_anytype_api: ?[]const comptime_int = null,
     init_u8s: ?[]const u8 = null,
     init_u64: ?u64 = null,
 };
@@ -27,11 +28,13 @@ const hashes = [_]Hash{
         .ty = hash.XxHash64,
         .name = "xxhash64",
         .init_u64 = 0,
+        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
     },
     Hash{
         .ty = hash.XxHash32,
         .name = "xxhash32",
         .init_u64 = 0,
+        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
     },
     Hash{
         .ty = hash.Wyhash,
@@ -99,14 +102,14 @@ const Result = struct {
 };
 
 const block_size: usize = 8 * 8192;
-const alignment: usize = 64;
 
 pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result {
-    const blocks_count = bytes / block_size;
-    var blocks = try allocator.alloc(u8, block_size + alignment * (blocks_count - 1));
+    var blocks = try allocator.alloc(u8, bytes);
     defer allocator.free(blocks);
     random.bytes(blocks);
 
+    const block_count = bytes / block_size;
+
     var h = blk: {
         if (H.init_u8s) |init| {
             break :blk H.ty.init(init[0..H.ty.key_length]);
@@ -118,17 +121,17 @@ pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Alloc
     };
 
     var timer = try Timer.start();
-    const start = timer.lap();
-    for (0..blocks_count) |i| {
-        h.update(blocks[i * alignment ..][0..block_size]);
+    for (0..block_count) |i| {
+        h.update(blocks[i * block_size ..][0..block_size]);
     }
     const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final();
     std.mem.doNotOptimizeAway(final);
-    const end = timer.read();
+    const elapsed_ns = timer.read();
 
-    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
-    const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const size_float: f64 = @floatFromInt(block_size * block_count);
+    const throughput: u64 = @intFromFloat(size_float / elapsed_s);
 
     return Result{
         .hash = final,
@@ -144,7 +147,6 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
     const key_count = bytes / key_size;
 
     var timer = try Timer.start();
-    const start = timer.lap();
 
     var sum: u64 = 0;
     for (0..key_count) |i| {
@@ -164,10 +166,11 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
         };
         sum +%= final;
     }
-    const end = timer.read();
+    const elapsed_ns = timer.read();
 
-    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
-    const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const size_float: f64 = @floatFromInt(key_count * key_size);
+    const throughput: u64 = @intFromFloat(size_float / elapsed_s);
 
     std.mem.doNotOptimizeAway(sum);
 
@@ -177,6 +180,143 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
     };
 }
 
+// The array and array-pointer benchmarks for xxhash are very sensitive to inlining.
+// If you see strange performance changes, consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not just due to the optimiser inlining the benchmark differently.
+pub fn benchmarkHashSmallKeysArrayPtr(
+    comptime H: anytype,
+    comptime key_size: usize,
+    bytes: usize,
+    allocator: std.mem.Allocator,
+) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    for (0..key_count) |i| {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hash(init, small_key);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hash(init, small_key);
+            }
+            break :blk H.ty.hash(small_key);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .hash = sum,
+        .throughput = throughput,
+    };
+}
+
+// The array and array-pointer benchmarks for xxhash are very sensitive to inlining.
+// If you see strange performance changes, consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not just due to the optimiser inlining the benchmark differently.
+pub fn benchmarkHashSmallKeysArray(
+    comptime H: anytype,
+    comptime key_size: usize,
+    bytes: usize,
+    allocator: std.mem.Allocator,
+) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var i: usize = 0;
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    while (i < key_count) : (i += 1) {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hash(init, small_key.*);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hash(init, small_key.*);
+            }
+            break :blk H.ty.hash(small_key.*);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .hash = sum,
+        .throughput = throughput,
+    };
+}
+
+pub fn benchmarkHashSmallApi(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    for (0..key_count) |i| {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hashSmall(init, small_key);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hashSmall(init, small_key);
+            }
+            break :blk H.ty.hashSmall(small_key);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .throughput = throughput,
+        .hash = sum,
+    };
+}
+
 fn usage() void {
     std.debug.print(
         \\throughput_test [options]
@@ -205,9 +345,12 @@ pub fn main() !void {
     var filter: ?[]u8 = "";
     var count: usize = mode(128 * MiB);
-    var key_size: usize = 32;
+    var key_size: ?usize = null;
     var seed: u32 = 0;
     var test_iterative_only = false;
+    var test_arrays = false;
+
+    const default_small_key_size = 32;
 
     var i: usize = 1;
     while (i < args.len) : (i += 1) {
@@ -248,12 +391,14 @@ pub fn main() !void {
             }
             key_size = try std.fmt.parseUnsigned(usize, args[i], 10);
 
-            if (key_size > block_size) {
+            if (key_size.? > block_size) {
                 try stdout.print("key_size cannot exceed block size of {}\n", .{block_size});
                 std.os.exit(1);
             }
         } else if (std.mem.eql(u8, args[i], "--iterative-only")) {
            test_iterative_only = true;
+        } else if (std.mem.eql(u8, args[i], "--include-array")) {
+            test_arrays = true;
         } else if (std.mem.eql(u8, args[i], "--help")) {
             usage();
             return;
@@ -268,7 +413,7 @@ pub fn main() !void {
     const allocator = gpa.allocator();
 
     inline for (hashes) |H| {
-        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) {
+        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) hash: {
             if (!test_iterative_only or H.has_iterative_api) {
                 try stdout.print("{s}\n", .{H.name});
@@ -281,9 +426,69 @@ pub fn main() !void {
             }
 
             if (!test_iterative_only) {
-                prng.seed(seed);
-                const result_small = try benchmarkHashSmallKeys(H, key_size, count, allocator);
-                try stdout.print("   small keys: {:5} MiB/s [{x:0<16}]\n", .{ result_small.throughput / (1 * MiB), result_small.hash });
+                if (key_size) |size| {
+                    prng.seed(seed);
+                    const result_small = try benchmarkHashSmallKeys(H, size, count, allocator);
+                    try stdout.print("   small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+                        size,
+                        result_small.throughput / (1 * MiB),
+                        result_small.throughput / size,
+                        result_small.hash,
+                    });
+
+                    if (!test_arrays) break :hash;
+                    if (H.has_anytype_api) |sizes| {
+                        inline for (sizes) |exact_size| {
+                            if (size == exact_size) {
+                                prng.seed(seed);
+                                const result_array = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+                                prng.seed(seed);
+                                const result_ptr = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+                                try stdout.print("        array: {:5} MiB/s [{x:0<16}]\n", .{
+                                    result_array.throughput / (1 * MiB),
+                                    result_array.hash,
+                                });
+                                try stdout.print("    array ptr: {:5} MiB/s [{x:0<16}]\n", .{
+                                    result_ptr.throughput / (1 * MiB),
+                                    result_ptr.hash,
+                                });
+                            }
+                        }
+                    }
+                } else {
+                    prng.seed(seed);
+                    const result_small = try benchmarkHashSmallKeys(H, default_small_key_size, count, allocator);
+                    try stdout.print("   small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+                        default_small_key_size,
+                        result_small.throughput / (1 * MiB),
+                        result_small.throughput / default_small_key_size,
+                        result_small.hash,
+                    });
+
+                    if (!test_arrays) break :hash;
+                    if (H.has_anytype_api) |sizes| {
+                        try stdout.print("        array:\n", .{});
+                        inline for (sizes) |exact_size| {
+                            prng.seed(seed);
+                            const result = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+                            try stdout.print("         {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+                                exact_size,
+                                result.throughput / (1 * MiB),
+                                result.hash,
+                            });
+                        }
+                        try stdout.print("    array ptr: \n", .{});
+                        inline for (sizes) |exact_size| {
+                            prng.seed(seed);
+                            const result = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+                            try stdout.print("         {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+                                exact_size,
+                                result.throughput / (1 * MiB),
+                                result.hash,
+                            });
+                        }
+                    }
+                }
             }
         }
     }
diff --git a/lib/std/hash/xxhash.zig b/lib/std/hash/xxhash.zig
index f1d1da429d..8e25b9ce7b 100644
--- a/lib/std/hash/xxhash.zig
+++ b/lib/std/hash/xxhash.zig
@@ -5,11 +5,7 @@ const expectEqual = std.testing.expectEqual;
 const rotl = std.math.rotl;
 
 pub const XxHash64 = struct {
-    acc1: u64,
-    acc2: u64,
-    acc3: u64,
-    acc4: u64,
-
+    accumulator: Accumulator,
     seed: u64,
     buf: [32]u8,
     buf_len: usize,
@@ -21,20 +17,174 @@ pub const XxHash64 = struct {
     const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011
     const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101
 
+    const Accumulator = struct {
+        acc1: u64,
+        acc2: u64,
+        acc3: u64,
+        acc4: u64,
+
+        fn init(seed: u64) Accumulator {
+            return .{
+                .acc1 = seed +% prime_1 +% prime_2,
+                .acc2 = seed +% prime_2,
+                .acc3 = seed,
+                .acc4 = seed -% prime_1,
+            };
+        }
+
+        fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
+            var i: usize = 0;
+
+            if (unroll_count > 0) {
+                const unrolled_bytes = unroll_count * 32;
+                while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
+                    inline for (0..unroll_count) |j| {
+                        self.processStripe(input[i + j * 32 ..][0..32]);
+                    }
+                }
+            }
+
+            while (i + 32 <= input.len) : (i += 32) {
+                self.processStripe(input[i..][0..32]);
+            }
+
+            return i;
+        }
+
+        fn processStripe(self: *Accumulator, buf: *const [32]u8) void {
+            self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
+            self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
+            self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
+            self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
+        }
+
+        fn merge(self: Accumulator) u64 {
+            var acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
+                rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
+            acc = mergeAccumulator(acc, self.acc1);
+            acc = mergeAccumulator(acc, self.acc2);
+            acc = mergeAccumulator(acc, self.acc3);
+            acc = mergeAccumulator(acc, self.acc4);
+            return acc;
+        }
+
+        fn mergeAccumulator(acc: u64, other: u64) u64 {
+            const a = acc ^ round(0, other);
+            const b = a *% prime_1;
+            return b +% prime_4;
+        }
+    };
+
+    fn finalize(
+        unfinished: u64,
+        byte_count: usize,
+        partial: anytype,
+    ) u64 {
+        std.debug.assert(partial.len < 32);
+        var acc = unfinished +% @as(u64, byte_count) +% @as(u64, partial.len);
+
+        switch (partial.len) {
+            inline 0, 1, 2, 3 => |count| {
+                inline for (0..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 4, 5, 6, 7 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                inline for (4..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 8, 9, 10, 11 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                inline for (8..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 12, 13, 14, 15 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize4(acc, partial[8..12]);
+                inline for (12..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 16, 17, 18, 19 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                inline for (16..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 20, 21, 22, 23 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize4(acc, partial[16..20]);
+                inline for (20..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 24, 25, 26, 27 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize8(acc, partial[16..24]);
+                inline for (24..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 28, 29, 30, 31 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize8(acc, partial[16..24]);
+                acc = finalize4(acc, partial[24..28]);
+                inline for (28..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            else => unreachable,
+        }
+    }
+
+    fn finalize8(v: u64, bytes: *const [8]u8) u64 {
+        var acc = v;
+        const lane = mem.readIntLittle(u64, bytes);
+        acc ^= round(0, lane);
+        acc = rotl(u64, acc, 27) *% prime_1;
+        acc +%= prime_4;
+        return acc;
+    }
+
+    fn finalize4(v: u64, bytes: *const [4]u8) u64 {
+        var acc = v;
+        const lane = @as(u64, mem.readIntLittle(u32, bytes));
+        acc ^= lane *% prime_1;
+        acc = rotl(u64, acc, 23) *% prime_2;
+        acc +%= prime_3;
+        return acc;
+    }
+
+    fn finalize1(v: u64, byte: u8) u64 {
+        var acc = v;
+        const lane = @as(u64, byte);
+        acc ^= lane *% prime_5;
+        acc = rotl(u64, acc, 11) *% prime_1;
+        return acc;
+    }
+
+    fn avalanche(value: u64) u64 {
+        var result = value ^ (value >> 33);
+        result *%= prime_2;
+        result ^= result >> 29;
+        result *%= prime_3;
+        result ^= result >> 32;
+
+        return result;
+    }
+
     pub fn init(seed: u64) XxHash64 {
         return XxHash64{
+            .accumulator = Accumulator.init(seed),
             .seed = seed,
-            .acc1 = seed +% prime_1 +% prime_2,
-            .acc2 = seed +% prime_2,
-            .acc3 = seed,
-            .acc4 = seed -% prime_1,
             .buf = undefined,
             .buf_len = 0,
             .byte_count = 0,
         };
     }
 
-    pub fn update(self: *XxHash64, input: []const u8) void {
+    pub fn update(self: *XxHash64, input: anytype) void {
+        validateType(@TypeOf(input));
+
         if (input.len < 32 - self.buf_len) {
             @memcpy(self.buf[self.buf_len..][0..input.len], input);
             self.buf_len += input.len;
@@ -46,99 +196,54 @@ pub const XxHash64 = struct {
 
         if (self.buf_len > 0) {
             i = 32 - self.buf_len;
             @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
-            self.processStripe(&self.buf);
-            self.buf_len = 0;
+            self.accumulator.processStripe(&self.buf);
+            self.byte_count += self.buf_len;
         }
 
-        while (i + 32 <= input.len) : (i += 32) {
-            self.processStripe(input[i..][0..32]);
-        }
+        i += self.accumulator.updateEmpty(input[i..], 32);
+        self.byte_count += i;
 
         const remaining_bytes = input[i..];
         @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
         self.buf_len = remaining_bytes.len;
     }
 
-    inline fn processStripe(self: *XxHash64, buf: *const [32]u8) void {
-        self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
-        self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
-        self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
-        self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
-        self.byte_count += 32;
-    }
-
-    inline fn round(acc: u64, lane: u64) u64 {
+    fn round(acc: u64, lane: u64) u64 {
         const a = acc +% (lane *% prime_2);
         const b = rotl(u64, a, 31);
         return b *% prime_1;
     }
 
     pub fn final(self: *XxHash64) u64 {
-        var acc: u64 = undefined;
+        const unfinished = if (self.byte_count < 32)
+            self.seed +% prime_5
+        else
+            self.accumulator.merge();
 
-        if (self.byte_count < 32) {
-            acc = self.seed +% prime_5;
+        return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
+    }
+
+    const Size = enum {
+        small,
+        large,
+        unknown,
+    };
+
+    pub fn hash(seed: u64, input: anytype) u64 {
+        validateType(@TypeOf(input));
+
+        if (input.len < 32) {
+            return finalize(seed +% prime_5, 0, input);
         } else {
-            acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
-                rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
-            acc = mergeAccumulator(acc, self.acc1);
-            acc = mergeAccumulator(acc, self.acc2);
-            acc = mergeAccumulator(acc, self.acc3);
-            acc = mergeAccumulator(acc, self.acc4);
+            var hasher = Accumulator.init(seed);
+            const i = hasher.updateEmpty(input, 0);
+            return finalize(hasher.merge(), i, input[i..]);
         }
-
-        acc = acc +% @as(u64, self.byte_count) +% @as(u64, self.buf_len);
-
-        var pos: usize = 0;
-        while (pos + 8 <= self.buf_len) : (pos += 8) {
-            const lane = mem.readIntLittle(u64, self.buf[pos..][0..8]);
-            acc ^= round(0, lane);
-            acc = rotl(u64, acc, 27) *% prime_1;
-            acc +%= prime_4;
-        }
-
-        if (pos + 4 <= self.buf_len) {
-            const lane = @as(u64, mem.readIntLittle(u32, self.buf[pos..][0..4]));
-            acc ^= lane *% prime_1;
-            acc = rotl(u64, acc, 23) *% prime_2;
-            acc +%= prime_3;
-            pos += 4;
-        }
-
-        while (pos < self.buf_len) : (pos += 1) {
-            const lane = @as(u64, self.buf[pos]);
-            acc ^= lane *% prime_5;
-            acc = rotl(u64, acc, 11) *% prime_1;
-        }
-
-        acc ^= acc >> 33;
-        acc *%= prime_2;
-        acc ^= acc >> 29;
-        acc *%= prime_3;
-        acc ^= acc >> 32;
-
-        return acc;
-    }
-
-    inline fn mergeAccumulator(acc: u64, other: u64) u64 {
-        const a = acc ^ round(0, other);
-        const b = a *% prime_1;
-        return b +% prime_4;
-    }
-
-    pub fn hash(seed: u64, input: []const u8) u64 {
-        var hasher = XxHash64.init(seed);
-        hasher.update(input);
-        return hasher.final();
     }
 };
 
 pub const XxHash32 = struct {
-    acc1: u32,
-    acc2: u32,
-    acc3: u32,
-    acc4: u32,
-
+    accumulator: Accumulator,
     seed: u32,
     buf: [16]u8,
     buf_len: usize,
@@ -150,13 +255,57 @@ pub const XxHash32 = struct {
     const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
     const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001
 
+    const Accumulator = struct {
+        acc1: u32,
+        acc2: u32,
+        acc3: u32,
+        acc4: u32,
+
+        fn init(seed: u32) Accumulator {
+            return .{
+                .acc1 = seed +% prime_1 +% prime_2,
+                .acc2 = seed +% prime_2,
+                .acc3 = seed,
+                .acc4 = seed -% prime_1,
+            };
+        }
+
+        fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
+            var i: usize = 0;
+
+            if (unroll_count > 0) {
+                const unrolled_bytes = unroll_count * 16;
+                while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
+                    inline for (0..unroll_count) |j| {
+                        self.processStripe(input[i + j * 16 ..][0..16]);
+                    }
+                }
+            }
+
+            while (i + 16 <= input.len) : (i += 16) {
+                self.processStripe(input[i..][0..16]);
+            }
+
+            return i;
+        }
+
+        fn processStripe(self: *Accumulator, buf: *const [16]u8) void {
+            self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
+            self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
+            self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
+            self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
+        }
+
+        fn merge(self: Accumulator) u32 {
+            return rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
+                rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
+        }
+    };
+
     pub fn init(seed: u32) XxHash32 {
         return XxHash32{
+            .accumulator = Accumulator.init(seed),
             .seed = seed,
-            .acc1 = seed +% prime_1 +% prime_2,
-            .acc2 = seed +% prime_2,
-            .acc3 = seed,
-            .acc4 = seed -% prime_1,
             .buf = undefined,
             .buf_len = 0,
             .byte_count = 0,
@@ -164,6 +313,8 @@ pub const XxHash32 = struct {
     }
 
     pub fn update(self: *XxHash32, input: []const u8) void {
+        validateType(@TypeOf(input));
+
         if (input.len < 16 - self.buf_len) {
             @memcpy(self.buf[self.buf_len..][0..input.len], input);
             self.buf_len += input.len;
@@ -175,59 +326,85 @@ pub const XxHash32 = struct {
         if (self.buf_len > 0) {
             i = 16 - self.buf_len;
             @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
-            self.processStripe(&self.buf);
+            self.accumulator.processStripe(&self.buf);
+            self.byte_count += self.buf_len;
             self.buf_len = 0;
         }
 
-        while (i + 16 <= input.len) : (i += 16) {
-            self.processStripe(input[i..][0..16]);
-        }
+        i += self.accumulator.updateEmpty(input[i..], 16);
+        self.byte_count += i;
 
         const remaining_bytes = input[i..];
         @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
         self.buf_len = remaining_bytes.len;
     }
 
-    inline fn processStripe(self: *XxHash32, buf: *const [16]u8) void {
-        self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
-        self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
-        self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
-        self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
-        self.byte_count += 16;
-    }
-
-    inline fn round(acc: u32, lane: u32) u32 {
+    fn round(acc: u32, lane: u32) u32 {
         const a = acc +% (lane *% prime_2);
         const b = rotl(u32, a, 13);
         return b *% prime_1;
     }
 
     pub fn final(self: *XxHash32) u32 {
-        var acc: u32 = undefined;
+        const unfinished = if (self.byte_count < 16)
+            self.seed +% prime_5
+        else
+            self.accumulator.merge();
 
-        if (self.byte_count < 16) {
-            acc = self.seed +% prime_5;
-        } else {
-            acc = rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
-                rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
+        return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
+    }
+
+    fn finalize(unfinished: u32, byte_count: usize, partial: anytype) u32 {
+        std.debug.assert(partial.len < 16);
+        var acc = unfinished +% @as(u32, @intCast(byte_count)) +% @as(u32, @intCast(partial.len));
+
+        switch (partial.len) {
+            inline 0, 1, 2, 3 => |count| {
+                inline for (0..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 4, 5, 6, 7 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                inline for (4..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 8, 9, 10, 11 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                acc = finalize4(acc, partial[4..8]);
+                inline for (8..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 12, 13, 14, 15 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                acc = finalize4(acc, partial[4..8]);
+                acc = finalize4(acc, partial[8..12]);
+                inline for (12..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            else => unreachable,
         }
-        acc = acc +% @as(u32, @intCast(self.byte_count)) +% @as(u32, @intCast(self.buf_len));
 
+        return avalanche(acc);
+    }
 
-        var pos: usize = 0;
-        while (pos + 4 <= self.buf_len) : (pos += 4) {
-            const lane = mem.readIntLittle(u32, self.buf[pos..][0..4]);
-            acc +%= lane *% prime_3;
-            acc = rotl(u32, acc, 17) *% prime_4;
-        }
+    fn finalize4(v: u32, bytes: *const [4]u8) u32 {
+        var acc = v;
+        const lane = mem.readIntLittle(u32, bytes);
+        acc +%= lane *% prime_3;
+        acc = rotl(u32, acc, 17) *% prime_4;
+        return acc;
+    }
 
-        while (pos < self.buf_len) : (pos += 1) {
-            const lane = @as(u32, self.buf[pos]);
-            acc +%= lane *% prime_5;
-            acc = rotl(u32, acc, 11) *% prime_1;
-        }
+    fn finalize1(v: u32, byte: u8) u32 {
+        var acc = v;
+        const lane = @as(u32, byte);
+        acc +%= lane *% prime_5;
+        acc = rotl(u32, acc, 11) *% prime_1;
+        return acc;
+    }
 
-        acc ^= acc >> 15;
+    fn avalanche(value: u32) u32 {
+        var acc = value ^ value >> 15;
         acc *%= prime_2;
         acc ^= acc >> 13;
         acc *%= prime_3;
@@ -236,33 +413,58 @@ pub const XxHash32 = struct {
         return acc;
     }
 
-    pub fn hash(seed: u32, input: []const u8) u32 {
-        var hasher = XxHash32.init(seed);
-        hasher.update(input);
-        return hasher.final();
+    pub fn hash(seed: u32, input: anytype) u32 {
+        validateType(@TypeOf(input));
+
+        if (input.len < 16) {
+            return finalize(seed +% prime_5, 0, input);
+        } else {
+            var hasher = Accumulator.init(seed);
+            const i = hasher.updateEmpty(input, 0);
+            return finalize(hasher.merge(), i, input[i..]);
+        }
     }
 };
 
-test "xxhash64" {
-    const hash = XxHash64.hash;
+fn validateType(comptime T: type) void {
+    comptime {
+        if (!((std.meta.trait.isSlice(T) or
+            std.meta.trait.is(.Array)(T) or
+            std.meta.trait.isPtrTo(.Array)(T)) and
+            std.meta.Elem(T) == u8))
+        {
+            @compileError("expected a slice, array or pointer to array of u8, got " ++ @typeName(T));
+        }
+    }
+}
 
-    try expectEqual(hash(0, ""), 0xef46db3751d8e999);
-    try expectEqual(hash(0, "a"), 0xd24ec4f1a98c6e5b);
-    try expectEqual(hash(0, "abc"), 0x44bc2cf5ad770999);
-    try expectEqual(hash(0, "message digest"), 0x066ed728fceeb3be);
-    try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0xcfe1f278fa89835c);
-    try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0xaaa46907d3047814);
-    try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0xe04a477f19ee145d);
+fn testExpect(comptime H: type, seed: anytype, input: []const u8, expected: u64) !void {
+    try expectEqual(expected, H.hash(seed, input));
+
+    var hasher = H.init(seed);
+    hasher.update(input);
+    try expectEqual(expected, hasher.final());
+}
+
+test "xxhash64" {
+    const H = XxHash64;
+    try testExpect(H, 0, "", 0xef46db3751d8e999);
+    try testExpect(H, 0, "a", 0xd24ec4f1a98c6e5b);
+    try testExpect(H, 0, "abc", 0x44bc2cf5ad770999);
+    try testExpect(H, 0, "message digest", 0x066ed728fceeb3be);
+    try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0xcfe1f278fa89835c);
+    try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0xaaa46907d3047814);
+    try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0xe04a477f19ee145d);
 }
 
 test "xxhash32" {
-    const hash = XxHash32.hash;
+    const H = XxHash32;
 
-    try expectEqual(hash(0, ""), 0x02cc5d05);
-    try expectEqual(hash(0, "a"), 0x550d7456);
-    try expectEqual(hash(0, "abc"), 0x32d153ff);
-    try expectEqual(hash(0, "message digest"), 0x7c948494);
-    try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0x63a14d5f);
-    try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x9c285e64);
-    try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x9c05f475);
+    try testExpect(H, 0, "", 0x02cc5d05);
+    try testExpect(H, 0, "a", 0x550d7456);
+    try testExpect(H, 0, "abc", 0x32d153ff);
+    try testExpect(H, 0, "message digest", 0x7c948494);
+    try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0x63a14d5f);
+    try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0x9c285e64);
+    try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0x9c05f475);
 }
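A minimal sketch of what the anytype-based `hash` API above accepts once this patch is applied; it is not part of the diff, and the test name and byte values are illustrative only. The `validateType` check admits a slice, an array by value, or a pointer to an array, as long as the element type is u8, so all three views of the same bytes should produce the same digest.

const std = @import("std");
const XxHash64 = std.hash.XxHash64;

test "xxhash64 anytype inputs (illustrative)" {
    const bytes = [_]u8{ 1, 2, 3, 4, 5, 6, 7, 8 };

    // Slice of u8 (the pre-existing calling convention).
    const from_slice = XxHash64.hash(0, @as([]const u8, &bytes));
    // Array passed by value, with a comptime-known length.
    const from_array = XxHash64.hash(0, bytes);
    // Pointer to an array, as used by the array-ptr benchmark.
    const from_ptr = XxHash64.hash(0, &bytes);

    try std.testing.expectEqual(from_slice, from_array);
    try std.testing.expectEqual(from_slice, from_ptr);
}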