Xxhash perf (#15947)

Improvements to xxHash performance, on both small keys and large slices.

* std.hash: make benchmarks consistent for xxhash

There is some odd behaviour in the timings for the XXHash benchmarks
introduced in 879f0b9, specifically the changes to the allocation in
benchmarkHash. The problem is somewhere in the difference between
9628243 and 9362d61 (these are commits that were force-pushed over but
can still be found in PR #15917).

* std.hash: correctly calculate throughput in benchmark
* std.hash: add hashes per sec to small key output
* std.hash: add exact and small xxhash routines
* std.hash: add --small-only flag to benchmark
* std.hash.xxhash: extract stateless Accumulator type
* std.hash.xxhash: cleanup hash() and improve small key perf
* std.hash.xxhash: port xxhash64 changes to xxhash32
* std.hash: change benchmark --small-only flag to --include-array
This commit is contained in:
Dominic 2023-07-25 03:47:45 +10:00 committed by GitHub
parent d82b359010
commit 559150e844
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 565 additions and 158 deletions

View File

@ -18,6 +18,7 @@ const Hash = struct {
name: []const u8, name: []const u8,
has_iterative_api: bool = true, has_iterative_api: bool = true,
has_crypto_api: bool = false, has_crypto_api: bool = false,
has_anytype_api: ?[]const comptime_int = null,
init_u8s: ?[]const u8 = null, init_u8s: ?[]const u8 = null,
init_u64: ?u64 = null, init_u64: ?u64 = null,
}; };
@ -27,11 +28,13 @@ const hashes = [_]Hash{
.ty = hash.XxHash64, .ty = hash.XxHash64,
.name = "xxhash64", .name = "xxhash64",
.init_u64 = 0, .init_u64 = 0,
.has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
}, },
Hash{ Hash{
.ty = hash.XxHash32, .ty = hash.XxHash32,
.name = "xxhash32", .name = "xxhash32",
.init_u64 = 0, .init_u64 = 0,
.has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
}, },
Hash{ Hash{
.ty = hash.Wyhash, .ty = hash.Wyhash,
@ -99,14 +102,14 @@ const Result = struct {
}; };
const block_size: usize = 8 * 8192; const block_size: usize = 8 * 8192;
const alignment: usize = 64;
pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result { pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result {
const blocks_count = bytes / block_size; var blocks = try allocator.alloc(u8, bytes);
var blocks = try allocator.alloc(u8, block_size + alignment * (blocks_count - 1));
defer allocator.free(blocks); defer allocator.free(blocks);
random.bytes(blocks); random.bytes(blocks);
const block_count = bytes / block_size;
var h = blk: { var h = blk: {
if (H.init_u8s) |init| { if (H.init_u8s) |init| {
break :blk H.ty.init(init[0..H.ty.key_length]); break :blk H.ty.init(init[0..H.ty.key_length]);
@ -118,17 +121,17 @@ pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Alloc
}; };
var timer = try Timer.start(); var timer = try Timer.start();
const start = timer.lap(); for (0..block_count) |i| {
for (0..blocks_count) |i| { h.update(blocks[i * block_size ..][0..block_size]);
h.update(blocks[i * alignment ..][0..block_size]);
} }
const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final(); const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final();
std.mem.doNotOptimizeAway(final); std.mem.doNotOptimizeAway(final);
const end = timer.read(); const elapsed_ns = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s)); const size_float: f64 = @floatFromInt(block_size * block_count);
const throughput: u64 = @intFromFloat(size_float / elapsed_s);
return Result{ return Result{
.hash = final, .hash = final,
@ -144,7 +147,6 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
const key_count = bytes / key_size; const key_count = bytes / key_size;
var timer = try Timer.start(); var timer = try Timer.start();
const start = timer.lap();
var sum: u64 = 0; var sum: u64 = 0;
for (0..key_count) |i| { for (0..key_count) |i| {
@ -164,10 +166,11 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
}; };
sum +%= final; sum +%= final;
} }
const end = timer.read(); const elapsed_ns = timer.read();
const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s)); const size_float: f64 = @floatFromInt(key_count * key_size);
const throughput: u64 = @intFromFloat(size_float / elapsed_s);
std.mem.doNotOptimizeAway(sum); std.mem.doNotOptimizeAway(sum);
@ -177,6 +180,143 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
}; };
} }
// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
// if you see strange performance changes consider using `.never_inline` or `.always_inline`
// to ensure the changes are not only due to the optimiser inlining the benchmark differently
/// Benchmarks one-shot hashing of fixed-size keys passed as pointers to arrays.
///
/// Allocates `bytes` bytes from `allocator`, fills them with random data and
/// hashes each consecutive `key_size`-byte chunk. Because `key_size` is
/// comptime-known, every key is handed to the hash as a `*[key_size]u8`.
/// Trailing bytes that do not fill a whole key are never hashed.
///
/// Returns the wrapping sum of all hash values together with the throughput in
/// bytes per second. Allocation and timer start-up errors are propagated.
pub fn benchmarkHashSmallKeysArrayPtr(
    comptime H: anytype,
    comptime key_size: usize,
    bytes: usize,
    allocator: std.mem.Allocator,
) !Result {
    const blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    var timer = try Timer.start();

    var sum: u64 = 0;
    for (0..key_count) |i| {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final: u64 = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
                } else {
                    break :blk H.ty.hash(init, small_key);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hash(init, small_key);
            }
            break :blk H.ty.hash(small_key);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    // Only whole keys are hashed, so count `key_count * key_size` bytes rather
    // than `bytes`; the two differ whenever `key_size` does not divide `bytes`.
    const size_float: f64 = @floatFromInt(key_count * key_size);
    const throughput: u64 = @intFromFloat(size_float / elapsed_s);

    // Keep the optimiser from discarding the hashing loop.
    std.mem.doNotOptimizeAway(sum);

    return Result{
        .hash = sum,
        .throughput = throughput,
    };
}
// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
// if you see strange performance changes consider using `.never_inline` or `.always_inline`
// to ensure the changes are not only due to the optimiser inlining the benchmark differently
/// Benchmarks one-shot hashing of fixed-size keys passed as arrays by value.
///
/// Allocates `bytes` bytes from `allocator`, fills them with random data and
/// hashes each consecutive `key_size`-byte chunk. Every chunk is dereferenced
/// (`small_key.*`) so that non-crypto hashes receive a `[key_size]u8` value
/// rather than a pointer or slice. Trailing bytes that do not fill a whole key
/// are never hashed.
///
/// Returns the wrapping sum of all hash values together with the throughput in
/// bytes per second. Allocation and timer start-up errors are propagated.
pub fn benchmarkHashSmallKeysArray(
    comptime H: anytype,
    comptime key_size: usize,
    bytes: usize,
    allocator: std.mem.Allocator,
) !Result {
    const blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    // The loop shape is intentionally left as a plain `while`: this benchmark is
    // sensitive to how the optimiser treats the loop body.
    var i: usize = 0;
    var timer = try Timer.start();

    var sum: u64 = 0;
    while (i < key_count) : (i += 1) {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final: u64 = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
                } else {
                    break :blk H.ty.hash(init, small_key.*);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hash(init, small_key.*);
            }
            break :blk H.ty.hash(small_key.*);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    // Only whole keys are hashed, so count `key_count * key_size` bytes rather
    // than `bytes`; the two differ whenever `key_size` does not divide `bytes`.
    const size_float: f64 = @floatFromInt(key_count * key_size);
    const throughput: u64 = @intFromFloat(size_float / elapsed_s);

    // Keep the optimiser from discarding the hashing loop.
    std.mem.doNotOptimizeAway(sum);

    return Result{
        .hash = sum,
        .throughput = throughput,
    };
}
/// Benchmarks the `hashSmall` entry point of a hash on runtime-sized keys.
///
/// Allocates `bytes` bytes from `allocator`, fills them with random data and
/// hashes each consecutive `key_size`-byte slice. Hashes with a crypto API go
/// through `toInt`; all others are called via `H.ty.hashSmall`, which the hash
/// type is therefore required to provide. Trailing bytes that do not fill a
/// whole key are never hashed.
///
/// Returns the wrapping sum of all hash values together with the throughput in
/// bytes per second. Allocation and timer start-up errors are propagated.
pub fn benchmarkHashSmallApi(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
    const blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    var timer = try Timer.start();

    var sum: u64 = 0;
    for (0..key_count) |i| {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final: u64 = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
                } else {
                    break :blk H.ty.hashSmall(init, small_key);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hashSmall(init, small_key);
            }
            break :blk H.ty.hashSmall(small_key);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    // Only whole keys are hashed, so count `key_count * key_size` bytes rather
    // than `bytes`; the two differ whenever `key_size` does not divide `bytes`.
    const size_float: f64 = @floatFromInt(key_count * key_size);
    const throughput: u64 = @intFromFloat(size_float / elapsed_s);

    // Keep the optimiser from discarding the hashing loop.
    std.mem.doNotOptimizeAway(sum);

    return Result{
        .throughput = throughput,
        .hash = sum,
    };
}
fn usage() void { fn usage() void {
std.debug.print( std.debug.print(
\\throughput_test [options] \\throughput_test [options]
@ -205,9 +345,12 @@ pub fn main() !void {
var filter: ?[]u8 = ""; var filter: ?[]u8 = "";
var count: usize = mode(128 * MiB); var count: usize = mode(128 * MiB);
var key_size: usize = 32; var key_size: ?usize = null;
var seed: u32 = 0; var seed: u32 = 0;
var test_iterative_only = false; var test_iterative_only = false;
var test_arrays = false;
const default_small_key_size = 32;
var i: usize = 1; var i: usize = 1;
while (i < args.len) : (i += 1) { while (i < args.len) : (i += 1) {
@ -248,12 +391,14 @@ pub fn main() !void {
} }
key_size = try std.fmt.parseUnsigned(usize, args[i], 10); key_size = try std.fmt.parseUnsigned(usize, args[i], 10);
if (key_size > block_size) { if (key_size.? > block_size) {
try stdout.print("key_size cannot exceed block size of {}\n", .{block_size}); try stdout.print("key_size cannot exceed block size of {}\n", .{block_size});
std.os.exit(1); std.os.exit(1);
} }
} else if (std.mem.eql(u8, args[i], "--iterative-only")) { } else if (std.mem.eql(u8, args[i], "--iterative-only")) {
test_iterative_only = true; test_iterative_only = true;
} else if (std.mem.eql(u8, args[i], "--include-array")) {
test_arrays = true;
} else if (std.mem.eql(u8, args[i], "--help")) { } else if (std.mem.eql(u8, args[i], "--help")) {
usage(); usage();
return; return;
@ -268,7 +413,7 @@ pub fn main() !void {
const allocator = gpa.allocator(); const allocator = gpa.allocator();
inline for (hashes) |H| { inline for (hashes) |H| {
if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) { if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) hash: {
if (!test_iterative_only or H.has_iterative_api) { if (!test_iterative_only or H.has_iterative_api) {
try stdout.print("{s}\n", .{H.name}); try stdout.print("{s}\n", .{H.name});
@ -281,9 +426,69 @@ pub fn main() !void {
} }
if (!test_iterative_only) { if (!test_iterative_only) {
prng.seed(seed); if (key_size) |size| {
const result_small = try benchmarkHashSmallKeys(H, key_size, count, allocator); prng.seed(seed);
try stdout.print(" small keys: {:5} MiB/s [{x:0<16}]\n", .{ result_small.throughput / (1 * MiB), result_small.hash }); const result_small = try benchmarkHashSmallKeys(H, size, count, allocator);
try stdout.print(" small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
size,
result_small.throughput / (1 * MiB),
result_small.throughput / size,
result_small.hash,
});
if (!test_arrays) break :hash;
if (H.has_anytype_api) |sizes| {
inline for (sizes) |exact_size| {
if (size == exact_size) {
prng.seed(seed);
const result_array = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
prng.seed(seed);
const result_ptr = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
try stdout.print(" array: {:5} MiB/s [{x:0<16}]\n", .{
result_array.throughput / (1 * MiB),
result_array.hash,
});
try stdout.print(" array ptr: {:5} MiB/s [{x:0<16}]\n", .{
result_ptr.throughput / (1 * MiB),
result_ptr.hash,
});
}
}
}
} else {
prng.seed(seed);
const result_small = try benchmarkHashSmallKeys(H, default_small_key_size, count, allocator);
try stdout.print(" small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
default_small_key_size,
result_small.throughput / (1 * MiB),
result_small.throughput / default_small_key_size,
result_small.hash,
});
if (!test_arrays) break :hash;
if (H.has_anytype_api) |sizes| {
try stdout.print(" array:\n", .{});
inline for (sizes) |exact_size| {
prng.seed(seed);
const result = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
try stdout.print(" {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
exact_size,
result.throughput / (1 * MiB),
result.hash,
});
}
try stdout.print(" array ptr: \n", .{});
inline for (sizes) |exact_size| {
prng.seed(seed);
const result = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
try stdout.print(" {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
exact_size,
result.throughput / (1 * MiB),
result.hash,
});
}
}
}
} }
} }
} }

View File

@ -5,11 +5,7 @@ const expectEqual = std.testing.expectEqual;
const rotl = std.math.rotl; const rotl = std.math.rotl;
pub const XxHash64 = struct { pub const XxHash64 = struct {
acc1: u64, accumulator: Accumulator,
acc2: u64,
acc3: u64,
acc4: u64,
seed: u64, seed: u64,
buf: [32]u8, buf: [32]u8,
buf_len: usize, buf_len: usize,
@ -21,20 +17,174 @@ pub const XxHash64 = struct {
const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011 const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011
const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101 const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101
/// The four 64-bit lanes of xxHash64 state, kept separate from the streaming
/// buffer so the same code can serve both `update` and the one-shot `hash`.
const Accumulator = struct {
    acc1: u64,
    acc2: u64,
    acc3: u64,
    acc4: u64,

    /// Derives the initial value of each lane from `seed`.
    fn init(seed: u64) Accumulator {
        return Accumulator{
            .acc1 = seed +% prime_1 +% prime_2,
            .acc2 = seed +% prime_2,
            .acc3 = seed,
            .acc4 = seed -% prime_1,
        };
    }

    /// Consumes as many whole 32-byte stripes of `input` as possible and
    /// returns the number of bytes consumed (always a multiple of 32).
    /// When `unroll_count` is non-zero, stripes are first processed in
    /// comptime-unrolled groups of `unroll_count`; any remaining whole
    /// stripes are handled one at a time.
    fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
        var consumed: usize = 0;

        if (unroll_count > 0) {
            const group_bytes = unroll_count * 32;
            while (consumed + group_bytes <= input.len) : (consumed += group_bytes) {
                inline for (0..unroll_count) |stripe_index| {
                    self.processStripe(input[consumed + stripe_index * 32 ..][0..32]);
                }
            }
        }

        while (consumed + 32 <= input.len) : (consumed += 32) {
            self.processStripe(input[consumed..][0..32]);
        }

        return consumed;
    }

    /// Feeds one 32-byte stripe into the lanes, one little-endian u64 per lane.
    fn processStripe(self: *Accumulator, buf: *const [32]u8) void {
        const lane1 = mem.readIntLittle(u64, buf[0..8]);
        const lane2 = mem.readIntLittle(u64, buf[8..16]);
        const lane3 = mem.readIntLittle(u64, buf[16..24]);
        const lane4 = mem.readIntLittle(u64, buf[24..32]);

        self.acc1 = round(self.acc1, lane1);
        self.acc2 = round(self.acc2, lane2);
        self.acc3 = round(self.acc3, lane3);
        self.acc4 = round(self.acc4, lane4);
    }

    /// Collapses the four lanes into a single u64.
    fn merge(self: Accumulator) u64 {
        var merged = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
            rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);

        // Lanes are folded in strictly in order acc1..acc4.
        const lanes = [4]u64{ self.acc1, self.acc2, self.acc3, self.acc4 };
        for (lanes) |lane| {
            merged = mergeAccumulator(merged, lane);
        }
        return merged;
    }

    /// Folds one lane value `other` into the running merge value `acc`.
    fn mergeAccumulator(acc: u64, other: u64) u64 {
        return ((acc ^ round(0, other)) *% prime_1) +% prime_4;
    }
};
/// Completes a hash from an intermediate value and the final bytes of input.
///
/// `unfinished` is the value to start from, `byte_count` is added to it together
/// with `partial.len`, and `partial` holds the trailing bytes that have not been
/// mixed in yet. `partial.len` must be below 32 (asserted).
///
/// Each switch prong covers four adjacent lengths and is instantiated per length,
/// so `count` is comptime-known inside it: the prong consumes as many 8-byte
/// chunks as fit, then at most one 4-byte chunk, then the leftover single bytes,
/// and finally applies `avalanche`.
fn finalize(
unfinished: u64,
byte_count: usize,
partial: anytype,
) u64 {
std.debug.assert(partial.len < 32);
var acc = unfinished +% @as(u64, byte_count) +% @as(u64, partial.len);
switch (partial.len) {
// 0..3 bytes: single bytes only.
inline 0, 1, 2, 3 => |count| {
inline for (0..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 4..7 bytes: one 4-byte chunk, then single bytes.
inline 4, 5, 6, 7 => |count| {
acc = finalize4(acc, partial[0..4]);
inline for (4..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 8..11 bytes: one 8-byte chunk, then single bytes.
inline 8, 9, 10, 11 => |count| {
acc = finalize8(acc, partial[0..8]);
inline for (8..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 12..15 bytes: 8-byte chunk, 4-byte chunk, then single bytes.
inline 12, 13, 14, 15 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize4(acc, partial[8..12]);
inline for (12..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 16..19 bytes: two 8-byte chunks, then single bytes.
inline 16, 17, 18, 19 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
inline for (16..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 20..23 bytes: two 8-byte chunks, 4-byte chunk, then single bytes.
inline 20, 21, 22, 23 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
acc = finalize4(acc, partial[16..20]);
inline for (20..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 24..27 bytes: three 8-byte chunks, then single bytes.
inline 24, 25, 26, 27 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
acc = finalize8(acc, partial[16..24]);
inline for (24..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// 28..31 bytes: three 8-byte chunks, 4-byte chunk, then single bytes.
inline 28, 29, 30, 31 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
acc = finalize8(acc, partial[16..24]);
acc = finalize4(acc, partial[24..28]);
inline for (28..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
// Lengths of 32 or more are excluded by the assertion above.
else => unreachable,
}
}
/// Mixes one little-endian 8-byte chunk into the running value `v`.
fn finalize8(v: u64, bytes: *const [8]u8) u64 {
    const mixed = v ^ round(0, mem.readIntLittle(u64, bytes));
    return (rotl(u64, mixed, 27) *% prime_1) +% prime_4;
}
/// Mixes one little-endian 4-byte chunk, widened to u64, into the running value `v`.
fn finalize4(v: u64, bytes: *const [4]u8) u64 {
    const lane: u64 = mem.readIntLittle(u32, bytes);
    const mixed = v ^ (lane *% prime_1);
    return (rotl(u64, mixed, 23) *% prime_2) +% prime_3;
}
/// Mixes a single byte, widened to u64, into the running value `v`.
fn finalize1(v: u64, byte: u8) u64 {
    const mixed = v ^ (@as(u64, byte) *% prime_5);
    return rotl(u64, mixed, 11) *% prime_1;
}
/// Final bit-mixing step: alternating xor-shifts (33, 29, 32) and wrapping
/// multiplications by `prime_2` and `prime_3`.
fn avalanche(value: u64) u64 {
    var h = value;
    h ^= h >> 33;
    h *%= prime_2;
    h ^= h >> 29;
    h *%= prime_3;
    h ^= h >> 32;
    return h;
}
pub fn init(seed: u64) XxHash64 { pub fn init(seed: u64) XxHash64 {
return XxHash64{ return XxHash64{
.accumulator = Accumulator.init(seed),
.seed = seed, .seed = seed,
.acc1 = seed +% prime_1 +% prime_2,
.acc2 = seed +% prime_2,
.acc3 = seed,
.acc4 = seed -% prime_1,
.buf = undefined, .buf = undefined,
.buf_len = 0, .buf_len = 0,
.byte_count = 0, .byte_count = 0,
}; };
} }
pub fn update(self: *XxHash64, input: []const u8) void { pub fn update(self: *XxHash64, input: anytype) void {
validateType(@TypeOf(input));
if (input.len < 32 - self.buf_len) { if (input.len < 32 - self.buf_len) {
@memcpy(self.buf[self.buf_len..][0..input.len], input); @memcpy(self.buf[self.buf_len..][0..input.len], input);
self.buf_len += input.len; self.buf_len += input.len;
@ -46,99 +196,54 @@ pub const XxHash64 = struct {
if (self.buf_len > 0) { if (self.buf_len > 0) {
i = 32 - self.buf_len; i = 32 - self.buf_len;
@memcpy(self.buf[self.buf_len..][0..i], input[0..i]); @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
self.processStripe(&self.buf); self.accumulator.processStripe(&self.buf);
self.buf_len = 0; self.byte_count += self.buf_len;
} }
while (i + 32 <= input.len) : (i += 32) { i += self.accumulator.updateEmpty(input[i..], 32);
self.processStripe(input[i..][0..32]); self.byte_count += i;
}
const remaining_bytes = input[i..]; const remaining_bytes = input[i..];
@memcpy(self.buf[0..remaining_bytes.len], remaining_bytes); @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
self.buf_len = remaining_bytes.len; self.buf_len = remaining_bytes.len;
} }
inline fn processStripe(self: *XxHash64, buf: *const [32]u8) void { fn round(acc: u64, lane: u64) u64 {
self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
self.byte_count += 32;
}
inline fn round(acc: u64, lane: u64) u64 {
const a = acc +% (lane *% prime_2); const a = acc +% (lane *% prime_2);
const b = rotl(u64, a, 31); const b = rotl(u64, a, 31);
return b *% prime_1; return b *% prime_1;
} }
pub fn final(self: *XxHash64) u64 { pub fn final(self: *XxHash64) u64 {
var acc: u64 = undefined; const unfinished = if (self.byte_count < 32)
self.seed +% prime_5
else
self.accumulator.merge();
if (self.byte_count < 32) { return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
acc = self.seed +% prime_5; }
const Size = enum {
small,
large,
unknown,
};
pub fn hash(seed: u64, input: anytype) u64 {
validateType(@TypeOf(input));
if (input.len < 32) {
return finalize(seed +% prime_5, 0, input);
} else { } else {
acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +% var hasher = Accumulator.init(seed);
rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18); const i = hasher.updateEmpty(input, 0);
acc = mergeAccumulator(acc, self.acc1); return finalize(hasher.merge(), i, input[i..]);
acc = mergeAccumulator(acc, self.acc2);
acc = mergeAccumulator(acc, self.acc3);
acc = mergeAccumulator(acc, self.acc4);
} }
acc = acc +% @as(u64, self.byte_count) +% @as(u64, self.buf_len);
var pos: usize = 0;
while (pos + 8 <= self.buf_len) : (pos += 8) {
const lane = mem.readIntLittle(u64, self.buf[pos..][0..8]);
acc ^= round(0, lane);
acc = rotl(u64, acc, 27) *% prime_1;
acc +%= prime_4;
}
if (pos + 4 <= self.buf_len) {
const lane = @as(u64, mem.readIntLittle(u32, self.buf[pos..][0..4]));
acc ^= lane *% prime_1;
acc = rotl(u64, acc, 23) *% prime_2;
acc +%= prime_3;
pos += 4;
}
while (pos < self.buf_len) : (pos += 1) {
const lane = @as(u64, self.buf[pos]);
acc ^= lane *% prime_5;
acc = rotl(u64, acc, 11) *% prime_1;
}
acc ^= acc >> 33;
acc *%= prime_2;
acc ^= acc >> 29;
acc *%= prime_3;
acc ^= acc >> 32;
return acc;
}
inline fn mergeAccumulator(acc: u64, other: u64) u64 {
const a = acc ^ round(0, other);
const b = a *% prime_1;
return b +% prime_4;
}
pub fn hash(seed: u64, input: []const u8) u64 {
var hasher = XxHash64.init(seed);
hasher.update(input);
return hasher.final();
} }
}; };
pub const XxHash32 = struct { pub const XxHash32 = struct {
acc1: u32, accumulator: Accumulator,
acc2: u32,
acc3: u32,
acc4: u32,
seed: u32, seed: u32,
buf: [16]u8, buf: [16]u8,
buf_len: usize, buf_len: usize,
@ -150,13 +255,57 @@ pub const XxHash32 = struct {
const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111 const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001 const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001
/// The four 32-bit lanes of xxHash32 state, kept separate from the streaming
/// buffer so the same code can serve both `update` and the one-shot `hash`.
const Accumulator = struct {
    acc1: u32,
    acc2: u32,
    acc3: u32,
    acc4: u32,

    /// Derives the initial value of each lane from `seed`.
    fn init(seed: u32) Accumulator {
        return Accumulator{
            .acc1 = seed +% prime_1 +% prime_2,
            .acc2 = seed +% prime_2,
            .acc3 = seed,
            .acc4 = seed -% prime_1,
        };
    }

    /// Consumes as many whole 16-byte stripes of `input` as possible and
    /// returns the number of bytes consumed (always a multiple of 16).
    /// When `unroll_count` is non-zero, stripes are first processed in
    /// comptime-unrolled groups of `unroll_count`; any remaining whole
    /// stripes are handled one at a time.
    fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
        var consumed: usize = 0;

        if (unroll_count > 0) {
            const group_bytes = unroll_count * 16;
            while (consumed + group_bytes <= input.len) : (consumed += group_bytes) {
                inline for (0..unroll_count) |stripe_index| {
                    self.processStripe(input[consumed + stripe_index * 16 ..][0..16]);
                }
            }
        }

        while (consumed + 16 <= input.len) : (consumed += 16) {
            self.processStripe(input[consumed..][0..16]);
        }

        return consumed;
    }

    /// Feeds one 16-byte stripe into the lanes, one little-endian u32 per lane.
    fn processStripe(self: *Accumulator, buf: *const [16]u8) void {
        const lane1 = mem.readIntLittle(u32, buf[0..4]);
        const lane2 = mem.readIntLittle(u32, buf[4..8]);
        const lane3 = mem.readIntLittle(u32, buf[8..12]);
        const lane4 = mem.readIntLittle(u32, buf[12..16]);

        self.acc1 = round(self.acc1, lane1);
        self.acc2 = round(self.acc2, lane2);
        self.acc3 = round(self.acc3, lane3);
        self.acc4 = round(self.acc4, lane4);
    }

    /// Collapses the four lanes into a single u32 by summing their rotations.
    fn merge(self: Accumulator) u32 {
        const r1 = rotl(u32, self.acc1, 1);
        const r2 = rotl(u32, self.acc2, 7);
        const r3 = rotl(u32, self.acc3, 12);
        const r4 = rotl(u32, self.acc4, 18);
        return r1 +% r2 +% r3 +% r4;
    }
};
pub fn init(seed: u32) XxHash32 { pub fn init(seed: u32) XxHash32 {
return XxHash32{ return XxHash32{
.accumulator = Accumulator.init(seed),
.seed = seed, .seed = seed,
.acc1 = seed +% prime_1 +% prime_2,
.acc2 = seed +% prime_2,
.acc3 = seed,
.acc4 = seed -% prime_1,
.buf = undefined, .buf = undefined,
.buf_len = 0, .buf_len = 0,
.byte_count = 0, .byte_count = 0,
@ -164,6 +313,8 @@ pub const XxHash32 = struct {
} }
pub fn update(self: *XxHash32, input: []const u8) void { pub fn update(self: *XxHash32, input: []const u8) void {
validateType(@TypeOf(input));
if (input.len < 16 - self.buf_len) { if (input.len < 16 - self.buf_len) {
@memcpy(self.buf[self.buf_len..][0..input.len], input); @memcpy(self.buf[self.buf_len..][0..input.len], input);
self.buf_len += input.len; self.buf_len += input.len;
@ -175,59 +326,85 @@ pub const XxHash32 = struct {
if (self.buf_len > 0) { if (self.buf_len > 0) {
i = 16 - self.buf_len; i = 16 - self.buf_len;
@memcpy(self.buf[self.buf_len..][0..i], input[0..i]); @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
self.processStripe(&self.buf); self.accumulator.processStripe(&self.buf);
self.byte_count += self.buf_len;
self.buf_len = 0; self.buf_len = 0;
} }
while (i + 16 <= input.len) : (i += 16) { i += self.accumulator.updateEmpty(input[i..], 16);
self.processStripe(input[i..][0..16]); self.byte_count += i;
}
const remaining_bytes = input[i..]; const remaining_bytes = input[i..];
@memcpy(self.buf[0..remaining_bytes.len], remaining_bytes); @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
self.buf_len = remaining_bytes.len; self.buf_len = remaining_bytes.len;
} }
inline fn processStripe(self: *XxHash32, buf: *const [16]u8) void { fn round(acc: u32, lane: u32) u32 {
self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
self.byte_count += 16;
}
inline fn round(acc: u32, lane: u32) u32 {
const a = acc +% (lane *% prime_2); const a = acc +% (lane *% prime_2);
const b = rotl(u32, a, 13); const b = rotl(u32, a, 13);
return b *% prime_1; return b *% prime_1;
} }
pub fn final(self: *XxHash32) u32 { pub fn final(self: *XxHash32) u32 {
var acc: u32 = undefined; const unfinished = if (self.byte_count < 16)
self.seed +% prime_5
else
self.accumulator.merge();
if (self.byte_count < 16) { return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
acc = self.seed +% prime_5; }
} else {
acc = rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +% fn finalize(unfinished: u32, byte_count: usize, partial: anytype) u32 {
rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18); std.debug.assert(partial.len < 16);
var acc = unfinished +% @as(u32, @intCast(byte_count)) +% @as(u32, @intCast(partial.len));
switch (partial.len) {
inline 0, 1, 2, 3 => |count| {
inline for (0..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 4, 5, 6, 7 => |count| {
acc = finalize4(acc, partial[0..4]);
inline for (4..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 8, 9, 10, 11 => |count| {
acc = finalize4(acc, partial[0..4]);
acc = finalize4(acc, partial[4..8]);
inline for (8..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 12, 13, 14, 15 => |count| {
acc = finalize4(acc, partial[0..4]);
acc = finalize4(acc, partial[4..8]);
acc = finalize4(acc, partial[8..12]);
inline for (12..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
else => unreachable,
} }
acc = acc +% @as(u32, @intCast(self.byte_count)) +% @as(u32, @intCast(self.buf_len)); return avalanche(acc);
}
var pos: usize = 0; fn finalize4(v: u32, bytes: *const [4]u8) u32 {
while (pos + 4 <= self.buf_len) : (pos += 4) { var acc = v;
const lane = mem.readIntLittle(u32, self.buf[pos..][0..4]); const lane = mem.readIntLittle(u32, bytes);
acc +%= lane *% prime_3; acc +%= lane *% prime_3;
acc = rotl(u32, acc, 17) *% prime_4; acc = rotl(u32, acc, 17) *% prime_4;
} return acc;
}
while (pos < self.buf_len) : (pos += 1) { fn finalize1(v: u32, byte: u8) u32 {
const lane = @as(u32, self.buf[pos]); var acc = v;
acc +%= lane *% prime_5; const lane = @as(u32, byte);
acc = rotl(u32, acc, 11) *% prime_1; acc +%= lane *% prime_5;
} acc = rotl(u32, acc, 11) *% prime_1;
return acc;
}
acc ^= acc >> 15; fn avalanche(value: u32) u32 {
var acc = value ^ value >> 15;
acc *%= prime_2; acc *%= prime_2;
acc ^= acc >> 13; acc ^= acc >> 13;
acc *%= prime_3; acc *%= prime_3;
@ -236,33 +413,58 @@ pub const XxHash32 = struct {
return acc; return acc;
} }
pub fn hash(seed: u32, input: []const u8) u32 { pub fn hash(seed: u32, input: anytype) u32 {
var hasher = XxHash32.init(seed); validateType(@TypeOf(input));
hasher.update(input);
return hasher.final(); if (input.len < 16) {
return finalize(seed +% prime_5, 0, input);
} else {
var hasher = Accumulator.init(seed);
const i = hasher.updateEmpty(input, 0);
return finalize(hasher.merge(), i, input[i..]);
}
} }
}; };
test "xxhash64" { fn validateType(comptime T: type) void {
const hash = XxHash64.hash; comptime {
if (!((std.meta.trait.isSlice(T) or
std.meta.trait.is(.Array)(T) or
std.meta.trait.isPtrTo(.Array)(T)) and
std.meta.Elem(T) == u8))
{
@compileError("expect a slice, array or pointer to array of u8, got " ++ @typeName(T));
}
}
}
try expectEqual(hash(0, ""), 0xef46db3751d8e999); fn testExpect(comptime H: type, seed: anytype, input: []const u8, expected: u64) !void {
try expectEqual(hash(0, "a"), 0xd24ec4f1a98c6e5b); try expectEqual(expected, H.hash(0, input));
try expectEqual(hash(0, "abc"), 0x44bc2cf5ad770999);
try expectEqual(hash(0, "message digest"), 0x066ed728fceeb3be); var hasher = H.init(seed);
try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0xcfe1f278fa89835c); hasher.update(input);
try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0xaaa46907d3047814); try expectEqual(expected, hasher.final());
try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0xe04a477f19ee145d); }
test "xxhash64" {
const H = XxHash64;
try testExpect(H, 0, "", 0xef46db3751d8e999);
try testExpect(H, 0, "a", 0xd24ec4f1a98c6e5b);
try testExpect(H, 0, "abc", 0x44bc2cf5ad770999);
try testExpect(H, 0, "message digest", 0x066ed728fceeb3be);
try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0xcfe1f278fa89835c);
try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0xaaa46907d3047814);
try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0xe04a477f19ee145d);
} }
test "xxhash32" { test "xxhash32" {
const hash = XxHash32.hash; const H = XxHash32;
try expectEqual(hash(0, ""), 0x02cc5d05); try testExpect(H, 0, "", 0x02cc5d05);
try expectEqual(hash(0, "a"), 0x550d7456); try testExpect(H, 0, "a", 0x550d7456);
try expectEqual(hash(0, "abc"), 0x32d153ff); try testExpect(H, 0, "abc", 0x32d153ff);
try expectEqual(hash(0, "message digest"), 0x7c948494); try testExpect(H, 0, "message digest", 0x7c948494);
try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0x63a14d5f); try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0x63a14d5f);
try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x9c285e64); try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0x9c285e64);
try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x9c05f475); try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0x9c05f475);
} }