zig/lib/std/hash/xxhash.zig
Dominic 559150e844
Xxhash perf (#15947)
Improvements for xxHash performance, both on small keys as well as large slices.

* std.hash: make benchmarks consistent for xxhash

There is some odd behaviour in the timings for the XXHash benchmarks
introduced in 879f0b9, specifically the changes to the allocation in
benchmarkHash. The problem is somewhere in the difference between
9628243 and 9362d61 (these are commit that were force-pushed over but
        can be found in PR #15917).

* std.hash: correctly calculate throughput in benchmark
* std.hash: add hashes per sec to small key output
* std.hash: add exact and small xxhash routines
* std.hash: add --small-only flag to benchmark
* std.hash.xxhash: extract stateless Accumulator type
* std.hash.xxhash: cleanup hash() and improve small key perf
* std.hash.xxhash: port xxhash64 changes to xxhash32
* std.hash: change benchmark --small-only flag to --include-array
2023-07-24 13:47:45 -04:00

471 lines
16 KiB
Zig

const std = @import("std");
const mem = std.mem;
const expectEqual = std.testing.expectEqual;
const rotl = std.math.rotl;
pub const XxHash64 = struct {
accumulator: Accumulator,
seed: u64,
buf: [32]u8,
buf_len: usize,
byte_count: usize,
const prime_1 = 0x9E3779B185EBCA87; // 0b1001111000110111011110011011000110000101111010111100101010000111
const prime_2 = 0xC2B2AE3D27D4EB4F; // 0b1100001010110010101011100011110100100111110101001110101101001111
const prime_3 = 0x165667B19E3779F9; // 0b0001011001010110011001111011000110011110001101110111100111111001
const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011
const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101
const Accumulator = struct {
acc1: u64,
acc2: u64,
acc3: u64,
acc4: u64,
fn init(seed: u64) Accumulator {
return .{
.acc1 = seed +% prime_1 +% prime_2,
.acc2 = seed +% prime_2,
.acc3 = seed,
.acc4 = seed -% prime_1,
};
}
fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
var i: usize = 0;
if (unroll_count > 0) {
const unrolled_bytes = unroll_count * 32;
while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
inline for (0..unroll_count) |j| {
self.processStripe(input[i + j * 32 ..][0..32]);
}
}
}
while (i + 32 <= input.len) : (i += 32) {
self.processStripe(input[i..][0..32]);
}
return i;
}
fn processStripe(self: *Accumulator, buf: *const [32]u8) void {
self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
}
fn merge(self: Accumulator) u64 {
var acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
acc = mergeAccumulator(acc, self.acc1);
acc = mergeAccumulator(acc, self.acc2);
acc = mergeAccumulator(acc, self.acc3);
acc = mergeAccumulator(acc, self.acc4);
return acc;
}
fn mergeAccumulator(acc: u64, other: u64) u64 {
const a = acc ^ round(0, other);
const b = a *% prime_1;
return b +% prime_4;
}
};
fn finalize(
unfinished: u64,
byte_count: usize,
partial: anytype,
) u64 {
std.debug.assert(partial.len < 32);
var acc = unfinished +% @as(u64, byte_count) +% @as(u64, partial.len);
switch (partial.len) {
inline 0, 1, 2, 3 => |count| {
inline for (0..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 4, 5, 6, 7 => |count| {
acc = finalize4(acc, partial[0..4]);
inline for (4..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 8, 9, 10, 11 => |count| {
acc = finalize8(acc, partial[0..8]);
inline for (8..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 12, 13, 14, 15 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize4(acc, partial[8..12]);
inline for (12..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 16, 17, 18, 19 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
inline for (16..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 20, 21, 22, 23 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
acc = finalize4(acc, partial[16..20]);
inline for (20..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 24, 25, 26, 27 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
acc = finalize8(acc, partial[16..24]);
inline for (24..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 28, 29, 30, 31 => |count| {
acc = finalize8(acc, partial[0..8]);
acc = finalize8(acc, partial[8..16]);
acc = finalize8(acc, partial[16..24]);
acc = finalize4(acc, partial[24..28]);
inline for (28..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
else => unreachable,
}
}
fn finalize8(v: u64, bytes: *const [8]u8) u64 {
var acc = v;
const lane = mem.readIntLittle(u64, bytes);
acc ^= round(0, lane);
acc = rotl(u64, acc, 27) *% prime_1;
acc +%= prime_4;
return acc;
}
fn finalize4(v: u64, bytes: *const [4]u8) u64 {
var acc = v;
const lane = @as(u64, mem.readIntLittle(u32, bytes));
acc ^= lane *% prime_1;
acc = rotl(u64, acc, 23) *% prime_2;
acc +%= prime_3;
return acc;
}
fn finalize1(v: u64, byte: u8) u64 {
var acc = v;
const lane = @as(u64, byte);
acc ^= lane *% prime_5;
acc = rotl(u64, acc, 11) *% prime_1;
return acc;
}
fn avalanche(value: u64) u64 {
var result = value ^ (value >> 33);
result *%= prime_2;
result ^= result >> 29;
result *%= prime_3;
result ^= result >> 32;
return result;
}
pub fn init(seed: u64) XxHash64 {
return XxHash64{
.accumulator = Accumulator.init(seed),
.seed = seed,
.buf = undefined,
.buf_len = 0,
.byte_count = 0,
};
}
pub fn update(self: *XxHash64, input: anytype) void {
validateType(@TypeOf(input));
if (input.len < 32 - self.buf_len) {
@memcpy(self.buf[self.buf_len..][0..input.len], input);
self.buf_len += input.len;
return;
}
var i: usize = 0;
if (self.buf_len > 0) {
i = 32 - self.buf_len;
@memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
self.accumulator.processStripe(&self.buf);
self.byte_count += self.buf_len;
}
i += self.accumulator.updateEmpty(input[i..], 32);
self.byte_count += i;
const remaining_bytes = input[i..];
@memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
self.buf_len = remaining_bytes.len;
}
fn round(acc: u64, lane: u64) u64 {
const a = acc +% (lane *% prime_2);
const b = rotl(u64, a, 31);
return b *% prime_1;
}
pub fn final(self: *XxHash64) u64 {
const unfinished = if (self.byte_count < 32)
self.seed +% prime_5
else
self.accumulator.merge();
return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
}
const Size = enum {
small,
large,
unknown,
};
pub fn hash(seed: u64, input: anytype) u64 {
validateType(@TypeOf(input));
if (input.len < 32) {
return finalize(seed +% prime_5, 0, input);
} else {
var hasher = Accumulator.init(seed);
const i = hasher.updateEmpty(input, 0);
return finalize(hasher.merge(), i, input[i..]);
}
}
};
pub const XxHash32 = struct {
accumulator: Accumulator,
seed: u32,
buf: [16]u8,
buf_len: usize,
byte_count: usize,
const prime_1 = 0x9E3779B1; // 0b10011110001101110111100110110001
const prime_2 = 0x85EBCA77; // 0b10000101111010111100101001110111
const prime_3 = 0xC2B2AE3D; // 0b11000010101100101010111000111101
const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001
const Accumulator = struct {
acc1: u32,
acc2: u32,
acc3: u32,
acc4: u32,
fn init(seed: u32) Accumulator {
return .{
.acc1 = seed +% prime_1 +% prime_2,
.acc2 = seed +% prime_2,
.acc3 = seed,
.acc4 = seed -% prime_1,
};
}
fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
var i: usize = 0;
if (unroll_count > 0) {
const unrolled_bytes = unroll_count * 16;
while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
inline for (0..unroll_count) |j| {
self.processStripe(input[i + j * 16 ..][0..16]);
}
}
}
while (i + 16 <= input.len) : (i += 16) {
self.processStripe(input[i..][0..16]);
}
return i;
}
fn processStripe(self: *Accumulator, buf: *const [16]u8) void {
self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
}
fn merge(self: Accumulator) u32 {
return rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
}
};
pub fn init(seed: u32) XxHash32 {
return XxHash32{
.accumulator = Accumulator.init(seed),
.seed = seed,
.buf = undefined,
.buf_len = 0,
.byte_count = 0,
};
}
pub fn update(self: *XxHash32, input: []const u8) void {
validateType(@TypeOf(input));
if (input.len < 16 - self.buf_len) {
@memcpy(self.buf[self.buf_len..][0..input.len], input);
self.buf_len += input.len;
return;
}
var i: usize = 0;
if (self.buf_len > 0) {
i = 16 - self.buf_len;
@memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
self.accumulator.processStripe(&self.buf);
self.byte_count += self.buf_len;
self.buf_len = 0;
}
i += self.accumulator.updateEmpty(input[i..], 16);
self.byte_count += i;
const remaining_bytes = input[i..];
@memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
self.buf_len = remaining_bytes.len;
}
fn round(acc: u32, lane: u32) u32 {
const a = acc +% (lane *% prime_2);
const b = rotl(u32, a, 13);
return b *% prime_1;
}
pub fn final(self: *XxHash32) u32 {
const unfinished = if (self.byte_count < 16)
self.seed +% prime_5
else
self.accumulator.merge();
return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
}
fn finalize(unfinished: u32, byte_count: usize, partial: anytype) u32 {
std.debug.assert(partial.len < 16);
var acc = unfinished +% @as(u32, @intCast(byte_count)) +% @as(u32, @intCast(partial.len));
switch (partial.len) {
inline 0, 1, 2, 3 => |count| {
inline for (0..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 4, 5, 6, 7 => |count| {
acc = finalize4(acc, partial[0..4]);
inline for (4..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 8, 9, 10, 11 => |count| {
acc = finalize4(acc, partial[0..4]);
acc = finalize4(acc, partial[4..8]);
inline for (8..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
inline 12, 13, 14, 15 => |count| {
acc = finalize4(acc, partial[0..4]);
acc = finalize4(acc, partial[4..8]);
acc = finalize4(acc, partial[8..12]);
inline for (12..count) |i| acc = finalize1(acc, partial[i]);
return avalanche(acc);
},
else => unreachable,
}
return avalanche(acc);
}
fn finalize4(v: u32, bytes: *const [4]u8) u32 {
var acc = v;
const lane = mem.readIntLittle(u32, bytes);
acc +%= lane *% prime_3;
acc = rotl(u32, acc, 17) *% prime_4;
return acc;
}
fn finalize1(v: u32, byte: u8) u32 {
var acc = v;
const lane = @as(u32, byte);
acc +%= lane *% prime_5;
acc = rotl(u32, acc, 11) *% prime_1;
return acc;
}
fn avalanche(value: u32) u32 {
var acc = value ^ value >> 15;
acc *%= prime_2;
acc ^= acc >> 13;
acc *%= prime_3;
acc ^= acc >> 16;
return acc;
}
pub fn hash(seed: u32, input: anytype) u32 {
validateType(@TypeOf(input));
if (input.len < 16) {
return finalize(seed +% prime_5, 0, input);
} else {
var hasher = Accumulator.init(seed);
const i = hasher.updateEmpty(input, 0);
return finalize(hasher.merge(), i, input[i..]);
}
}
};
fn validateType(comptime T: type) void {
comptime {
if (!((std.meta.trait.isSlice(T) or
std.meta.trait.is(.Array)(T) or
std.meta.trait.isPtrTo(.Array)(T)) and
std.meta.Elem(T) == u8))
{
@compileError("expect a slice, array or pointer to array of u8, got " ++ @typeName(T));
}
}
}
fn testExpect(comptime H: type, seed: anytype, input: []const u8, expected: u64) !void {
try expectEqual(expected, H.hash(0, input));
var hasher = H.init(seed);
hasher.update(input);
try expectEqual(expected, hasher.final());
}
test "xxhash64" {
const H = XxHash64;
try testExpect(H, 0, "", 0xef46db3751d8e999);
try testExpect(H, 0, "a", 0xd24ec4f1a98c6e5b);
try testExpect(H, 0, "abc", 0x44bc2cf5ad770999);
try testExpect(H, 0, "message digest", 0x066ed728fceeb3be);
try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0xcfe1f278fa89835c);
try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0xaaa46907d3047814);
try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0xe04a477f19ee145d);
}
test "xxhash32" {
const H = XxHash32;
try testExpect(H, 0, "", 0x02cc5d05);
try testExpect(H, 0, "a", 0x550d7456);
try testExpect(H, 0, "abc", 0x32d153ff);
try testExpect(H, 0, "message digest", 0x7c948494);
try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0x63a14d5f);
try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0x9c285e64);
try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0x9c05f475);
}