diff --git a/std/hash.zig b/std/hash.zig index 723860da3b..e246fd0ad3 100644 --- a/std/hash.zig +++ b/std/hash.zig @@ -17,6 +17,7 @@ pub const SipHash128 = siphash.SipHash128; pub const murmur = @import("hash/murmur.zig"); pub const Murmur2_32 = murmur.Murmur2_32; + pub const Murmur2_64 = murmur.Murmur2_64; pub const Murmur3_32 = murmur.Murmur3_32; @@ -24,7 +25,8 @@ pub const cityhash = @import("hash/cityhash.zig"); pub const CityHash32 = cityhash.CityHash32; pub const CityHash64 = cityhash.CityHash64; -pub const wyhash = @import("hash/wyhash.zig").hash; +const wyhash = @import("hash/wyhash.zig"); +pub const Wyhash = wyhash.Wyhash; test "hash" { _ = @import("hash/adler.zig"); diff --git a/std/hash/wyhash.zig b/std/hash/wyhash.zig index 57efe8fd63..49119c5a95 100644 --- a/std/hash/wyhash.zig +++ b/std/hash/wyhash.zig @@ -10,7 +10,7 @@ const primes = [_]u64{ }; fn read_bytes(comptime bytes: u8, data: []const u8) u64 { - return mem.readVarInt(u64, data[0..bytes], @import("builtin").endian); + return mem.readVarInt(u64, data[0..bytes], .Little); } fn read_8bytes_swapped(data: []const u8) u64 { @@ -18,7 +18,7 @@ fn read_8bytes_swapped(data: []const u8) u64 { } fn mum(a: u64, b: u64) u64 { - var r: u128 = @intCast(u128, a) * @intCast(u128, b); + var r = std.math.mulWide(u64, a, b); r = (r >> 64) ^ r; return @truncate(u64, r); } @@ -31,69 +31,117 @@ fn mix1(a: u64, b: u64, seed: u64) u64 { return mum(a ^ seed ^ primes[2], b ^ seed ^ primes[3]); } -pub fn hash(key: []const u8, initial_seed: u64) u64 { - var seed = initial_seed; +pub const Wyhash = struct { + seed: u64, - var i: usize = 0; - while (i + 32 <= key.len) : (i += 32) { - seed = mix0( - read_bytes(8, key[i..]), - read_bytes(8, key[i + 8 ..]), - seed, + buf: [32]u8, + buf_len: usize, + msg_len: usize, + + pub fn init(seed: u64) Wyhash { + return Wyhash{ + .seed = seed, + .buf = undefined, + .buf_len = 0, + .msg_len = 0, + }; + } + + fn round(self: *Wyhash, b: []const u8) void { + std.debug.assert(b.len == 32); + + self.seed = mix0( + read_bytes(8, b[0..]), + read_bytes(8, b[8..]), + self.seed, ) ^ mix1( - read_bytes(8, key[i + 16 ..]), - read_bytes(8, key[i + 24 ..]), - seed, + read_bytes(8, b[16..]), + read_bytes(8, b[24..]), + self.seed, ); } - const rem_len = @truncate(u5, key.len); - const rem_key = key[i..]; - seed = switch (rem_len) { - 0 => seed, - 1 => mix0(read_bytes(1, rem_key), primes[4], seed), - 2 => mix0(read_bytes(2, rem_key), primes[4], seed), - 3 => mix0((read_bytes(2, rem_key) << 8) | read_bytes(1, rem_key[2..]), primes[4], seed), - 4 => mix0(read_bytes(4, rem_key), primes[4], seed), - 5 => mix0((read_bytes(4, rem_key) << 8) | read_bytes(1, rem_key[4..]), primes[4], seed), - 6 => mix0((read_bytes(4, rem_key) << 16) | read_bytes(2, rem_key[4..]), primes[4], seed), - 7 => mix0((read_bytes(4, rem_key) << 24) | (read_bytes(2, rem_key[4..]) << 8) | read_bytes(1, rem_key[6..]), primes[4], seed), - 8 => mix0(read_8bytes_swapped(rem_key), primes[4], seed), - 9 => mix0(read_8bytes_swapped(rem_key), read_bytes(1, rem_key[8..]), seed), - 10 => mix0(read_8bytes_swapped(rem_key), read_bytes(2, rem_key[8..]), seed), - 11 => mix0(read_8bytes_swapped(rem_key), (read_bytes(2, rem_key[8..]) << 8) | read_bytes(1, rem_key[10..]), seed), - 12 => mix0(read_8bytes_swapped(rem_key), read_bytes(4, rem_key[8..]), seed), - 13 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 8) | read_bytes(1, rem_key[12..]), seed), - 14 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 16) | read_bytes(2, rem_key[12..]), seed), - 15 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 24) | (read_bytes(2, rem_key[12..]) << 8) | read_bytes(1, rem_key[14..]), seed), - 16 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed), - 17 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(1, rem_key[16..]), primes[4], seed), - 18 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(2, rem_key[16..]), primes[4], seed), - 19 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(2, rem_key[16..]) << 8) | read_bytes(1, rem_key[18..]), primes[4], seed), - 20 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(4, rem_key[16..]), primes[4], seed), - 21 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 8) | read_bytes(1, rem_key[20..]), primes[4], seed), - 22 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 16) | read_bytes(2, rem_key[20..]), primes[4], seed), - 23 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 24) | (read_bytes(2, rem_key[20..]) << 8) | read_bytes(1, rem_key[22..]), primes[4], seed), - 24 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), primes[4], seed), - 25 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(1, rem_key[24..]), seed), - 26 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(2, rem_key[24..]), seed), - 27 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(2, rem_key[24..]) << 8) | read_bytes(1, rem_key[26..]), seed), - 28 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(4, rem_key[24..]), seed), - 29 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 8) | read_bytes(1, rem_key[28..]), seed), - 30 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 16) | read_bytes(2, rem_key[28..]), seed), - 31 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 24) | (read_bytes(2, rem_key[28..]) << 8) | read_bytes(1, rem_key[30..]), seed), - }; + pub fn update(self: *Wyhash, b: []const u8) void { + var off: usize = 0; - return mum(seed ^ key.len, primes[4]); -} + // Partial from previous. + if (self.buf_len != 0 and self.buf_len + b.len > 32) { + off += 32 - self.buf_len; + mem.copy(u8, self.buf[self.buf_len..], b[0..off]); + self.round(self.buf[0..]); + self.buf_len = 0; + } + + // Full middle blocks. + while (off + 32 <= b.len) : (off += 32) { + @inlineCall(self.round, b[off .. off + 32]); + } + + // Remainder for next pass. + mem.copy(u8, self.buf[self.buf_len..], b[off..]); + self.buf_len += @intCast(u8, b[off..].len); + self.msg_len += b.len; + } + + pub fn final(self: *Wyhash) u64 { + const seed = self.seed; + const rem_len = @intCast(u5, self.buf_len); + const rem_key = self.buf[0..self.buf_len]; + + self.seed = switch (rem_len) { + 0 => seed, + 1 => mix0(read_bytes(1, rem_key), primes[4], seed), + 2 => mix0(read_bytes(2, rem_key), primes[4], seed), + 3 => mix0((read_bytes(2, rem_key) << 8) | read_bytes(1, rem_key[2..]), primes[4], seed), + 4 => mix0(read_bytes(4, rem_key), primes[4], seed), + 5 => mix0((read_bytes(4, rem_key) << 8) | read_bytes(1, rem_key[4..]), primes[4], seed), + 6 => mix0((read_bytes(4, rem_key) << 16) | read_bytes(2, rem_key[4..]), primes[4], seed), + 7 => mix0((read_bytes(4, rem_key) << 24) | (read_bytes(2, rem_key[4..]) << 8) | read_bytes(1, rem_key[6..]), primes[4], seed), + 8 => mix0(read_8bytes_swapped(rem_key), primes[4], seed), + 9 => mix0(read_8bytes_swapped(rem_key), read_bytes(1, rem_key[8..]), seed), + 10 => mix0(read_8bytes_swapped(rem_key), read_bytes(2, rem_key[8..]), seed), + 11 => mix0(read_8bytes_swapped(rem_key), (read_bytes(2, rem_key[8..]) << 8) | read_bytes(1, rem_key[10..]), seed), + 12 => mix0(read_8bytes_swapped(rem_key), read_bytes(4, rem_key[8..]), seed), + 13 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 8) | read_bytes(1, rem_key[12..]), seed), + 14 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 16) | read_bytes(2, rem_key[12..]), seed), + 15 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 24) | (read_bytes(2, rem_key[12..]) << 8) | read_bytes(1, rem_key[14..]), seed), + 16 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed), + 17 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(1, rem_key[16..]), primes[4], seed), + 18 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(2, rem_key[16..]), primes[4], seed), + 19 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(2, rem_key[16..]) << 8) | read_bytes(1, rem_key[18..]), primes[4], seed), + 20 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(4, rem_key[16..]), primes[4], seed), + 21 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 8) | read_bytes(1, rem_key[20..]), primes[4], seed), + 22 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 16) | read_bytes(2, rem_key[20..]), primes[4], seed), + 23 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 24) | (read_bytes(2, rem_key[20..]) << 8) | read_bytes(1, rem_key[22..]), primes[4], seed), + 24 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), primes[4], seed), + 25 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(1, rem_key[24..]), seed), + 26 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(2, rem_key[24..]), seed), + 27 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(2, rem_key[24..]) << 8) | read_bytes(1, rem_key[26..]), seed), + 28 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(4, rem_key[24..]), seed), + 29 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 8) | read_bytes(1, rem_key[28..]), seed), + 30 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 16) | read_bytes(2, rem_key[28..]), seed), + 31 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 24) | (read_bytes(2, rem_key[28..]) << 8) | read_bytes(1, rem_key[30..]), seed), + }; + + return mum(self.seed ^ self.msg_len, primes[4]); + } + + pub fn hash(seed: u64, input: []const u8) u64 { + var c = Wyhash.init(seed); + c.update(input); + return c.final(); + } +}; test "test vectors" { const expectEqual = std.testing.expectEqual; - expectEqual(hash("", 0), 0x0); - expectEqual(hash("a", 1), 0xbed235177f41d328); - expectEqual(hash("abc", 2), 0xbe348debe59b27c3); - expectEqual(hash("message digest", 3), 0x37320f657213a290); - expectEqual(hash("abcdefghijklmnopqrstuvwxyz", 4), 0xd0b270e1d8a7019c); - expectEqual(hash("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 5), 0x602a1894d3bbfe7f); - expectEqual(hash("12345678901234567890123456789012345678901234567890123456789012345678901234567890", 6), 0x829e9c148b75970e); + const hash = Wyhash.hash; + + expectEqual(hash(0, ""), 0x0); + expectEqual(hash(1, "a"), 0xbed235177f41d328); + expectEqual(hash(2, "abc"), 0xbe348debe59b27c3); + expectEqual(hash(3, "message digest"), 0x37320f657213a290); + expectEqual(hash(4, "abcdefghijklmnopqrstuvwxyz"), 0xd0b270e1d8a7019c); + expectEqual(hash(5, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x602a1894d3bbfe7f); + expectEqual(hash(6, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x829e9c148b75970e); } diff --git a/std/hash_map.zig b/std/hash_map.zig index 6a8679ccd0..71cfecdd6d 100644 --- a/std/hash_map.zig +++ b/std/hash_map.zig @@ -5,7 +5,7 @@ const testing = std.testing; const math = std.math; const mem = std.mem; const meta = std.meta; -const wyhash = std.hash.wyhash; +const Wyhash = std.hash.Wyhash; const Allocator = mem.Allocator; const builtin = @import("builtin"); @@ -557,7 +557,7 @@ pub fn autoHash(key: var, seed: u64) u64 { builtin.TypeId.EnumLiteral, => @compileError("cannot hash this type"), - builtin.TypeId.Int => return wyhash(std.mem.asBytes(&key), seed), + builtin.TypeId.Int => return Wyhash.hash(seed, std.mem.asBytes(&key)), builtin.TypeId.Float => |info| return autoHash(@bitCast(@IntType(false, info.bits), key), seed), @@ -594,7 +594,7 @@ pub fn autoHash(key: var, seed: u64) u64 { // If there's no unused bits in the child type, we can just hash // this as an array of bytes. if (info.child.bit_count % 8 == 0) { - return wyhash(mem.asBytes(&key), seed); + return Wyhash.hash(seed, mem.asBytes(&key)); } // Otherwise, hash every element.