From 6150da3df99b41f89ea01a72e6c1b76fe4c36f89 Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Thu, 27 Jun 2019 23:21:35 +0200 Subject: [PATCH 01/10] direct port of wyhash v2 also inspired by https://github.com/ManDeJan/zig-wyhash --- std/hash.zig | 4 ++ std/hash/wyhash.zig | 99 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 std/hash/wyhash.zig diff --git a/std/hash.zig b/std/hash.zig index 148504aa39..723860da3b 100644 --- a/std/hash.zig +++ b/std/hash.zig @@ -16,6 +16,7 @@ pub const SipHash128 = siphash.SipHash128; pub const murmur = @import("hash/murmur.zig"); pub const Murmur2_32 = murmur.Murmur2_32; + pub const Murmur2_64 = murmur.Murmur2_64; pub const Murmur3_32 = murmur.Murmur3_32; @@ -23,6 +24,8 @@ pub const cityhash = @import("hash/cityhash.zig"); pub const CityHash32 = cityhash.CityHash32; pub const CityHash64 = cityhash.CityHash64; +pub const wyhash = @import("hash/wyhash.zig").hash; + test "hash" { _ = @import("hash/adler.zig"); _ = @import("hash/crc.zig"); @@ -30,4 +33,5 @@ test "hash" { _ = @import("hash/siphash.zig"); _ = @import("hash/murmur.zig"); _ = @import("hash/cityhash.zig"); + _ = @import("hash/wyhash.zig"); } diff --git a/std/hash/wyhash.zig b/std/hash/wyhash.zig new file mode 100644 index 0000000000..57efe8fd63 --- /dev/null +++ b/std/hash/wyhash.zig @@ -0,0 +1,99 @@ +const std = @import("std"); +const mem = std.mem; + +const primes = [_]u64{ + 0xa0761d6478bd642f, + 0xe7037ed1a0b428db, + 0x8ebc6af09c88c6e3, + 0x589965cc75374cc3, + 0x1d8e4e27c47d124f, +}; + +fn read_bytes(comptime bytes: u8, data: []const u8) u64 { + return mem.readVarInt(u64, data[0..bytes], @import("builtin").endian); +} + +fn read_8bytes_swapped(data: []const u8) u64 { + return (read_bytes(4, data) << 32 | read_bytes(4, data[4..])); +} + +fn mum(a: u64, b: u64) u64 { + var r: u128 = @intCast(u128, a) * @intCast(u128, b); + r = (r >> 64) ^ r; + return @truncate(u64, r); +} + +fn mix0(a: u64, b: u64, seed: u64) u64 { + return mum(a ^ seed ^ primes[0], b ^ seed ^ primes[1]); +} + +fn mix1(a: u64, b: u64, seed: u64) u64 { + return mum(a ^ seed ^ primes[2], b ^ seed ^ primes[3]); +} + +pub fn hash(key: []const u8, initial_seed: u64) u64 { + var seed = initial_seed; + + var i: usize = 0; + while (i + 32 <= key.len) : (i += 32) { + seed = mix0( + read_bytes(8, key[i..]), + read_bytes(8, key[i + 8 ..]), + seed, + ) ^ mix1( + read_bytes(8, key[i + 16 ..]), + read_bytes(8, key[i + 24 ..]), + seed, + ); + } + + const rem_len = @truncate(u5, key.len); + const rem_key = key[i..]; + seed = switch (rem_len) { + 0 => seed, + 1 => mix0(read_bytes(1, rem_key), primes[4], seed), + 2 => mix0(read_bytes(2, rem_key), primes[4], seed), + 3 => mix0((read_bytes(2, rem_key) << 8) | read_bytes(1, rem_key[2..]), primes[4], seed), + 4 => mix0(read_bytes(4, rem_key), primes[4], seed), + 5 => mix0((read_bytes(4, rem_key) << 8) | read_bytes(1, rem_key[4..]), primes[4], seed), + 6 => mix0((read_bytes(4, rem_key) << 16) | read_bytes(2, rem_key[4..]), primes[4], seed), + 7 => mix0((read_bytes(4, rem_key) << 24) | (read_bytes(2, rem_key[4..]) << 8) | read_bytes(1, rem_key[6..]), primes[4], seed), + 8 => mix0(read_8bytes_swapped(rem_key), primes[4], seed), + 9 => mix0(read_8bytes_swapped(rem_key), read_bytes(1, rem_key[8..]), seed), + 10 => mix0(read_8bytes_swapped(rem_key), read_bytes(2, rem_key[8..]), seed), + 11 => mix0(read_8bytes_swapped(rem_key), (read_bytes(2, rem_key[8..]) << 8) | read_bytes(1, rem_key[10..]), seed), + 12 => mix0(read_8bytes_swapped(rem_key), read_bytes(4, 
rem_key[8..]), seed), + 13 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 8) | read_bytes(1, rem_key[12..]), seed), + 14 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 16) | read_bytes(2, rem_key[12..]), seed), + 15 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 24) | (read_bytes(2, rem_key[12..]) << 8) | read_bytes(1, rem_key[14..]), seed), + 16 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed), + 17 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(1, rem_key[16..]), primes[4], seed), + 18 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(2, rem_key[16..]), primes[4], seed), + 19 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(2, rem_key[16..]) << 8) | read_bytes(1, rem_key[18..]), primes[4], seed), + 20 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(4, rem_key[16..]), primes[4], seed), + 21 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 8) | read_bytes(1, rem_key[20..]), primes[4], seed), + 22 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 16) | read_bytes(2, rem_key[20..]), primes[4], seed), + 23 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 24) | (read_bytes(2, rem_key[20..]) << 8) | read_bytes(1, rem_key[22..]), primes[4], seed), + 24 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), primes[4], seed), + 25 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(1, rem_key[24..]), seed), + 26 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(2, rem_key[24..]), seed), + 27 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(2, rem_key[24..]) << 8) | read_bytes(1, rem_key[26..]), seed), + 28 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(4, rem_key[24..]), seed), + 29 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 8) | read_bytes(1, rem_key[28..]), seed), + 30 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 16) | read_bytes(2, rem_key[28..]), seed), + 31 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 24) | (read_bytes(2, rem_key[28..]) << 8) | read_bytes(1, rem_key[30..]), seed), + }; + + return mum(seed ^ key.len, primes[4]); +} + +test "test vectors" { + const expectEqual = std.testing.expectEqual; + expectEqual(hash("", 0), 0x0); + expectEqual(hash("a", 1), 0xbed235177f41d328); + expectEqual(hash("abc", 2), 0xbe348debe59b27c3); + expectEqual(hash("message digest", 3), 0x37320f657213a290); + expectEqual(hash("abcdefghijklmnopqrstuvwxyz", 4), 0xd0b270e1d8a7019c); + 
expectEqual(hash("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 5), 0x602a1894d3bbfe7f); + expectEqual(hash("12345678901234567890123456789012345678901234567890123456789012345678901234567890", 6), 0x829e9c148b75970e); +} From 5bd407b27890fbed82891289e6a2bf2da93c2a41 Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Sun, 30 Jun 2019 11:35:57 +0200 Subject: [PATCH 02/10] use wyhash in std's hashmap, and improve autoHash to handle more types and behave more correctly --- std/hash_map.zig | 264 +++++++++++++++++++++++++++++++---------------- 1 file changed, 173 insertions(+), 91 deletions(-) diff --git a/std/hash_map.zig b/std/hash_map.zig index c99d1d2490..6a8679ccd0 100644 --- a/std/hash_map.zig +++ b/std/hash_map.zig @@ -4,6 +4,8 @@ const assert = debug.assert; const testing = std.testing; const math = std.math; const mem = std.mem; +const meta = std.meta; +const wyhash = std.hash.wyhash; const Allocator = mem.Allocator; const builtin = @import("builtin"); @@ -448,15 +450,17 @@ test "iterator hash map" { try reset_map.putNoClobber(2, 22); try reset_map.putNoClobber(3, 33); + // TODO this test depends on the hashing algorithm, because it assumes the + // order of the elements in the hashmap. This should not be the case. var keys = [_]i32{ + 1, 3, 2, - 1, }; var values = [_]i32{ + 11, 33, 22, - 11, }; var it = reset_map.iterator(); @@ -518,8 +522,8 @@ pub fn getTrivialEqlFn(comptime K: type) (fn (K, K) bool) { pub fn getAutoHashFn(comptime K: type) (fn (K) u32) { return struct { fn hash(key: K) u32 { - comptime var rng = comptime std.rand.DefaultPrng.init(0); - return autoHash(key, &rng.random, u32); + const h = autoHash(key, 0); + return @truncate(u32, h); } }.hash; } @@ -527,114 +531,192 @@ pub fn getAutoHashFn(comptime K: type) (fn (K) u32) { pub fn getAutoEqlFn(comptime K: type) (fn (K, K) bool) { return struct { fn eql(a: K, b: K) bool { - return autoEql(a, b); + return meta.eql(a, b); } }.eql; } -// TODO improve these hash functions -pub fn autoHash(key: var, comptime rng: *std.rand.Random, comptime HashInt: type) HashInt { - switch (@typeInfo(@typeOf(key))) { +/// Provides generic hashing for any eligible type. +/// Only hashes `key` itself, pointers are not followed. +/// The underlying hashing algorithm is wyhash. +pub fn autoHash(key: var, seed: u64) u64 { + // We use the fact that wyhash takes an input seed to "chain" hasing when the + // key has multiple parts that are not necessarily contiguous in memory. 
+ const Key = @typeOf(key); + switch (@typeInfo(Key)) { builtin.TypeId.NoReturn, builtin.TypeId.Opaque, builtin.TypeId.Undefined, builtin.TypeId.ArgTuple, + builtin.TypeId.Void, + builtin.TypeId.Null, + builtin.TypeId.BoundFn, + builtin.TypeId.ComptimeFloat, + builtin.TypeId.ComptimeInt, + builtin.TypeId.Type, + builtin.TypeId.EnumLiteral, => @compileError("cannot hash this type"), - builtin.TypeId.Void, - builtin.TypeId.Null, - => return 0, + builtin.TypeId.Int => return wyhash(std.mem.asBytes(&key), seed), - builtin.TypeId.Int => |info| { - const unsigned_x = @bitCast(@IntType(false, info.bits), key); - if (info.bits <= HashInt.bit_count) { - return HashInt(unsigned_x) ^ comptime rng.scalar(HashInt); - } else { - return @truncate(HashInt, unsigned_x ^ comptime rng.scalar(@typeOf(unsigned_x))); + builtin.TypeId.Float => |info| return autoHash(@bitCast(@IntType(false, info.bits), key), seed), + + builtin.TypeId.Bool => return autoHash(@boolToInt(key), seed), + builtin.TypeId.Enum => return autoHash(@enumToInt(key), seed), + builtin.TypeId.ErrorSet => return autoHash(@errorToInt(key), seed), + builtin.TypeId.Promise, builtin.TypeId.Fn => return autoHash(@ptrToInt(key), seed), + + builtin.TypeId.Pointer => |info| return switch (info.size) { + builtin.TypeInfo.Pointer.Size.One, + builtin.TypeInfo.Pointer.Size.Many, + builtin.TypeInfo.Pointer.Size.C, + => return autoHash(@ptrToInt(key), seed), + + builtin.TypeInfo.Pointer.Size.Slice => return autoHash(key.len, autoHash(key.ptr, seed)), + }, + + builtin.TypeId.Optional => return if (key) |k| autoHash(k, seed) else 0, + + builtin.TypeId.Array => { + // TODO detect via a trait when Key has no padding bits to + // hash it as an array of bytes. + // Otherwise, hash every element. + var s = seed; + for (key) |element| { + // We reuse the hash of the previous element as the seed for the + // next one so that they're dependant. + s = autoHash(element, s); } + return s; }, - builtin.TypeId.Float => |info| { - return autoHash(@bitCast(@IntType(false, info.bits), key), rng, HashInt); + builtin.TypeId.Vector => |info| { + // If there's no unused bits in the child type, we can just hash + // this as an array of bytes. + if (info.child.bit_count % 8 == 0) { + return wyhash(mem.asBytes(&key), seed); + } + + // Otherwise, hash every element. + var s = seed; + // TODO remove the copy to an array once field access is done. + const array: [info.len]info.child = key; + comptime var i: u32 = 0; + inline while (i < info.len) : (i += 1) { + s = autoHash(array[i], s); + } + return s; }, - builtin.TypeId.Bool => return autoHash(@boolToInt(key), rng, HashInt), - builtin.TypeId.Enum => return autoHash(@enumToInt(key), rng, HashInt), - builtin.TypeId.ErrorSet => return autoHash(@errorToInt(key), rng, HashInt), - builtin.TypeId.Promise, builtin.TypeId.Fn => return autoHash(@ptrToInt(key), rng, HashInt), - builtin.TypeId.BoundFn, - builtin.TypeId.ComptimeFloat, - builtin.TypeId.ComptimeInt, - builtin.TypeId.Type, - builtin.TypeId.EnumLiteral, - => return 0, + builtin.TypeId.Struct => |info| { + // TODO detect via a trait when Key has no padding bits to + // hash it as an array of bytes. + // Otherwise, hash every field. + var s = seed; + inline for (info.fields) |field| { + // We reuse the hash of the previous field as the seed for the + // next one so that they're dependant. 
+ s = autoHash(@field(key, field.name), s); + } + return s; + }, - builtin.TypeId.Pointer => |info| switch (info.size) { - builtin.TypeInfo.Pointer.Size.One => @compileError("TODO auto hash for single item pointers"), - builtin.TypeInfo.Pointer.Size.Many => @compileError("TODO auto hash for many item pointers"), - builtin.TypeInfo.Pointer.Size.C => @compileError("TODO auto hash C pointers"), - builtin.TypeInfo.Pointer.Size.Slice => { - const interval = std.math.max(1, key.len / 256); - var i: usize = 0; - var h = comptime rng.scalar(HashInt); - while (i < key.len) : (i += interval) { - h ^= autoHash(key[i], rng, HashInt); + builtin.TypeId.Union => |info| { + if (info.tag_type) |tag_type| { + const tag = meta.activeTag(key); + const s = autoHash(tag, seed); + inline for (info.fields) |field| { + const enum_field = field.enum_field.?; + if (enum_field.value == @enumToInt(tag)) { + return autoHash(@field(key, enum_field.name), s); + } } - return h; - }, + unreachable; + } else @compileError("cannot hash untagged union type: " ++ @typeName(Key) ++ ", provide your own hash function"); }, - builtin.TypeId.Optional => @compileError("TODO auto hash for optionals"), - builtin.TypeId.Array => @compileError("TODO auto hash for arrays"), - builtin.TypeId.Vector => @compileError("TODO auto hash for vectors"), - builtin.TypeId.Struct => @compileError("TODO auto hash for structs"), - builtin.TypeId.Union => @compileError("TODO auto hash for unions"), - builtin.TypeId.ErrorUnion => @compileError("TODO auto hash for unions"), + builtin.TypeId.ErrorUnion => { + return autoHash(key catch |err| return autoHash(err, seed), seed); + }, } } -pub fn autoEql(a: var, b: @typeOf(a)) bool { - switch (@typeInfo(@typeOf(a))) { - builtin.TypeId.NoReturn, - builtin.TypeId.Opaque, - builtin.TypeId.Undefined, - builtin.TypeId.ArgTuple, - => @compileError("cannot test equality of this type"), - builtin.TypeId.Void, - builtin.TypeId.Null, - => return true, - builtin.TypeId.Bool, - builtin.TypeId.Int, - builtin.TypeId.Float, - builtin.TypeId.ComptimeFloat, - builtin.TypeId.ComptimeInt, - builtin.TypeId.EnumLiteral, - builtin.TypeId.Promise, - builtin.TypeId.Enum, - builtin.TypeId.BoundFn, - builtin.TypeId.Fn, - builtin.TypeId.ErrorSet, - builtin.TypeId.Type, - => return a == b, - - builtin.TypeId.Pointer => |info| switch (info.size) { - builtin.TypeInfo.Pointer.Size.One => @compileError("TODO auto eql for single item pointers"), - builtin.TypeInfo.Pointer.Size.Many => @compileError("TODO auto eql for many item pointers"), - builtin.TypeInfo.Pointer.Size.C => @compileError("TODO auto eql for C pointers"), - builtin.TypeInfo.Pointer.Size.Slice => { - if (a.len != b.len) return false; - for (a) |a_item, i| { - if (!autoEql(a_item, b[i])) return false; - } - return true; - }, - }, - - builtin.TypeId.Optional => @compileError("TODO auto eql for optionals"), - builtin.TypeId.Array => @compileError("TODO auto eql for arrays"), - builtin.TypeId.Struct => @compileError("TODO auto eql for structs"), - builtin.TypeId.Union => @compileError("TODO auto eql for unions"), - builtin.TypeId.ErrorUnion => @compileError("TODO auto eql for unions"), - builtin.TypeId.Vector => @compileError("TODO auto eql for vectors"), - } +test "autoHash slice" { + const array1 = try std.heap.direct_allocator.create([6]u32); + defer std.heap.direct_allocator.destroy(array1); + array1.* = [_]u32{ 1, 2, 3, 4, 5, 6 }; + const array2 = [_]u32{ 1, 2, 3, 4, 5, 6 }; + const a = array1[0..]; + const b = array2[0..]; + const c = array1[0..3]; + 
testing.expect(autoHash(a, 0) == autoHash(a, 0)); + testing.expect(autoHash(a, 0) != autoHash(array1, 0)); + testing.expect(autoHash(a, 0) != autoHash(b, 0)); + testing.expect(autoHash(a, 0) != autoHash(c, 0)); +} + +test "autoHash optional" { + const a: ?u32 = 123; + const b: ?u32 = null; + testing.expectEqual(autoHash(a, 0), autoHash(u32(123), 0)); + testing.expect(autoHash(a, 0) != autoHash(b, 0)); + testing.expectEqual(autoHash(b, 0), 0); +} + +test "autoHash array" { + const a = [_]u32{ 1, 2, 3 }; + const h = autoHash(a, 0); + testing.expectEqual(h, autoHash(u32(3), autoHash(u32(2), autoHash(u32(1), 0)))); +} + +test "autoHash struct" { + const Foo = struct { + a: u32 = 1, + b: u32 = 2, + c: u32 = 3, + }; + const f = Foo{}; + const h = autoHash(f, 0); + testing.expectEqual(h, autoHash(u32(3), autoHash(u32(2), autoHash(u32(1), 0)))); +} + +test "autoHash union" { + const Foo = union(enum) { + A: u32, + B: f32, + C: u32, + }; + + const a = Foo{ .A = 18 }; + var b = Foo{ .B = 12.34 }; + const c = Foo{ .C = 18 }; + testing.expect(autoHash(a, 0) == autoHash(a, 0)); + testing.expect(autoHash(a, 0) != autoHash(b, 0)); + testing.expect(autoHash(a, 0) != autoHash(c, 0)); + + b = Foo{ .A = 18 }; + testing.expect(autoHash(a, 0) == autoHash(b, 0)); +} + +test "autoHash vector" { + const a: @Vector(4, u32) = [_]u32{ 1, 2, 3, 4 }; + const b: @Vector(4, u32) = [_]u32{ 1, 2, 3, 5 }; + const c: @Vector(4, u31) = [_]u31{ 1, 2, 3, 4 }; + testing.expect(autoHash(a, 0) == autoHash(a, 0)); + testing.expect(autoHash(a, 0) != autoHash(b, 0)); + testing.expect(autoHash(a, 0) != autoHash(c, 0)); +} + +test "autoHash error union" { + const Errors = error{Test}; + const Foo = struct { + a: u32 = 1, + b: u32 = 2, + c: u32 = 3, + }; + const f = Foo{}; + const g: Errors!Foo = Errors.Test; + testing.expect(autoHash(f, 0) != autoHash(g, 0)); + testing.expect(autoHash(f, 0) == autoHash(Foo{}, 0)); + testing.expect(autoHash(g, 0) == autoHash(Errors.Test, 0)); } From c9ce43f59fc777055612aeea58db0849390bc204 Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Sun, 30 Jun 2019 20:46:43 +0200 Subject: [PATCH 03/10] fix hashmap using strings as keys --- std/http/headers.zig | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/std/http/headers.zig b/std/http/headers.zig index 69ed494f3a..7eb7fcc2c2 100644 --- a/std/http/headers.zig +++ b/std/http/headers.zig @@ -102,9 +102,19 @@ test "HeaderEntry" { testing.expectEqualSlices(u8, "x", e.value); } +fn stringEql(a: []const u8, b: []const u8) bool { + if (a.len != b.len) return false; + if (a.ptr == b.ptr) return true; + return mem.compare(u8, a, b) == .Equal; +} + +fn stringHash(s: []const u8) u32 { + return @truncate(u32, std.hash.wyhash(s, 0)); +} + const HeaderList = std.ArrayList(HeaderEntry); const HeaderIndexList = std.ArrayList(usize); -const HeaderIndex = std.AutoHashMap([]const u8, HeaderIndexList); +const HeaderIndex = std.HashMap([]const u8, HeaderIndexList, stringHash, stringEql); pub const Headers = struct { // the owned header field name is stored in the index as part of the key From 83dffc70afe4956c56f570ce5c854b17cbd6f218 Mon Sep 17 00:00:00 2001 From: Marc Tiehuis Date: Mon, 1 Jul 2019 23:23:26 +1200 Subject: [PATCH 04/10] Add iterative wyhash api --- std/hash.zig | 4 +- std/hash/wyhash.zig | 164 ++++++++++++++++++++++++++++---------------- std/hash_map.zig | 6 +- 3 files changed, 112 insertions(+), 62 deletions(-) diff --git a/std/hash.zig b/std/hash.zig index 723860da3b..e246fd0ad3 100644 --- a/std/hash.zig +++ b/std/hash.zig @@ -17,6 
+17,7 @@ pub const SipHash128 = siphash.SipHash128; pub const murmur = @import("hash/murmur.zig"); pub const Murmur2_32 = murmur.Murmur2_32; + pub const Murmur2_64 = murmur.Murmur2_64; pub const Murmur3_32 = murmur.Murmur3_32; @@ -24,7 +25,8 @@ pub const cityhash = @import("hash/cityhash.zig"); pub const CityHash32 = cityhash.CityHash32; pub const CityHash64 = cityhash.CityHash64; -pub const wyhash = @import("hash/wyhash.zig").hash; +const wyhash = @import("hash/wyhash.zig"); +pub const Wyhash = wyhash.Wyhash; test "hash" { _ = @import("hash/adler.zig"); diff --git a/std/hash/wyhash.zig b/std/hash/wyhash.zig index 57efe8fd63..49119c5a95 100644 --- a/std/hash/wyhash.zig +++ b/std/hash/wyhash.zig @@ -10,7 +10,7 @@ const primes = [_]u64{ }; fn read_bytes(comptime bytes: u8, data: []const u8) u64 { - return mem.readVarInt(u64, data[0..bytes], @import("builtin").endian); + return mem.readVarInt(u64, data[0..bytes], .Little); } fn read_8bytes_swapped(data: []const u8) u64 { @@ -18,7 +18,7 @@ fn read_8bytes_swapped(data: []const u8) u64 { } fn mum(a: u64, b: u64) u64 { - var r: u128 = @intCast(u128, a) * @intCast(u128, b); + var r = std.math.mulWide(u64, a, b); r = (r >> 64) ^ r; return @truncate(u64, r); } @@ -31,69 +31,117 @@ fn mix1(a: u64, b: u64, seed: u64) u64 { return mum(a ^ seed ^ primes[2], b ^ seed ^ primes[3]); } -pub fn hash(key: []const u8, initial_seed: u64) u64 { - var seed = initial_seed; +pub const Wyhash = struct { + seed: u64, - var i: usize = 0; - while (i + 32 <= key.len) : (i += 32) { - seed = mix0( - read_bytes(8, key[i..]), - read_bytes(8, key[i + 8 ..]), - seed, + buf: [32]u8, + buf_len: usize, + msg_len: usize, + + pub fn init(seed: u64) Wyhash { + return Wyhash{ + .seed = seed, + .buf = undefined, + .buf_len = 0, + .msg_len = 0, + }; + } + + fn round(self: *Wyhash, b: []const u8) void { + std.debug.assert(b.len == 32); + + self.seed = mix0( + read_bytes(8, b[0..]), + read_bytes(8, b[8..]), + self.seed, ) ^ mix1( - read_bytes(8, key[i + 16 ..]), - read_bytes(8, key[i + 24 ..]), - seed, + read_bytes(8, b[16..]), + read_bytes(8, b[24..]), + self.seed, ); } - const rem_len = @truncate(u5, key.len); - const rem_key = key[i..]; - seed = switch (rem_len) { - 0 => seed, - 1 => mix0(read_bytes(1, rem_key), primes[4], seed), - 2 => mix0(read_bytes(2, rem_key), primes[4], seed), - 3 => mix0((read_bytes(2, rem_key) << 8) | read_bytes(1, rem_key[2..]), primes[4], seed), - 4 => mix0(read_bytes(4, rem_key), primes[4], seed), - 5 => mix0((read_bytes(4, rem_key) << 8) | read_bytes(1, rem_key[4..]), primes[4], seed), - 6 => mix0((read_bytes(4, rem_key) << 16) | read_bytes(2, rem_key[4..]), primes[4], seed), - 7 => mix0((read_bytes(4, rem_key) << 24) | (read_bytes(2, rem_key[4..]) << 8) | read_bytes(1, rem_key[6..]), primes[4], seed), - 8 => mix0(read_8bytes_swapped(rem_key), primes[4], seed), - 9 => mix0(read_8bytes_swapped(rem_key), read_bytes(1, rem_key[8..]), seed), - 10 => mix0(read_8bytes_swapped(rem_key), read_bytes(2, rem_key[8..]), seed), - 11 => mix0(read_8bytes_swapped(rem_key), (read_bytes(2, rem_key[8..]) << 8) | read_bytes(1, rem_key[10..]), seed), - 12 => mix0(read_8bytes_swapped(rem_key), read_bytes(4, rem_key[8..]), seed), - 13 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 8) | read_bytes(1, rem_key[12..]), seed), - 14 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 16) | read_bytes(2, rem_key[12..]), seed), - 15 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 24) | (read_bytes(2, rem_key[12..]) << 8) | 
read_bytes(1, rem_key[14..]), seed), - 16 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed), - 17 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(1, rem_key[16..]), primes[4], seed), - 18 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(2, rem_key[16..]), primes[4], seed), - 19 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(2, rem_key[16..]) << 8) | read_bytes(1, rem_key[18..]), primes[4], seed), - 20 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(4, rem_key[16..]), primes[4], seed), - 21 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 8) | read_bytes(1, rem_key[20..]), primes[4], seed), - 22 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 16) | read_bytes(2, rem_key[20..]), primes[4], seed), - 23 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 24) | (read_bytes(2, rem_key[20..]) << 8) | read_bytes(1, rem_key[22..]), primes[4], seed), - 24 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), primes[4], seed), - 25 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(1, rem_key[24..]), seed), - 26 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(2, rem_key[24..]), seed), - 27 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(2, rem_key[24..]) << 8) | read_bytes(1, rem_key[26..]), seed), - 28 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(4, rem_key[24..]), seed), - 29 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 8) | read_bytes(1, rem_key[28..]), seed), - 30 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 16) | read_bytes(2, rem_key[28..]), seed), - 31 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 24) | (read_bytes(2, rem_key[28..]) << 8) | read_bytes(1, rem_key[30..]), seed), - }; + pub fn update(self: *Wyhash, b: []const u8) void { + var off: usize = 0; - return mum(seed ^ key.len, primes[4]); -} + // Partial from previous. + if (self.buf_len != 0 and self.buf_len + b.len > 32) { + off += 32 - self.buf_len; + mem.copy(u8, self.buf[self.buf_len..], b[0..off]); + self.round(self.buf[0..]); + self.buf_len = 0; + } + + // Full middle blocks. + while (off + 32 <= b.len) : (off += 32) { + @inlineCall(self.round, b[off .. off + 32]); + } + + // Remainder for next pass. 
+ mem.copy(u8, self.buf[self.buf_len..], b[off..]); + self.buf_len += @intCast(u8, b[off..].len); + self.msg_len += b.len; + } + + pub fn final(self: *Wyhash) u64 { + const seed = self.seed; + const rem_len = @intCast(u5, self.buf_len); + const rem_key = self.buf[0..self.buf_len]; + + self.seed = switch (rem_len) { + 0 => seed, + 1 => mix0(read_bytes(1, rem_key), primes[4], seed), + 2 => mix0(read_bytes(2, rem_key), primes[4], seed), + 3 => mix0((read_bytes(2, rem_key) << 8) | read_bytes(1, rem_key[2..]), primes[4], seed), + 4 => mix0(read_bytes(4, rem_key), primes[4], seed), + 5 => mix0((read_bytes(4, rem_key) << 8) | read_bytes(1, rem_key[4..]), primes[4], seed), + 6 => mix0((read_bytes(4, rem_key) << 16) | read_bytes(2, rem_key[4..]), primes[4], seed), + 7 => mix0((read_bytes(4, rem_key) << 24) | (read_bytes(2, rem_key[4..]) << 8) | read_bytes(1, rem_key[6..]), primes[4], seed), + 8 => mix0(read_8bytes_swapped(rem_key), primes[4], seed), + 9 => mix0(read_8bytes_swapped(rem_key), read_bytes(1, rem_key[8..]), seed), + 10 => mix0(read_8bytes_swapped(rem_key), read_bytes(2, rem_key[8..]), seed), + 11 => mix0(read_8bytes_swapped(rem_key), (read_bytes(2, rem_key[8..]) << 8) | read_bytes(1, rem_key[10..]), seed), + 12 => mix0(read_8bytes_swapped(rem_key), read_bytes(4, rem_key[8..]), seed), + 13 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 8) | read_bytes(1, rem_key[12..]), seed), + 14 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 16) | read_bytes(2, rem_key[12..]), seed), + 15 => mix0(read_8bytes_swapped(rem_key), (read_bytes(4, rem_key[8..]) << 24) | (read_bytes(2, rem_key[12..]) << 8) | read_bytes(1, rem_key[14..]), seed), + 16 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed), + 17 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(1, rem_key[16..]), primes[4], seed), + 18 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(2, rem_key[16..]), primes[4], seed), + 19 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(2, rem_key[16..]) << 8) | read_bytes(1, rem_key[18..]), primes[4], seed), + 20 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_bytes(4, rem_key[16..]), primes[4], seed), + 21 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 8) | read_bytes(1, rem_key[20..]), primes[4], seed), + 22 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 16) | read_bytes(2, rem_key[20..]), primes[4], seed), + 23 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1((read_bytes(4, rem_key[16..]) << 24) | (read_bytes(2, rem_key[20..]) << 8) | read_bytes(1, rem_key[22..]), primes[4], seed), + 24 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), primes[4], seed), + 25 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(1, rem_key[24..]), seed), + 26 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(2, rem_key[24..]), seed), + 27 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), 
(read_bytes(2, rem_key[24..]) << 8) | read_bytes(1, rem_key[26..]), seed), + 28 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), read_bytes(4, rem_key[24..]), seed), + 29 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 8) | read_bytes(1, rem_key[28..]), seed), + 30 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 16) | read_bytes(2, rem_key[28..]), seed), + 31 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 24) | (read_bytes(2, rem_key[28..]) << 8) | read_bytes(1, rem_key[30..]), seed), + }; + + return mum(self.seed ^ self.msg_len, primes[4]); + } + + pub fn hash(seed: u64, input: []const u8) u64 { + var c = Wyhash.init(seed); + c.update(input); + return c.final(); + } +}; test "test vectors" { const expectEqual = std.testing.expectEqual; - expectEqual(hash("", 0), 0x0); - expectEqual(hash("a", 1), 0xbed235177f41d328); - expectEqual(hash("abc", 2), 0xbe348debe59b27c3); - expectEqual(hash("message digest", 3), 0x37320f657213a290); - expectEqual(hash("abcdefghijklmnopqrstuvwxyz", 4), 0xd0b270e1d8a7019c); - expectEqual(hash("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 5), 0x602a1894d3bbfe7f); - expectEqual(hash("12345678901234567890123456789012345678901234567890123456789012345678901234567890", 6), 0x829e9c148b75970e); + const hash = Wyhash.hash; + + expectEqual(hash(0, ""), 0x0); + expectEqual(hash(1, "a"), 0xbed235177f41d328); + expectEqual(hash(2, "abc"), 0xbe348debe59b27c3); + expectEqual(hash(3, "message digest"), 0x37320f657213a290); + expectEqual(hash(4, "abcdefghijklmnopqrstuvwxyz"), 0xd0b270e1d8a7019c); + expectEqual(hash(5, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x602a1894d3bbfe7f); + expectEqual(hash(6, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x829e9c148b75970e); } diff --git a/std/hash_map.zig b/std/hash_map.zig index 6a8679ccd0..71cfecdd6d 100644 --- a/std/hash_map.zig +++ b/std/hash_map.zig @@ -5,7 +5,7 @@ const testing = std.testing; const math = std.math; const mem = std.mem; const meta = std.meta; -const wyhash = std.hash.wyhash; +const Wyhash = std.hash.Wyhash; const Allocator = mem.Allocator; const builtin = @import("builtin"); @@ -557,7 +557,7 @@ pub fn autoHash(key: var, seed: u64) u64 { builtin.TypeId.EnumLiteral, => @compileError("cannot hash this type"), - builtin.TypeId.Int => return wyhash(std.mem.asBytes(&key), seed), + builtin.TypeId.Int => return Wyhash.hash(seed, std.mem.asBytes(&key)), builtin.TypeId.Float => |info| return autoHash(@bitCast(@IntType(false, info.bits), key), seed), @@ -594,7 +594,7 @@ pub fn autoHash(key: var, seed: u64) u64 { // If there's no unused bits in the child type, we can just hash // this as an array of bytes. if (info.child.bit_count % 8 == 0) { - return wyhash(mem.asBytes(&key), seed); + return Wyhash.hash(seed, mem.asBytes(&key)); } // Otherwise, hash every element. 
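
For illustration (not part of the patch series itself), a minimal sketch of how the iterative interface introduced in this patch is meant to be used, written in the same 2019-era Zig as the series. The seed/value pair is taken from the test vectors above; the test name is invented for the example.

const std = @import("std");
const Wyhash = std.hash.Wyhash;

test "wyhash iterative interface sketch" {
    // One-shot helper: seed first, then the input bytes.
    std.testing.expectEqual(Wyhash.hash(2, "abc"), 0xbe348debe59b27c3);

    // Streaming interface: init with a seed, feed bytes, then finalize.
    var hasher = Wyhash.init(2);
    hasher.update("abc");
    std.testing.expectEqual(hasher.final(), 0xbe348debe59b27c3);
}

Feeding the same bytes through update before final should match the one-shot helper, since hash is itself implemented as init/update/final.
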
From 4c93ccab5ad48ce61e4136c646b3123a06150083 Mon Sep 17 00:00:00 2001 From: Marc Tiehuis Date: Mon, 1 Jul 2019 23:23:40 +1200 Subject: [PATCH 05/10] Add throughput test program for hash functions --- std/hash/throughput_test.zig | 148 +++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 std/hash/throughput_test.zig diff --git a/std/hash/throughput_test.zig b/std/hash/throughput_test.zig new file mode 100644 index 0000000000..4b7e8ef344 --- /dev/null +++ b/std/hash/throughput_test.zig @@ -0,0 +1,148 @@ +const builtin = @import("builtin"); +const std = @import("std"); +const time = std.time; +const Timer = time.Timer; +const hash = std.hash; + +const KiB = 1024; +const MiB = 1024 * KiB; +const GiB = 1024 * MiB; + +var prng = std.rand.DefaultPrng.init(0); + +const Hash = struct { + ty: type, + name: []const u8, + init_u8s: ?[]const u8 = null, + init_u64: ?u64 = null, +}; + +const siphash_key = "0123456789abcdef"; + +const hashes = [_]Hash{ + Hash{ .ty = hash.Wyhash, .name = "wyhash", .init_u64 = 0 }, + Hash{ .ty = hash.SipHash64(1, 3), .name = "siphash(1,3)", .init_u8s = siphash_key }, + Hash{ .ty = hash.SipHash64(2, 4), .name = "siphash(2,4)", .init_u8s = siphash_key }, + Hash{ .ty = hash.Fnv1a_64, .name = "fnv1a" }, + Hash{ .ty = hash.Crc32, .name = "crc32" }, +}; + +const Result = struct { + hash: u64, + throughput: u64, +}; + +pub fn benchmarkHash(comptime H: var, bytes: usize) !Result { + var h = blk: { + if (H.init_u8s) |init| { + break :blk H.ty.init(init); + } + if (H.init_u64) |init| { + break :blk H.ty.init(init); + } + break :blk H.ty.init(); + }; + + var block: [8192]u8 = undefined; + prng.random.bytes(block[0..]); + + var offset: usize = 0; + var timer = try Timer.start(); + const start = timer.lap(); + while (offset < bytes) : (offset += block.len) { + h.update(block[0..]); + } + const end = timer.read(); + + const elapsed_s = @intToFloat(f64, end - start) / time.ns_per_s; + const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s); + + return Result{ + .hash = h.final(), + .throughput = throughput, + }; +} + +fn usage() void { + std.debug.warn( + \\throughput_test [options] + \\ + \\Options: + \\ --filter [test-name] + \\ --seed [int] + \\ --count [int] + \\ --help + \\ + ); +} + +fn mode(comptime x: comptime_int) comptime_int { + return if (builtin.mode == builtin.Mode.Debug) x / 64 else x; +} + +// TODO(#1358): Replace with builtin formatted padding when available. 
+fn printPad(stdout: var, s: []const u8) !void { + var i: usize = 0; + while (i < 12 - s.len) : (i += 1) { + try stdout.print(" "); + } + try stdout.print("{}", s); +} + +pub fn main() !void { + var stdout_file = try std.io.getStdOut(); + var stdout_out_stream = stdout_file.outStream(); + const stdout = &stdout_out_stream.stream; + + var buffer: [1024]u8 = undefined; + var fixed = std.heap.FixedBufferAllocator.init(buffer[0..]); + const args = try std.process.argsAlloc(&fixed.allocator); + + var filter: ?[]u8 = ""; + var count: usize = mode(128 * MiB); + + var i: usize = 1; + while (i < args.len) : (i += 1) { + if (std.mem.eql(u8, args[i], "--seed")) { + i += 1; + if (i == args.len) { + usage(); + std.os.exit(1); + } + + const seed = try std.fmt.parseUnsigned(u32, args[i], 10); + prng.seed(seed); + } else if (std.mem.eql(u8, args[i], "--filter")) { + i += 1; + if (i == args.len) { + usage(); + std.os.exit(1); + } + + filter = args[i]; + } else if (std.mem.eql(u8, args[i], "--count")) { + i += 1; + if (i == args.len) { + usage(); + std.os.exit(1); + } + + const c = try std.fmt.parseUnsigned(u32, args[i], 10); + count = c * MiB; + } else if (std.mem.eql(u8, args[i], "--help")) { + usage(); + return; + } else { + usage(); + std.os.exit(1); + } + } + + inline for (hashes) |H| { + if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) { + const result = try benchmarkHash(H, count); + try printPad(stdout, H.name); + try stdout.print(": {:4} MiB/s [{:16}]\n", result.throughput / (1 * MiB), result.hash); + } + } +} From 8805a7b50985fca23969beab8636fbfbecd857ee Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Tue, 2 Jul 2019 18:38:46 +0200 Subject: [PATCH 06/10] adapt http/headers.zig to wyhash's new interface --- std/http/headers.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/std/http/headers.zig b/std/http/headers.zig index 7eb7fcc2c2..c588f2d055 100644 --- a/std/http/headers.zig +++ b/std/http/headers.zig @@ -109,7 +109,7 @@ fn stringEql(a: []const u8, b: []const u8) bool { } fn stringHash(s: []const u8) u32 { - return @truncate(u32, std.hash.wyhash(s, 0)); + return @truncate(u32, std.hash.Wyhash.hash(0, s)); } const HeaderList = std.ArrayList(HeaderEntry); From 5bf63bfbf113d3921101311f1e3040890b94e798 Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Tue, 2 Jul 2019 18:40:01 +0200 Subject: [PATCH 07/10] make use of hashing streaming interface in autoHash --- std/hash_map.zig | 154 ++++++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 69 deletions(-) diff --git a/std/hash_map.zig b/std/hash_map.zig index 71cfecdd6d..d906b54618 100644 --- a/std/hash_map.zig +++ b/std/hash_map.zig @@ -522,8 +522,9 @@ pub fn getTrivialEqlFn(comptime K: type) (fn (K, K) bool) { pub fn getAutoHashFn(comptime K: type) (fn (K) u32) { return struct { fn hash(key: K) u32 { - const h = autoHash(key, 0); - return @truncate(u32, h); + var hasher = Wyhash.init(0); + autoHash(&hasher, key); + return @truncate(u32, hasher.final()); } }.hash; } @@ -538,10 +539,7 @@ pub fn getAutoEqlFn(comptime K: type) (fn (K, K) bool) { /// Provides generic hashing for any eligible type. /// Only hashes `key` itself, pointers are not followed. -/// The underlying hashing algorithm is wyhash. -pub fn autoHash(key: var, seed: u64) u64 { - // We use the fact that wyhash takes an input seed to "chain" hasing when the - // key has multiple parts that are not necessarily contiguous in memory. 
+pub fn autoHash(hasher: var, key: var) void { const Key = @typeOf(key); switch (@typeInfo(Key)) { builtin.TypeId.NoReturn, @@ -557,91 +555,101 @@ pub fn autoHash(key: var, seed: u64) u64 { builtin.TypeId.EnumLiteral, => @compileError("cannot hash this type"), - builtin.TypeId.Int => return Wyhash.hash(seed, std.mem.asBytes(&key)), + builtin.TypeId.Int => hasher.update(std.mem.asBytes(&key)), - builtin.TypeId.Float => |info| return autoHash(@bitCast(@IntType(false, info.bits), key), seed), + builtin.TypeId.Float => |info| autoHash(hasher, @bitCast(@IntType(false, info.bits), key)), - builtin.TypeId.Bool => return autoHash(@boolToInt(key), seed), - builtin.TypeId.Enum => return autoHash(@enumToInt(key), seed), - builtin.TypeId.ErrorSet => return autoHash(@errorToInt(key), seed), - builtin.TypeId.Promise, builtin.TypeId.Fn => return autoHash(@ptrToInt(key), seed), + builtin.TypeId.Bool => autoHash(hasher, @boolToInt(key)), + builtin.TypeId.Enum => autoHash(hasher, @enumToInt(key)), + builtin.TypeId.ErrorSet => autoHash(hasher, @errorToInt(key)), + builtin.TypeId.Promise, builtin.TypeId.Fn => autoHash(hasher, @ptrToInt(key)), - builtin.TypeId.Pointer => |info| return switch (info.size) { + builtin.TypeId.Pointer => |info| switch (info.size) { builtin.TypeInfo.Pointer.Size.One, builtin.TypeInfo.Pointer.Size.Many, builtin.TypeInfo.Pointer.Size.C, - => return autoHash(@ptrToInt(key), seed), + => autoHash(hasher, @ptrToInt(key)), - builtin.TypeInfo.Pointer.Size.Slice => return autoHash(key.len, autoHash(key.ptr, seed)), + builtin.TypeInfo.Pointer.Size.Slice => { + autoHash(hasher, key.ptr); + autoHash(hasher, key.len); + }, }, - builtin.TypeId.Optional => return if (key) |k| autoHash(k, seed) else 0, + builtin.TypeId.Optional => if (key) |k| autoHash(hasher, k), builtin.TypeId.Array => { // TODO detect via a trait when Key has no padding bits to // hash it as an array of bytes. // Otherwise, hash every element. - var s = seed; for (key) |element| { - // We reuse the hash of the previous element as the seed for the - // next one so that they're dependant. - s = autoHash(element, s); + autoHash(hasher, element); } - return s; }, builtin.TypeId.Vector => |info| { - // If there's no unused bits in the child type, we can just hash - // this as an array of bytes. if (info.child.bit_count % 8 == 0) { - return Wyhash.hash(seed, mem.asBytes(&key)); + // If there's no unused bits in the child type, we can just hash + // this as an array of bytes. + hasher.update(mem.asBytes(&key)); + } else { + // Otherwise, hash every element. + // TODO remove the copy to an array once field access is done. + const array: [info.len]info.child = key; + comptime var i: u32 = 0; + inline while (i < info.len) : (i += 1) { + autoHash(hasher, array[i]); + } } - - // Otherwise, hash every element. - var s = seed; - // TODO remove the copy to an array once field access is done. - const array: [info.len]info.child = key; - comptime var i: u32 = 0; - inline while (i < info.len) : (i += 1) { - s = autoHash(array[i], s); - } - return s; }, builtin.TypeId.Struct => |info| { // TODO detect via a trait when Key has no padding bits to // hash it as an array of bytes. // Otherwise, hash every field. - var s = seed; inline for (info.fields) |field| { // We reuse the hash of the previous field as the seed for the // next one so that they're dependant. 
- s = autoHash(@field(key, field.name), s); + autoHash(hasher, @field(key, field.name)); } - return s; }, - builtin.TypeId.Union => |info| { + builtin.TypeId.Union => |info| blk: { if (info.tag_type) |tag_type| { const tag = meta.activeTag(key); - const s = autoHash(tag, seed); + const s = autoHash(hasher, tag); inline for (info.fields) |field| { const enum_field = field.enum_field.?; if (enum_field.value == @enumToInt(tag)) { - return autoHash(@field(key, enum_field.name), s); + autoHash(hasher, @field(key, enum_field.name)); + // TODO use a labelled break when it does not crash the compiler. + // break :blk; + return; } } unreachable; } else @compileError("cannot hash untagged union type: " ++ @typeName(Key) ++ ", provide your own hash function"); }, - builtin.TypeId.ErrorUnion => { - return autoHash(key catch |err| return autoHash(err, seed), seed); + builtin.TypeId.ErrorUnion => blk: { + const payload = key catch |err| { + autoHash(hasher, err); + break :blk; + }; + autoHash(hasher, payload); }, } } +fn testAutoHash(key: var) u64 { + var hasher = Wyhash.init(0); + autoHash(&hasher, key); + return hasher.final(); +} + test "autoHash slice" { + // Allocate one array dynamically so that we're assured it is not merged + // with the other by the optimization passes. const array1 = try std.heap.direct_allocator.create([6]u32); defer std.heap.direct_allocator.destroy(array1); array1.* = [_]u32{ 1, 2, 3, 4, 5, 6 }; @@ -649,38 +657,46 @@ test "autoHash slice" { const a = array1[0..]; const b = array2[0..]; const c = array1[0..3]; - testing.expect(autoHash(a, 0) == autoHash(a, 0)); - testing.expect(autoHash(a, 0) != autoHash(array1, 0)); - testing.expect(autoHash(a, 0) != autoHash(b, 0)); - testing.expect(autoHash(a, 0) != autoHash(c, 0)); + testing.expect(testAutoHash(a) == testAutoHash(a)); + testing.expect(testAutoHash(a) != testAutoHash(array1)); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expect(testAutoHash(a) != testAutoHash(c)); } -test "autoHash optional" { +test "testAutoHash optional" { const a: ?u32 = 123; const b: ?u32 = null; - testing.expectEqual(autoHash(a, 0), autoHash(u32(123), 0)); - testing.expect(autoHash(a, 0) != autoHash(b, 0)); - testing.expectEqual(autoHash(b, 0), 0); + testing.expectEqual(testAutoHash(a), testAutoHash(u32(123))); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expectEqual(testAutoHash(b), 0); } -test "autoHash array" { +test "testAutoHash array" { const a = [_]u32{ 1, 2, 3 }; - const h = autoHash(a, 0); - testing.expectEqual(h, autoHash(u32(3), autoHash(u32(2), autoHash(u32(1), 0)))); + const h = testAutoHash(a); + var hasher = Wyhash.init(0); + autoHash(&hasher, u32(1)); + autoHash(&hasher, u32(2)); + autoHash(&hasher, u32(3)); + testing.expectEqual(h, hasher.final()); } -test "autoHash struct" { +test "testAutoHash struct" { const Foo = struct { a: u32 = 1, b: u32 = 2, c: u32 = 3, }; const f = Foo{}; - const h = autoHash(f, 0); - testing.expectEqual(h, autoHash(u32(3), autoHash(u32(2), autoHash(u32(1), 0)))); + const h = testAutoHash(f); + var hasher = Wyhash.init(0); + autoHash(&hasher, u32(1)); + autoHash(&hasher, u32(2)); + autoHash(&hasher, u32(3)); + testing.expectEqual(h, hasher.final()); } -test "autoHash union" { +test "testAutoHash union" { const Foo = union(enum) { A: u32, B: f32, @@ -690,24 +706,24 @@ test "autoHash union" { const a = Foo{ .A = 18 }; var b = Foo{ .B = 12.34 }; const c = Foo{ .C = 18 }; - testing.expect(autoHash(a, 0) == autoHash(a, 0)); - testing.expect(autoHash(a, 0) != autoHash(b, 0)); 
- testing.expect(autoHash(a, 0) != autoHash(c, 0)); + testing.expect(testAutoHash(a) == testAutoHash(a)); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expect(testAutoHash(a) != testAutoHash(c)); b = Foo{ .A = 18 }; - testing.expect(autoHash(a, 0) == autoHash(b, 0)); + testing.expect(testAutoHash(a) == testAutoHash(b)); } -test "autoHash vector" { +test "testAutoHash vector" { const a: @Vector(4, u32) = [_]u32{ 1, 2, 3, 4 }; const b: @Vector(4, u32) = [_]u32{ 1, 2, 3, 5 }; const c: @Vector(4, u31) = [_]u31{ 1, 2, 3, 4 }; - testing.expect(autoHash(a, 0) == autoHash(a, 0)); - testing.expect(autoHash(a, 0) != autoHash(b, 0)); - testing.expect(autoHash(a, 0) != autoHash(c, 0)); + testing.expect(testAutoHash(a) == testAutoHash(a)); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expect(testAutoHash(a) != testAutoHash(c)); } -test "autoHash error union" { +test "testAutoHash error union" { const Errors = error{Test}; const Foo = struct { a: u32 = 1, @@ -716,7 +732,7 @@ test "autoHash error union" { }; const f = Foo{}; const g: Errors!Foo = Errors.Test; - testing.expect(autoHash(f, 0) != autoHash(g, 0)); - testing.expect(autoHash(f, 0) == autoHash(Foo{}, 0)); - testing.expect(autoHash(g, 0) == autoHash(Errors.Test, 0)); + testing.expect(testAutoHash(f) != testAutoHash(g)); + testing.expect(testAutoHash(f) == testAutoHash(Foo{})); + testing.expect(testAutoHash(g) == testAutoHash(Errors.Test)); } From 4b5172d2879742b98e3e34b90b05ac28da9f39fe Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Tue, 2 Jul 2019 19:46:51 +0200 Subject: [PATCH 08/10] move autoHash into its own module since it can be used with any hash function implementing a streaming interface --- std/hash.zig | 4 + std/hash/auto_hash.zig | 208 +++++++++++++++++++++++++++++++++++++++++ std/hash_map.zig | 201 +-------------------------------------- 3 files changed, 213 insertions(+), 200 deletions(-) create mode 100644 std/hash/auto_hash.zig diff --git a/std/hash.zig b/std/hash.zig index e246fd0ad3..648f34b11d 100644 --- a/std/hash.zig +++ b/std/hash.zig @@ -1,6 +1,9 @@ const adler = @import("hash/adler.zig"); pub const Adler32 = adler.Adler32; +const auto_hash = @import("hash/auto_hash.zig"); +pub const autoHash = auto_hash.autoHash; + // pub for polynomials + generic crc32 construction pub const crc = @import("hash/crc.zig"); pub const Crc32 = crc.Crc32; @@ -30,6 +33,7 @@ pub const Wyhash = wyhash.Wyhash; test "hash" { _ = @import("hash/adler.zig"); + _ = @import("hash/auto_hash.zig"); _ = @import("hash/crc.zig"); _ = @import("hash/fnv.zig"); _ = @import("hash/siphash.zig"); diff --git a/std/hash/auto_hash.zig b/std/hash/auto_hash.zig new file mode 100644 index 0000000000..b21af0a1d8 --- /dev/null +++ b/std/hash/auto_hash.zig @@ -0,0 +1,208 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; +const meta = std.meta; + +/// Provides generic hashing for any eligible type. +/// Only hashes `key` itself, pointers are not followed. 
+pub fn autoHash(hasher: var, key: var) void { + const Key = @typeOf(key); + switch (@typeInfo(Key)) { + builtin.TypeId.NoReturn, + builtin.TypeId.Opaque, + builtin.TypeId.Undefined, + builtin.TypeId.ArgTuple, + builtin.TypeId.Void, + builtin.TypeId.Null, + builtin.TypeId.BoundFn, + builtin.TypeId.ComptimeFloat, + builtin.TypeId.ComptimeInt, + builtin.TypeId.Type, + builtin.TypeId.EnumLiteral, + => @compileError("cannot hash this type"), + + builtin.TypeId.Int => hasher.update(std.mem.asBytes(&key)), + + builtin.TypeId.Float => |info| autoHash(hasher, @bitCast(@IntType(false, info.bits), key)), + + builtin.TypeId.Bool => autoHash(hasher, @boolToInt(key)), + builtin.TypeId.Enum => autoHash(hasher, @enumToInt(key)), + builtin.TypeId.ErrorSet => autoHash(hasher, @errorToInt(key)), + builtin.TypeId.Promise, builtin.TypeId.Fn => autoHash(hasher, @ptrToInt(key)), + + builtin.TypeId.Pointer => |info| switch (info.size) { + builtin.TypeInfo.Pointer.Size.One, + builtin.TypeInfo.Pointer.Size.Many, + builtin.TypeInfo.Pointer.Size.C, + => autoHash(hasher, @ptrToInt(key)), + + builtin.TypeInfo.Pointer.Size.Slice => { + autoHash(hasher, key.ptr); + autoHash(hasher, key.len); + }, + }, + + builtin.TypeId.Optional => if (key) |k| autoHash(hasher, k), + + builtin.TypeId.Array => { + // TODO detect via a trait when Key has no padding bits to + // hash it as an array of bytes. + // Otherwise, hash every element. + for (key) |element| { + autoHash(hasher, element); + } + }, + + builtin.TypeId.Vector => |info| { + if (info.child.bit_count % 8 == 0) { + // If there's no unused bits in the child type, we can just hash + // this as an array of bytes. + hasher.update(mem.asBytes(&key)); + } else { + // Otherwise, hash every element. + // TODO remove the copy to an array once field access is done. + const array: [info.len]info.child = key; + comptime var i: u32 = 0; + inline while (i < info.len) : (i += 1) { + autoHash(hasher, array[i]); + } + } + }, + + builtin.TypeId.Struct => |info| { + // TODO detect via a trait when Key has no padding bits to + // hash it as an array of bytes. + // Otherwise, hash every field. + inline for (info.fields) |field| { + // We reuse the hash of the previous field as the seed for the + // next one so that they're dependant. + autoHash(hasher, @field(key, field.name)); + } + }, + + builtin.TypeId.Union => |info| blk: { + if (info.tag_type) |tag_type| { + const tag = meta.activeTag(key); + const s = autoHash(hasher, tag); + inline for (info.fields) |field| { + const enum_field = field.enum_field.?; + if (enum_field.value == @enumToInt(tag)) { + autoHash(hasher, @field(key, enum_field.name)); + // TODO use a labelled break when it does not crash the compiler. + // break :blk; + return; + } + } + unreachable; + } else @compileError("cannot hash untagged union type: " ++ @typeName(Key) ++ ", provide your own hash function"); + }, + + builtin.TypeId.ErrorUnion => blk: { + const payload = key catch |err| { + autoHash(hasher, err); + break :blk; + }; + autoHash(hasher, payload); + }, + } +} + +const testing = std.testing; +const Wyhash = std.hash.Wyhash; + +fn testAutoHash(key: var) u64 { + // Any hash could be used here, for testing autoHash. + var hasher = Wyhash.init(0); + autoHash(&hasher, key); + return hasher.final(); +} + +test "autoHash slice" { + // Allocate one array dynamically so that we're assured it is not merged + // with the other by the optimization passes. 
+ const array1 = try std.heap.direct_allocator.create([6]u32); + defer std.heap.direct_allocator.destroy(array1); + array1.* = [_]u32{ 1, 2, 3, 4, 5, 6 }; + const array2 = [_]u32{ 1, 2, 3, 4, 5, 6 }; + const a = array1[0..]; + const b = array2[0..]; + const c = array1[0..3]; + testing.expect(testAutoHash(a) == testAutoHash(a)); + testing.expect(testAutoHash(a) != testAutoHash(array1)); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expect(testAutoHash(a) != testAutoHash(c)); +} + +test "testAutoHash optional" { + const a: ?u32 = 123; + const b: ?u32 = null; + testing.expectEqual(testAutoHash(a), testAutoHash(u32(123))); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expectEqual(testAutoHash(b), 0); +} + +test "testAutoHash array" { + const a = [_]u32{ 1, 2, 3 }; + const h = testAutoHash(a); + var hasher = Wyhash.init(0); + autoHash(&hasher, u32(1)); + autoHash(&hasher, u32(2)); + autoHash(&hasher, u32(3)); + testing.expectEqual(h, hasher.final()); +} + +test "testAutoHash struct" { + const Foo = struct { + a: u32 = 1, + b: u32 = 2, + c: u32 = 3, + }; + const f = Foo{}; + const h = testAutoHash(f); + var hasher = Wyhash.init(0); + autoHash(&hasher, u32(1)); + autoHash(&hasher, u32(2)); + autoHash(&hasher, u32(3)); + testing.expectEqual(h, hasher.final()); +} + +test "testAutoHash union" { + const Foo = union(enum) { + A: u32, + B: f32, + C: u32, + }; + + const a = Foo{ .A = 18 }; + var b = Foo{ .B = 12.34 }; + const c = Foo{ .C = 18 }; + testing.expect(testAutoHash(a) == testAutoHash(a)); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expect(testAutoHash(a) != testAutoHash(c)); + + b = Foo{ .A = 18 }; + testing.expect(testAutoHash(a) == testAutoHash(b)); +} + +test "testAutoHash vector" { + const a: @Vector(4, u32) = [_]u32{ 1, 2, 3, 4 }; + const b: @Vector(4, u32) = [_]u32{ 1, 2, 3, 5 }; + const c: @Vector(4, u31) = [_]u31{ 1, 2, 3, 4 }; + testing.expect(testAutoHash(a) == testAutoHash(a)); + testing.expect(testAutoHash(a) != testAutoHash(b)); + testing.expect(testAutoHash(a) != testAutoHash(c)); +} + +test "testAutoHash error union" { + const Errors = error{Test}; + const Foo = struct { + a: u32 = 1, + b: u32 = 2, + c: u32 = 3, + }; + const f = Foo{}; + const g: Errors!Foo = Errors.Test; + testing.expect(testAutoHash(f) != testAutoHash(g)); + testing.expect(testAutoHash(f) == testAutoHash(Foo{})); + testing.expect(testAutoHash(g) == testAutoHash(Errors.Test)); +} diff --git a/std/hash_map.zig b/std/hash_map.zig index d906b54618..ab3c4c248d 100644 --- a/std/hash_map.zig +++ b/std/hash_map.zig @@ -5,6 +5,7 @@ const testing = std.testing; const math = std.math; const mem = std.mem; const meta = std.meta; +const autoHash = std.hash.autoHash; const Wyhash = std.hash.Wyhash; const Allocator = mem.Allocator; const builtin = @import("builtin"); @@ -536,203 +537,3 @@ pub fn getAutoEqlFn(comptime K: type) (fn (K, K) bool) { } }.eql; } - -/// Provides generic hashing for any eligible type. -/// Only hashes `key` itself, pointers are not followed. 
-pub fn autoHash(hasher: var, key: var) void { - const Key = @typeOf(key); - switch (@typeInfo(Key)) { - builtin.TypeId.NoReturn, - builtin.TypeId.Opaque, - builtin.TypeId.Undefined, - builtin.TypeId.ArgTuple, - builtin.TypeId.Void, - builtin.TypeId.Null, - builtin.TypeId.BoundFn, - builtin.TypeId.ComptimeFloat, - builtin.TypeId.ComptimeInt, - builtin.TypeId.Type, - builtin.TypeId.EnumLiteral, - => @compileError("cannot hash this type"), - - builtin.TypeId.Int => hasher.update(std.mem.asBytes(&key)), - - builtin.TypeId.Float => |info| autoHash(hasher, @bitCast(@IntType(false, info.bits), key)), - - builtin.TypeId.Bool => autoHash(hasher, @boolToInt(key)), - builtin.TypeId.Enum => autoHash(hasher, @enumToInt(key)), - builtin.TypeId.ErrorSet => autoHash(hasher, @errorToInt(key)), - builtin.TypeId.Promise, builtin.TypeId.Fn => autoHash(hasher, @ptrToInt(key)), - - builtin.TypeId.Pointer => |info| switch (info.size) { - builtin.TypeInfo.Pointer.Size.One, - builtin.TypeInfo.Pointer.Size.Many, - builtin.TypeInfo.Pointer.Size.C, - => autoHash(hasher, @ptrToInt(key)), - - builtin.TypeInfo.Pointer.Size.Slice => { - autoHash(hasher, key.ptr); - autoHash(hasher, key.len); - }, - }, - - builtin.TypeId.Optional => if (key) |k| autoHash(hasher, k), - - builtin.TypeId.Array => { - // TODO detect via a trait when Key has no padding bits to - // hash it as an array of bytes. - // Otherwise, hash every element. - for (key) |element| { - autoHash(hasher, element); - } - }, - - builtin.TypeId.Vector => |info| { - if (info.child.bit_count % 8 == 0) { - // If there's no unused bits in the child type, we can just hash - // this as an array of bytes. - hasher.update(mem.asBytes(&key)); - } else { - // Otherwise, hash every element. - // TODO remove the copy to an array once field access is done. - const array: [info.len]info.child = key; - comptime var i: u32 = 0; - inline while (i < info.len) : (i += 1) { - autoHash(hasher, array[i]); - } - } - }, - - builtin.TypeId.Struct => |info| { - // TODO detect via a trait when Key has no padding bits to - // hash it as an array of bytes. - // Otherwise, hash every field. - inline for (info.fields) |field| { - // We reuse the hash of the previous field as the seed for the - // next one so that they're dependant. - autoHash(hasher, @field(key, field.name)); - } - }, - - builtin.TypeId.Union => |info| blk: { - if (info.tag_type) |tag_type| { - const tag = meta.activeTag(key); - const s = autoHash(hasher, tag); - inline for (info.fields) |field| { - const enum_field = field.enum_field.?; - if (enum_field.value == @enumToInt(tag)) { - autoHash(hasher, @field(key, enum_field.name)); - // TODO use a labelled break when it does not crash the compiler. - // break :blk; - return; - } - } - unreachable; - } else @compileError("cannot hash untagged union type: " ++ @typeName(Key) ++ ", provide your own hash function"); - }, - - builtin.TypeId.ErrorUnion => blk: { - const payload = key catch |err| { - autoHash(hasher, err); - break :blk; - }; - autoHash(hasher, payload); - }, - } -} - -fn testAutoHash(key: var) u64 { - var hasher = Wyhash.init(0); - autoHash(&hasher, key); - return hasher.final(); -} - -test "autoHash slice" { - // Allocate one array dynamically so that we're assured it is not merged - // with the other by the optimization passes. 
- const array1 = try std.heap.direct_allocator.create([6]u32); - defer std.heap.direct_allocator.destroy(array1); - array1.* = [_]u32{ 1, 2, 3, 4, 5, 6 }; - const array2 = [_]u32{ 1, 2, 3, 4, 5, 6 }; - const a = array1[0..]; - const b = array2[0..]; - const c = array1[0..3]; - testing.expect(testAutoHash(a) == testAutoHash(a)); - testing.expect(testAutoHash(a) != testAutoHash(array1)); - testing.expect(testAutoHash(a) != testAutoHash(b)); - testing.expect(testAutoHash(a) != testAutoHash(c)); -} - -test "testAutoHash optional" { - const a: ?u32 = 123; - const b: ?u32 = null; - testing.expectEqual(testAutoHash(a), testAutoHash(u32(123))); - testing.expect(testAutoHash(a) != testAutoHash(b)); - testing.expectEqual(testAutoHash(b), 0); -} - -test "testAutoHash array" { - const a = [_]u32{ 1, 2, 3 }; - const h = testAutoHash(a); - var hasher = Wyhash.init(0); - autoHash(&hasher, u32(1)); - autoHash(&hasher, u32(2)); - autoHash(&hasher, u32(3)); - testing.expectEqual(h, hasher.final()); -} - -test "testAutoHash struct" { - const Foo = struct { - a: u32 = 1, - b: u32 = 2, - c: u32 = 3, - }; - const f = Foo{}; - const h = testAutoHash(f); - var hasher = Wyhash.init(0); - autoHash(&hasher, u32(1)); - autoHash(&hasher, u32(2)); - autoHash(&hasher, u32(3)); - testing.expectEqual(h, hasher.final()); -} - -test "testAutoHash union" { - const Foo = union(enum) { - A: u32, - B: f32, - C: u32, - }; - - const a = Foo{ .A = 18 }; - var b = Foo{ .B = 12.34 }; - const c = Foo{ .C = 18 }; - testing.expect(testAutoHash(a) == testAutoHash(a)); - testing.expect(testAutoHash(a) != testAutoHash(b)); - testing.expect(testAutoHash(a) != testAutoHash(c)); - - b = Foo{ .A = 18 }; - testing.expect(testAutoHash(a) == testAutoHash(b)); -} - -test "testAutoHash vector" { - const a: @Vector(4, u32) = [_]u32{ 1, 2, 3, 4 }; - const b: @Vector(4, u32) = [_]u32{ 1, 2, 3, 5 }; - const c: @Vector(4, u31) = [_]u31{ 1, 2, 3, 4 }; - testing.expect(testAutoHash(a) == testAutoHash(a)); - testing.expect(testAutoHash(a) != testAutoHash(b)); - testing.expect(testAutoHash(a) != testAutoHash(c)); -} - -test "testAutoHash error union" { - const Errors = error{Test}; - const Foo = struct { - a: u32 = 1, - b: u32 = 2, - c: u32 = 3, - }; - const f = Foo{}; - const g: Errors!Foo = Errors.Test; - testing.expect(testAutoHash(f) != testAutoHash(g)); - testing.expect(testAutoHash(f) == testAutoHash(Foo{})); - testing.expect(testAutoHash(g) == testAutoHash(Errors.Test)); -} From 3faf5d38576616d033c343130607189eb9fe613c Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Tue, 16 Jul 2019 20:31:02 +0200 Subject: [PATCH 09/10] wyhash: stateless is faster for both iterative hashing and small keys. --- std/hash/wyhash.zig | 52 +++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/std/hash/wyhash.zig b/std/hash/wyhash.zig index 49119c5a95..dfa5156cad 100644 --- a/std/hash/wyhash.zig +++ b/std/hash/wyhash.zig @@ -33,16 +33,11 @@ fn mix1(a: u64, b: u64, seed: u64) u64 { pub const Wyhash = struct { seed: u64, - - buf: [32]u8, - buf_len: usize, msg_len: usize, pub fn init(seed: u64) Wyhash { return Wyhash{ .seed = seed, - .buf = undefined, - .buf_len = 0, .msg_len = 0, }; } @@ -61,34 +56,12 @@ pub const Wyhash = struct { ); } - pub fn update(self: *Wyhash, b: []const u8) void { - var off: usize = 0; + fn partial(self: *Wyhash, b: []const u8) void { + const rem_key = b; + const rem_len = b.len; - // Partial from previous. 
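        // Buffering removed by this patch: if an earlier update() call left
        // bytes in self.buf, the block below tops the buffer up to a full
        // 32-byte block from the start of b and hashes it with round() before
        // continuing; self.buf_len tracks how many buffered bytes are pending.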
- if (self.buf_len != 0 and self.buf_len + b.len > 32) { - off += 32 - self.buf_len; - mem.copy(u8, self.buf[self.buf_len..], b[0..off]); - self.round(self.buf[0..]); - self.buf_len = 0; - } - - // Full middle blocks. - while (off + 32 <= b.len) : (off += 32) { - @inlineCall(self.round, b[off .. off + 32]); - } - - // Remainder for next pass. - mem.copy(u8, self.buf[self.buf_len..], b[off..]); - self.buf_len += @intCast(u8, b[off..].len); - self.msg_len += b.len; - } - - pub fn final(self: *Wyhash) u64 { - const seed = self.seed; - const rem_len = @intCast(u5, self.buf_len); - const rem_key = self.buf[0..self.buf_len]; - - self.seed = switch (rem_len) { + var seed = self.seed; + seed = switch (@intCast(u5, rem_len)) { 0 => seed, 1 => mix0(read_bytes(1, rem_key), primes[4], seed), 2 => mix0(read_bytes(2, rem_key), primes[4], seed), @@ -122,7 +95,22 @@ pub const Wyhash = struct { 30 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 16) | read_bytes(2, rem_key[28..]), seed), 31 => mix0(read_8bytes_swapped(rem_key), read_8bytes_swapped(rem_key[8..]), seed) ^ mix1(read_8bytes_swapped(rem_key[16..]), (read_bytes(4, rem_key[24..]) << 24) | (read_bytes(2, rem_key[28..]) << 8) | read_bytes(1, rem_key[30..]), seed), }; + self.seed = seed; + } + pub fn update(self: *Wyhash, b: []const u8) void { + var off: usize = 0; + + // Full middle blocks. + while (off + 32 <= b.len) : (off += 32) { + @inlineCall(self.round, b[off .. off + 32]); + } + + self.partial(b[off..]); + self.msg_len += b.len; + } + + pub fn final(self: *Wyhash) u64 { return mum(self.seed ^ self.msg_len, primes[4]); } From 54255ee32e1e6c83b04c3e5f2f1dd7e8aa5e0dd7 Mon Sep 17 00:00:00 2001 From: Sahnvour Date: Tue, 16 Jul 2019 22:32:10 +0200 Subject: [PATCH 10/10] autohash: force inlining of integer hashing so that the optimizer can see the fast path based on key's size which is known at comptime otherwise it will always outline the call to hasher.update, resulting in much worse performance --- std/hash/auto_hash.zig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/std/hash/auto_hash.zig b/std/hash/auto_hash.zig index b21af0a1d8..2da9691ffd 100644 --- a/std/hash/auto_hash.zig +++ b/std/hash/auto_hash.zig @@ -21,7 +21,9 @@ pub fn autoHash(hasher: var, key: var) void { builtin.TypeId.EnumLiteral, => @compileError("cannot hash this type"), - builtin.TypeId.Int => hasher.update(std.mem.asBytes(&key)), + // Help the optimizer see that hashing an int is easy by inlining! + // TODO Check if the situation is better after #561 is resolved. + builtin.TypeId.Int => @inlineCall(hasher.update, std.mem.asBytes(&key)), builtin.TypeId.Float => |info| autoHash(hasher, @bitCast(@IntType(false, info.bits), key)),
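// A minimal sketch, not part of these patches, of the small-key path that the
// last two commits target. It assumes the std.hash.autoHash and std.hash.Wyhash
// re-exports already used by the hash_map.zig hunk above, and it uses the
// same-era syntax as the diffs; illustrative only.
const std = @import("std");
const autoHash = std.hash.autoHash;
const Wyhash = std.hash.Wyhash;

fn hashId(id: u64) u64 {
    var hasher = Wyhash.init(0);
    // For an integer key, autoHash reduces to an inlined
    // hasher.update(std.mem.asBytes(&key)): a single 8-byte update that the
    // stateless Wyhash mixes in one pass through partial(), with no 32-byte
    // round() and no buffer to maintain.
    autoHash(&hasher, id);
    return hasher.final();
}

test "hashId sketch" {
    // Equal keys hash equally; the concrete value depends on the wyhash
    // constants and the seed, so it is not asserted here.
    std.testing.expectEqual(hashId(42), hashId(42));
}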