From dfbe65e8cd62625c5ef783aee01056bac678ecf1 Mon Sep 17 00:00:00 2001 From: andersfr Date: Sat, 6 Jul 2019 15:12:07 +0200 Subject: [PATCH 1/4] Ported CityHash and Murmur hashing algorithms to native zig --- std/hash.zig | 11 ++ std/hash/cityhash.zig | 387 ++++++++++++++++++++++++++++++++++++++++++ std/hash/murmur.zig | 322 +++++++++++++++++++++++++++++++++++ 3 files changed, 720 insertions(+) create mode 100644 std/hash/cityhash.zig create mode 100644 std/hash/murmur.zig diff --git a/std/hash.zig b/std/hash.zig index 19ee62824d..148504aa39 100644 --- a/std/hash.zig +++ b/std/hash.zig @@ -14,9 +14,20 @@ const siphash = @import("hash/siphash.zig"); pub const SipHash64 = siphash.SipHash64; pub const SipHash128 = siphash.SipHash128; +pub const murmur = @import("hash/murmur.zig"); +pub const Murmur2_32 = murmur.Murmur2_32; +pub const Murmur2_64 = murmur.Murmur2_64; +pub const Murmur3_32 = murmur.Murmur3_32; + +pub const cityhash = @import("hash/cityhash.zig"); +pub const CityHash32 = cityhash.CityHash32; +pub const CityHash64 = cityhash.CityHash64; + test "hash" { _ = @import("hash/adler.zig"); _ = @import("hash/crc.zig"); _ = @import("hash/fnv.zig"); _ = @import("hash/siphash.zig"); + _ = @import("hash/murmur.zig"); + _ = @import("hash/cityhash.zig"); } diff --git a/std/hash/cityhash.zig b/std/hash/cityhash.zig new file mode 100644 index 0000000000..3599d95ef9 --- /dev/null +++ b/std/hash/cityhash.zig @@ -0,0 +1,387 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +pub const CityHash32 = struct { + const Self = @This(); + + // Magic numbers for 32-bit hashing. Copied from Murmur3. + const c1: u32 = 0xcc9e2d51; + const c2: u32 = 0x1b873593; + + fn fetch32(ptr: [*]const u8) u32 { + var v: u32 = undefined; + @memcpy(@ptrCast([*]u8, &v), ptr, 4); + if (builtin.endian == builtin.Endian.Big) + return @byteSwap(u32, v); + return v; + } + + // A 32-bit to 32-bit integer hash copied from Murmur3. + fn fmix(h: u32) u32 { + var h1: u32 = h; + h1 ^= h1 >> 16; + h1 *%= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *%= 0xc2b2ae35; + h1 ^= h1 >> 16; + return h1; + } + + // Rotate right helper + fn rotr32(x: u32, comptime r: u32) u32 { + return (x >> r) | (x << (32 - r)); + } + + // Helper from Murmur3 for combining two 32-bit values. + fn mur(a: u32, h: u32) u32 { + var a1: u32 = a; + var h1: u32 = h; + a1 *%= c1; + a1 = rotr32(a1, 17); + a1 *%= c2; + h1 ^= a1; + h1 = rotr32(h1, 19); + return h1 *% 5 +% 0xe6546b64; + } + + fn hash32Len0To4(str: []const u8) u32 { + const len: u32 = @intCast(u32, str.len); + var b: u32 = 0; + var c: u32 = 9; + for (str) |v| { + b = b *% c1 +% @bitCast(u32, @intCast(i32, @bitCast(i8, v))); + c ^= b; + } + return fmix(mur(b, mur(len, c))); + } + + fn hash32Len5To12(str: []const u8) u32 { + var a: u32 = @intCast(u32, str.len); + var b: u32 = a *% 5; + var c: u32 = 9; + const d: u32 = b; + + a +%= fetch32(str.ptr); + b +%= fetch32(str.ptr + str.len - 4); + c +%= fetch32(str.ptr + ((str.len >> 1) & 4)); + + return fmix(mur(c, mur(b, mur(a, d)))); + } + + fn hash32Len13To24(str: []const u8) u32 { + const len: u32 = @intCast(u32, str.len); + const a: u32 = fetch32(str.ptr + (str.len >> 1) - 4); + const b: u32 = fetch32(str.ptr + 4); + const c: u32 = fetch32(str.ptr + str.len - 8); + const d: u32 = fetch32(str.ptr + (str.len >> 1)); + const e: u32 = fetch32(str.ptr); + const f: u32 = fetch32(str.ptr + str.len - 4); + + return fmix(mur(f, mur(e, mur(d, mur(c, mur(b, mur(a, len))))))); + } + + pub fn hash(str: []const u8) u32 { + if (str.len <= 24) { + if (str.len <= 4) { + return hash32Len0To4(str); + } else { + if (str.len <= 12) + return hash32Len5To12(str); + return hash32Len13To24(str); + } + } + + const len: u32 = @intCast(u32, str.len); + var h: u32 = len; + var g: u32 = c1 *% len; + var f: u32 = g; + + const a0: u32 = rotr32(fetch32(str.ptr + str.len - 4) *% c1, 17) *% c2; + const a1: u32 = rotr32(fetch32(str.ptr + str.len - 8) *% c1, 17) *% c2; + const a2: u32 = rotr32(fetch32(str.ptr + str.len - 16) *% c1, 17) *% c2; + const a3: u32 = rotr32(fetch32(str.ptr + str.len - 12) *% c1, 17) *% c2; + const a4: u32 = rotr32(fetch32(str.ptr + str.len - 20) *% c1, 17) *% c2; + + h ^= a0; + h = rotr32(h, 19); + h = h *% 5 +% 0xe6546b64; + h ^= a2; + h = rotr32(h, 19); + h = h *% 5 +% 0xe6546b64; + g ^= a1; + g = rotr32(g, 19); + g = g *% 5 +% 0xe6546b64; + g ^= a3; + g = rotr32(g, 19); + g = g *% 5 +% 0xe6546b64; + f +%= a4; + f = rotr32(f, 19); + f = f *% 5 +% 0xe6546b64; + var iters = (str.len - 1) / 20; + var ptr = str.ptr; + while (iters != 0) : (iters -= 1) { + const b0: u32 = rotr32(fetch32(ptr) *% c1, 17) *% c2; + const b1: u32 = fetch32(ptr + 4); + const b2: u32 = rotr32(fetch32(ptr + 8) *% c1, 17) *% c2; + const b3: u32 = rotr32(fetch32(ptr + 12) *% c1, 17) *% c2; + const b4: u32 = fetch32(ptr + 16); + + h ^= b0; + h = rotr32(h, 18); + h = h *% 5 +% 0xe6546b64; + f +%= b1; + f = rotr32(f, 19); + f = f *% c1; + g +%= b2; + g = rotr32(g, 18); + g = g *% 5 +% 0xe6546b64; + h ^= b3 +% b1; + h = rotr32(h, 19); + h = h *% 5 +% 0xe6546b64; + g ^= b4; + g = @byteSwap(u32, g) *% 5; + h +%= b4 *% 5; + h = @byteSwap(u32, h); + f +%= b0; + const t: u32 = h; + h = f; + f = g; + g = t; + ptr += 20; + } + g = rotr32(g, 11) *% c1; + g = rotr32(g, 17) *% c1; + f = rotr32(f, 11) *% c1; + f = rotr32(f, 17) *% c1; + h = rotr32(h +% g, 19); + h = h *% 5 +% 0xe6546b64; + h = rotr32(h, 17) *% c1; + h = rotr32(h +% f, 19); + h = h *% 5 +% 0xe6546b64; + h = rotr32(h, 17) *% c1; + return h; + } +}; + +pub const CityHash64 = struct { + const Self = @This(); + + // Some primes between 2^63 and 2^64 for various uses. + const k0: u64 = 0xc3a5c85c97cb3127; + const k1: u64 = 0xb492b66fbe98f273; + const k2: u64 = 0x9ae16a3b2f90404f; + + fn fetch32(ptr: [*]const u8) u32 { + var v: u32 = undefined; + @memcpy(@ptrCast([*]u8, &v), ptr, 4); + if (builtin.endian == builtin.Endian.Big) + return @byteSwap(u32, v); + return v; + } + + fn fetch64(ptr: [*]const u8) u64 { + var v: u64 = undefined; + @memcpy(@ptrCast([*]u8, &v), ptr, 8); + if (builtin.endian == builtin.Endian.Big) + return @byteSwap(u64, v); + return v; + } + + // Rotate right helper + fn rotr64(x: u64, comptime r: u64) u64 { + return (x >> r) | (x << (64 - r)); + } + + fn shiftmix(v: u64) u64 { + return v ^ (v >> 47); + } + + fn hashLen16(u: u64, v: u64) u64 { + return @inlineCall(hash128To64, u, v); + } + + fn hashLen16Mul(low: u64, high: u64, mul: u64) u64 { + var a: u64 = (low ^ high) *% mul; + a ^= (a >> 47); + var b: u64 = (high ^ a) *% mul; + b ^= (b >> 47); + b *%= mul; + return b; + } + + fn hash128To64(low: u64, high: u64) u64 { + return @inlineCall(hashLen16Mul, low, high, 0x9ddfea08eb382d69); + } + + fn hashLen0To16(str: []const u8) u64 { + const len: u64 = @intCast(u64, str.len); + if (len >= 8) { + const mul: u64 = k2 +% len *% 2; + const a: u64 = fetch64(str.ptr) + k2; + const b: u64 = fetch64(str.ptr + str.len - 8); + const c: u64 = rotr64(b, 37) *% mul +% a; + const d: u64 = (rotr64(a, 25) +% b) *% mul; + return hashLen16Mul(c, d, mul); + } + if (len >= 4) { + const mul: u64 = k2 +% len *% 2; + const a: u64 = fetch32(str.ptr); + return hashLen16Mul(len +% (a << 3), fetch32(str.ptr + str.len - 4), mul); + } + if (len > 0) { + const a: u8 = str[0]; + const b: u8 = str[str.len >> 1]; + const c: u8 = str[str.len - 1]; + const y: u32 = @intCast(u32, a) +% (@intCast(u32, b) << 8); + const z: u32 = @intCast(u32, str.len) +% (@intCast(u32, c) << 2); + return shiftmix(@intCast(u64, y) *% k2 ^ @intCast(u64, z) *% k0) *% k2; + } + return k2; + } + + fn hashLen17To32(str: []const u8) u64 { + const len: u64 = @intCast(u64, str.len); + const mul: u64 = k2 +% len *% 2; + const a: u64 = fetch64(str.ptr) *% k1; + const b: u64 = fetch64(str.ptr + 8); + const c: u64 = fetch64(str.ptr + str.len - 8) *% mul; + const d: u64 = fetch64(str.ptr + str.len - 16) *% k2; + + return hashLen16Mul(rotr64(a +% b, 43) +% rotr64(c, 30) +% d, a +% rotr64(b +% k2, 18) +% c, mul); + } + + fn hashLen33To64(str: []const u8) u64 { + const len: u64 = @intCast(u64, str.len); + const mul: u64 = k2 +% len *% 2; + const a: u64 = fetch64(str.ptr) *% k2; + const b: u64 = fetch64(str.ptr + 8); + const c: u64 = fetch64(str.ptr + str.len - 24); + const d: u64 = fetch64(str.ptr + str.len - 32); + const e: u64 = fetch64(str.ptr + 16) *% k2; + const f: u64 = fetch64(str.ptr + 24) *% 9; + const g: u64 = fetch64(str.ptr + str.len - 8); + const h: u64 = fetch64(str.ptr + str.len - 16) *% mul; + + const u: u64 = rotr64(a +% g, 43) +% (rotr64(b, 30) +% c) *% 9; + const v: u64 = ((a +% g) ^ d) +% f +% 1; + const w: u64 = @byteSwap(u64, (u +% v) *% mul) +% h; + const x: u64 = rotr64(e +% f, 42) +% c; + const y: u64 = (@byteSwap(u64, (v +% w) *% mul) +% g) *% mul; + const z: u64 = e +% f +% c; + const a1: u64 = @byteSwap(u64, (x +% z) *% mul +% y) +% b; + const b1: u64 = shiftmix((z +% a1) *% mul +% d +% h) *% mul; + return b1 +% x; + } + + const WeakPair = struct { + first: u64, + second: u64, + }; + + fn weakHashLen32WithSeedsHelper(w: u64, x: u64, y: u64, z: u64, a: u64, b: u64) WeakPair { + var a1: u64 = a; + var b1: u64 = b; + a1 +%= w; + b1 = rotr64(b1 +% a1 +% z, 21); + var c: u64 = a1; + a1 +%= x; + a1 +%= y; + b1 +%= rotr64(a1, 44); + return WeakPair{ .first = a1 +% z, .second = b1 +% c }; + } + + fn weakHashLen32WithSeeds(ptr: [*]const u8, a: u64, b: u64) WeakPair { + return @inlineCall(weakHashLen32WithSeedsHelper, fetch64(ptr), fetch64(ptr + 8), fetch64(ptr + 16), fetch64(ptr + 24), a, b); + } + + pub fn hash(str: []const u8) u64 { + if (str.len <= 32) { + if (str.len <= 16) { + return hashLen0To16(str); + } else { + return hashLen17To32(str); + } + } else if (str.len <= 64) { + return hashLen33To64(str); + } + + var len: u64 = @intCast(u64, str.len); + + var x: u64 = fetch64(str.ptr + str.len - 40); + var y: u64 = fetch64(str.ptr + str.len - 16) +% fetch64(str.ptr + str.len - 56); + var z: u64 = hashLen16(fetch64(str.ptr + str.len - 48) +% len, fetch64(str.ptr + str.len - 24)); + var v: WeakPair = weakHashLen32WithSeeds(str.ptr + str.len - 64, len, z); + var w: WeakPair = weakHashLen32WithSeeds(str.ptr + str.len - 32, y +% k1, x); + + x = x *% k1 +% fetch64(str.ptr); + len = (len - 1) & ~@intCast(u64, 63); + + var ptr: [*]const u8 = str.ptr; + while (true) { + x = rotr64(x +% y +% v.first +% fetch64(ptr + 8), 37) *% k1; + y = rotr64(y +% v.second +% fetch64(ptr + 48), 42) *% k1; + x ^= w.second; + y +%= v.first +% fetch64(ptr + 40); + z = rotr64(z +% w.first, 33) *% k1; + v = weakHashLen32WithSeeds(ptr, v.second *% k1, x +% w.first); + w = weakHashLen32WithSeeds(ptr + 32, z +% w.second, y +% fetch64(ptr + 16)); + const t: u64 = z; + z = x; + x = t; + + ptr += 64; + len -= 64; + if (len == 0) + break; + } + + return hashLen16(hashLen16(v.first, w.first) +% shiftmix(y) *% k1 +% z, hashLen16(v.second, w.second) +% x); + } + + pub fn hashWithSeed(str: []const u8, seed: u64) u64 { + return @inlineCall(Self.hashWithSeeds, str, k2, seed); + } + + pub fn hashWithSeeds(str: []const u8, seed0: u64, seed1: u64) u64 { + return hashLen16(hash(str) -% seed0, seed1); + } +}; + +fn SMHasherTest(comptime hash_fn: var, comptime hashbits: u32) u32 { + const hashbytes = hashbits / 8; + var key: [256]u8 = undefined; + var hashes: [hashbytes * 256]u8 = undefined; + var final: [hashbytes]u8 = undefined; + + @memset(@ptrCast([*]u8, &key[0]), 0, @sizeOf(@typeOf(key))); + @memset(@ptrCast([*]u8, &hashes[0]), 0, @sizeOf(@typeOf(hashes))); + @memset(@ptrCast([*]u8, &final[0]), 0, @sizeOf(@typeOf(final))); + + var i: u32 = 0; + while (i < 256) : (i += 1) { + key[i] = @intCast(u8, i); + + var h = hash_fn(key[0..i], 256 - i); + if (builtin.endian == builtin.Endian.Big) + h = @byteSwap(@typeOf(h), h); + @memcpy(@ptrCast([*]u8, &hashes[i * hashbytes]), @ptrCast([*]u8, &h), hashbytes); + } + + return @intCast(u32, hash_fn(hashes, 0) & 0xffffffff); +} + +fn CityHash32hashIgnoreSeed(str: []const u8, seed: u32) u32 { + return CityHash32.hash(str); +} + +test "cityhash32" { + // Note: SMHasher doesn't provide a 32bit version of the algorithm. + // Note: The implementation was verified against the Google Abseil version. + std.testing.expectEqual(SMHasherTest(CityHash32hashIgnoreSeed, 32), 0x68254F81); +} + +test "cityhash64" { + // Note: This is not compliant with the SMHasher implementation of CityHash64! + // Note: The implementation was verified against the Google Abseil version. + std.testing.expectEqual(SMHasherTest(CityHash64.hashWithSeed, 64), 0x5FABC5C5); +} diff --git a/std/hash/murmur.zig b/std/hash/murmur.zig new file mode 100644 index 0000000000..ca2f23aa98 --- /dev/null +++ b/std/hash/murmur.zig @@ -0,0 +1,322 @@ +const std = @import("std"); +const builtin = @import("builtin"); + +const default_seed: u32 = 0xc70f6907; + +pub const Murmur2_32 = struct { + const Self = @This(); + + pub fn hash(str: []const u8) u32 { + return @inlineCall(Self.hashWithSeed, str, default_seed); + } + + pub fn hashWithSeed(str: []const u8, seed: u32) u32 { + const m: u32 = 0x5bd1e995; + const len = @intCast(u32, str.len); + var h1: u32 = seed ^ len; + for (@ptrCast([*]allowzero align(1) const u32, str.ptr)[0..(len >> 2)]) |v| { + var k1: u32 = v; + if (builtin.endian == builtin.Endian.Big) + k1 = @byteSwap(u32, k1); + k1 *%= m; + k1 ^= k1 >> 24; + k1 *%= m; + h1 *%= m; + h1 ^= k1; + } + const offset = len & 0xfffffffc; + const rest = len & 3; + if (rest >= 3) { + h1 ^= @intCast(u32, str[offset + 2]) << 16; + } + if (rest >= 2) { + h1 ^= @intCast(u32, str[offset + 1]) << 8; + } + if (rest >= 1) { + h1 ^= @intCast(u32, str[offset + 0]); + h1 *%= m; + } + h1 ^= h1 >> 13; + h1 *%= m; + h1 ^= h1 >> 15; + return h1; + } + + pub fn hashUint32(v: u32) u32 { + return @inlineCall(Self.hashUint32WithSeed, v, default_seed); + } + + pub fn hashUint32WithSeed(v: u32, seed: u32) u32 { + const m: u32 = 0x5bd1e995; + const len: u32 = 4; + var h1: u32 = seed ^ len; + var k1: u32 = undefined; + k1 = v *% m; + k1 ^= k1 >> 24; + k1 *%= m; + h1 *%= m; + h1 ^= k1; + h1 ^= h1 >> 13; + h1 *%= m; + h1 ^= h1 >> 15; + return h1; + } + + pub fn hashUint64(v: u64) u32 { + return @inlineCall(Self.hashUint64WithSeed, v, default_seed); + } + + pub fn hashUint64WithSeed(v: u64, seed: u32) u32 { + const m: u32 = 0x5bd1e995; + const len: u32 = 4; + var h1: u32 = seed ^ len; + var k1: u32 = undefined; + k1 = @intCast(u32, v) *% m; + k1 ^= k1 >> 24; + k1 *%= m; + h1 *%= m; + h1 ^= k1; + k1 = @intCast(u32, v >> 32) *% m; + k1 ^= k1 >> 24; + k1 *%= m; + h1 *%= m; + h1 ^= k1; + h1 ^= h1 >> 13; + h1 *%= m; + h1 ^= h1 >> 15; + return h1; + } +}; + +pub const Murmur2_64 = struct { + const Self = @This(); + + pub fn hash(str: []const u8) u64 { + return @inlineCall(Self.hashWithSeed, str, default_seed); + } + + pub fn hashWithSeed(str: []const u8, seed: u64) u64 { + const m: u64 = 0xc6a4a7935bd1e995; + const len = @intCast(u64, str.len); + var h1: u64 = seed ^ (len *% m); + for (@ptrCast([*]allowzero align(1) const u64, str.ptr)[0..(len >> 3)]) |v| { + var k1: u64 = v; + if (builtin.endian == builtin.Endian.Big) + k1 = @byteSwap(u64, k1); + k1 *%= m; + k1 ^= k1 >> 47; + k1 *%= m; + h1 ^= k1; + h1 *%= m; + } + const rest = len & 7; + const offset = len - rest; + if (rest > 0) { + var k1: u64 = 0; + @memcpy(@ptrCast([*]u8, &k1), @ptrCast([*]const u8, &str[offset]), rest); + if (builtin.endian == builtin.Endian.Big) + k1 = @byteSwap(u64, k1); + h1 ^= k1; + h1 *%= m; + } + h1 ^= h1 >> 47; + h1 *%= m; + h1 ^= h1 >> 47; + return h1; + } + + pub fn hashUint32(v: u32) u64 { + return @inlineCall(Self.hashUint32WithSeed, v, default_seed); + } + + pub fn hashUint32WithSeed(v: u32, seed: u32) u64 { + const m: u64 = 0xc6a4a7935bd1e995; + const len: u64 = 4; + var h1: u64 = seed ^ (len *% m); + var k1: u64 = undefined; + k1 = v *% m; + k1 ^= k1 >> 47; + k1 *%= m; + h1 ^= k1; + h1 *%= m; + h1 ^= h1 >> 47; + h1 *%= m; + h1 ^= h1 >> 47; + return h1; + } + + pub fn hashUint64(v: u64) u64 { + return @inlineCall(Self.hashUint64WithSeed, v, default_seed); + } + + pub fn hashUint64WithSeed(v: u64, seed: u32) u64 { + const m: u64 = 0xc6a4a7935bd1e995; + const len: u64 = 8; + var h1: u64 = seed ^ (len *% m); + var k1: u64 = undefined; + k1 = @intCast(u32, v) *% m; + k1 ^= k1 >> 47; + k1 *%= m; + h1 ^= k1; + h1 *%= m; + k1 = @intCast(u32, v >> 32) *% m; + k1 ^= k1 >> 47; + k1 *%= m; + h1 ^= k1; + h1 *%= m; + h1 ^= h1 >> 47; + h1 *%= m; + h1 ^= h1 >> 47; + return h1; + } +}; + +pub const Murmur3_32 = struct { + const Self = @This(); + + fn rotl32(x: u32, comptime r: u32) u32 { + return (x << r) | (x >> (32 - r)); + } + + pub fn hash(str: []const u8) u32 { + return @inlineCall(Self.hashWithSeed, str, default_seed); + } + + pub fn hashWithSeed(str: []const u8, seed: u32) u32 { + const c1: u32 = 0xcc9e2d51; + const c2: u32 = 0x1b873593; + const len = @intCast(u32, str.len); + var h1: u32 = seed; + for (@ptrCast([*]allowzero align(1) const u32, str.ptr)[0..(len >> 2)]) |v| { + var k1: u32 = v; + if (builtin.endian == builtin.Endian.Big) + k1 = @byteSwap(u32, k1); + k1 *%= c1; + k1 = rotl32(k1, 15); + k1 *%= c2; + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 *%= 5; + h1 +%= 0xe6546b64; + } + { + var k1: u32 = 0; + const offset = len & 0xfffffffc; + const rest = len & 3; + if (rest == 3) { + k1 ^= @intCast(u32, str[offset + 2]) << 16; + } + if (rest >= 2) { + k1 ^= @intCast(u32, str[offset + 1]) << 8; + } + if (rest >= 1) { + k1 ^= @intCast(u32, str[offset + 0]); + k1 *%= c1; + k1 = rotl32(k1, 15); + k1 *%= c2; + h1 ^= k1; + } + } + h1 ^= len; + h1 ^= h1 >> 16; + h1 *%= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *%= 0xc2b2ae35; + h1 ^= h1 >> 16; + return h1; + } + + pub fn hashUint32(v: u32) u32 { + return @inlineCall(Self.hashUint32WithSeed, v, default_seed); + } + + pub fn hashUint32WithSeed(v: u32, seed: u32) u32 { + const c1: u32 = 0xcc9e2d51; + const c2: u32 = 0x1b873593; + const len: u32 = 4; + var h1: u32 = seed; + var k1: u32 = undefined; + k1 = v *% c1; + k1 = rotl32(k1, 15); + k1 *%= c2; + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 *%= 5; + h1 +%= 0xe6546b64; + h1 ^= len; + h1 ^= h1 >> 16; + h1 *%= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *%= 0xc2b2ae35; + h1 ^= h1 >> 16; + return h1; + } + + pub fn hashUint64(v: u64) u32 { + return @inlineCall(Self.hashUint64WithSeed, v, default_seed); + } + + pub fn hashUint64WithSeed(v: u64, seed: u32) u32 { + const c1: u32 = 0xcc9e2d51; + const c2: u32 = 0x1b873593; + const len: u32 = 8; + var h1: u32 = seed; + var k1: u32 = undefined; + k1 = @intCast(u32, v) *% c1; + k1 = rotl32(k1, 15); + k1 *%= c2; + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 *%= 5; + h1 +%= 0xe6546b64; + k1 = @intCast(u32, v >> 32) *% c1; + k1 = rotl32(k1, 15); + k1 *%= c2; + h1 ^= k1; + h1 = rotl32(h1, 13); + h1 *%= 5; + h1 +%= 0xe6546b64; + h1 ^= len; + h1 ^= h1 >> 16; + h1 *%= 0x85ebca6b; + h1 ^= h1 >> 13; + h1 *%= 0xc2b2ae35; + h1 ^= h1 >> 16; + return h1; + } +}; + +fn SMHasherTest(comptime hash_fn: var, comptime hashbits: u32) u32 { + const hashbytes = hashbits / 8; + var key: [256]u8 = undefined; + var hashes: [hashbytes * 256]u8 = undefined; + var final: [hashbytes]u8 = undefined; + + @memset(@ptrCast([*]u8, &key[0]), 0, @sizeOf(@typeOf(key))); + @memset(@ptrCast([*]u8, &hashes[0]), 0, @sizeOf(@typeOf(hashes))); + @memset(@ptrCast([*]u8, &final[0]), 0, @sizeOf(@typeOf(final))); + + var i: u32 = 0; + while (i < 256) : (i += 1) { + key[i] = @intCast(u8, i); + + var h = hash_fn(key[0..i], 256 - i); + if (builtin.endian == builtin.Endian.Big) + h = @byteSwap(@typeOf(h), h); + @memcpy(@ptrCast([*]u8, &hashes[i * hashbytes]), @ptrCast([*]u8, &h), hashbytes); + } + + return @intCast(u32, hash_fn(hashes, 0) & 0xffffffff); +} + +test "murmur2_32" { + std.testing.expectEqual(SMHasherTest(Murmur2_32.hashWithSeed, 32), 0x27864C1E); +} + +test "murmur2_64" { + std.testing.expectEqual(SMHasherTest(Murmur2_64.hashWithSeed, 64), 0x1F0D3804); +} + +test "murmur3_32" { + std.testing.expectEqual(SMHasherTest(Murmur3_32.hashWithSeed, 32), 0xB0F57EE3); +} From 0619b525b9100e95ab293792bcaa3c565e08a979 Mon Sep 17 00:00:00 2001 From: andersfr Date: Sat, 6 Jul 2019 22:41:38 +0200 Subject: [PATCH 2/4] Better testing and bugfix in hashUint* functions --- std/hash/murmur.zig | 51 ++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/std/hash/murmur.zig b/std/hash/murmur.zig index ca2f23aa98..52d65b0564 100644 --- a/std/hash/murmur.zig +++ b/std/hash/murmur.zig @@ -1,5 +1,6 @@ const std = @import("std"); const builtin = @import("builtin"); +const testing = std.testing; const default_seed: u32 = 0xc70f6907; @@ -68,10 +69,10 @@ pub const Murmur2_32 = struct { pub fn hashUint64WithSeed(v: u64, seed: u32) u32 { const m: u32 = 0x5bd1e995; - const len: u32 = 4; + const len: u32 = 8; var h1: u32 = seed ^ len; var k1: u32 = undefined; - k1 = @intCast(u32, v) *% m; + k1 = @intCast(u32, v & 0xffffffff) *% m; k1 ^= k1 >> 24; k1 *%= m; h1 *%= m; @@ -133,10 +134,7 @@ pub const Murmur2_64 = struct { const m: u64 = 0xc6a4a7935bd1e995; const len: u64 = 4; var h1: u64 = seed ^ (len *% m); - var k1: u64 = undefined; - k1 = v *% m; - k1 ^= k1 >> 47; - k1 *%= m; + var k1: u64 = v; h1 ^= k1; h1 *%= m; h1 ^= h1 >> 47; @@ -154,12 +152,7 @@ pub const Murmur2_64 = struct { const len: u64 = 8; var h1: u64 = seed ^ (len *% m); var k1: u64 = undefined; - k1 = @intCast(u32, v) *% m; - k1 ^= k1 >> 47; - k1 *%= m; - h1 ^= k1; - h1 *%= m; - k1 = @intCast(u32, v >> 32) *% m; + k1 = v *% m; k1 ^= k1 >> 47; k1 *%= m; h1 ^= k1; @@ -262,7 +255,7 @@ pub const Murmur3_32 = struct { const len: u32 = 8; var h1: u32 = seed; var k1: u32 = undefined; - k1 = @intCast(u32, v) *% c1; + k1 = @intCast(u32, v & 0xffffffff) *% c1; k1 = rotl32(k1, 15); k1 *%= c2; h1 ^= k1; @@ -310,13 +303,43 @@ fn SMHasherTest(comptime hash_fn: var, comptime hashbits: u32) u32 { } test "murmur2_32" { - std.testing.expectEqual(SMHasherTest(Murmur2_32.hashWithSeed, 32), 0x27864C1E); + testing.expectEqual(SMHasherTest(Murmur2_32.hashWithSeed, 32), 0x27864C1E); + var v0: u32 = 0x12345678; + var v1: u64 = 0x1234567812345678; + var v0le: u32 = v0; + var v1le: u64 = v1; + if (builtin.endian == builtin.Endian.Big) { + v0le = @byteSwap(u32, v0le); + v1le = @byteSwap(u64, v1le); + } + testing.expectEqual(Murmur2_32.hash(@ptrCast([*]u8, &v0le)[0..4]), Murmur2_32.hashUint32(v0)); + testing.expectEqual(Murmur2_32.hash(@ptrCast([*]u8, &v1le)[0..8]), Murmur2_32.hashUint64(v1)); } test "murmur2_64" { std.testing.expectEqual(SMHasherTest(Murmur2_64.hashWithSeed, 64), 0x1F0D3804); + var v0: u32 = 0x12345678; + var v1: u64 = 0x1234567812345678; + var v0le: u32 = v0; + var v1le: u64 = v1; + if (builtin.endian == builtin.Endian.Big) { + v0le = @byteSwap(u32, v0le); + v1le = @byteSwap(u64, v1le); + } + testing.expectEqual(Murmur2_64.hash(@ptrCast([*]u8, &v0le)[0..4]), Murmur2_64.hashUint32(v0)); + testing.expectEqual(Murmur2_64.hash(@ptrCast([*]u8, &v1le)[0..8]), Murmur2_64.hashUint64(v1)); } test "murmur3_32" { std.testing.expectEqual(SMHasherTest(Murmur3_32.hashWithSeed, 32), 0xB0F57EE3); + var v0: u32 = 0x12345678; + var v1: u64 = 0x1234567812345678; + var v0le: u32 = v0; + var v1le: u64 = v1; + if (builtin.endian == builtin.Endian.Big) { + v0le = @byteSwap(u32, v0le); + v1le = @byteSwap(u64, v1le); + } + testing.expectEqual(Murmur3_32.hash(@ptrCast([*]u8, &v0le)[0..4]), Murmur3_32.hashUint32(v0)); + testing.expectEqual(Murmur3_32.hash(@ptrCast([*]u8, &v1le)[0..8]), Murmur3_32.hashUint64(v1)); } From 9684c99dd37ff987114824b2af17b6160b908b79 Mon Sep 17 00:00:00 2001 From: andersfr Date: Sat, 6 Jul 2019 23:44:43 +0200 Subject: [PATCH 3/4] Proper use of @truncate instead of @intCast --- std/hash/cityhash.zig | 20 ++++++++++---------- std/hash/murmur.zig | 18 +++++++++--------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/std/hash/cityhash.zig b/std/hash/cityhash.zig index 3599d95ef9..1562676271 100644 --- a/std/hash/cityhash.zig +++ b/std/hash/cityhash.zig @@ -45,7 +45,7 @@ pub const CityHash32 = struct { } fn hash32Len0To4(str: []const u8) u32 { - const len: u32 = @intCast(u32, str.len); + const len: u32 = @truncate(u32, str.len); var b: u32 = 0; var c: u32 = 9; for (str) |v| { @@ -56,7 +56,7 @@ pub const CityHash32 = struct { } fn hash32Len5To12(str: []const u8) u32 { - var a: u32 = @intCast(u32, str.len); + var a: u32 = @truncate(u32, str.len); var b: u32 = a *% 5; var c: u32 = 9; const d: u32 = b; @@ -69,7 +69,7 @@ pub const CityHash32 = struct { } fn hash32Len13To24(str: []const u8) u32 { - const len: u32 = @intCast(u32, str.len); + const len: u32 = @truncate(u32, str.len); const a: u32 = fetch32(str.ptr + (str.len >> 1) - 4); const b: u32 = fetch32(str.ptr + 4); const c: u32 = fetch32(str.ptr + str.len - 8); @@ -91,7 +91,7 @@ pub const CityHash32 = struct { } } - const len: u32 = @intCast(u32, str.len); + const len: u32 = @truncate(u32, str.len); var h: u32 = len; var g: u32 = c1 *% len; var f: u32 = g; @@ -214,7 +214,7 @@ pub const CityHash64 = struct { } fn hashLen0To16(str: []const u8) u64 { - const len: u64 = @intCast(u64, str.len); + const len: u64 = @truncate(u64, str.len); if (len >= 8) { const mul: u64 = k2 +% len *% 2; const a: u64 = fetch64(str.ptr) + k2; @@ -233,14 +233,14 @@ pub const CityHash64 = struct { const b: u8 = str[str.len >> 1]; const c: u8 = str[str.len - 1]; const y: u32 = @intCast(u32, a) +% (@intCast(u32, b) << 8); - const z: u32 = @intCast(u32, str.len) +% (@intCast(u32, c) << 2); + const z: u32 = @truncate(u32, str.len) +% (@intCast(u32, c) << 2); return shiftmix(@intCast(u64, y) *% k2 ^ @intCast(u64, z) *% k0) *% k2; } return k2; } fn hashLen17To32(str: []const u8) u64 { - const len: u64 = @intCast(u64, str.len); + const len: u64 = @truncate(u64, str.len); const mul: u64 = k2 +% len *% 2; const a: u64 = fetch64(str.ptr) *% k1; const b: u64 = fetch64(str.ptr + 8); @@ -251,7 +251,7 @@ pub const CityHash64 = struct { } fn hashLen33To64(str: []const u8) u64 { - const len: u64 = @intCast(u64, str.len); + const len: u64 = @truncate(u64, str.len); const mul: u64 = k2 +% len *% 2; const a: u64 = fetch64(str.ptr) *% k2; const b: u64 = fetch64(str.ptr + 8); @@ -305,7 +305,7 @@ pub const CityHash64 = struct { return hashLen33To64(str); } - var len: u64 = @intCast(u64, str.len); + var len: u64 = @truncate(u64, str.len); var x: u64 = fetch64(str.ptr + str.len - 40); var y: u64 = fetch64(str.ptr + str.len - 16) +% fetch64(str.ptr + str.len - 56); @@ -367,7 +367,7 @@ fn SMHasherTest(comptime hash_fn: var, comptime hashbits: u32) u32 { @memcpy(@ptrCast([*]u8, &hashes[i * hashbytes]), @ptrCast([*]u8, &h), hashbytes); } - return @intCast(u32, hash_fn(hashes, 0) & 0xffffffff); + return @truncate(u32, hash_fn(hashes, 0)); } fn CityHash32hashIgnoreSeed(str: []const u8, seed: u32) u32 { diff --git a/std/hash/murmur.zig b/std/hash/murmur.zig index 52d65b0564..79d05bf462 100644 --- a/std/hash/murmur.zig +++ b/std/hash/murmur.zig @@ -13,7 +13,7 @@ pub const Murmur2_32 = struct { pub fn hashWithSeed(str: []const u8, seed: u32) u32 { const m: u32 = 0x5bd1e995; - const len = @intCast(u32, str.len); + const len = @truncate(u32, str.len); var h1: u32 = seed ^ len; for (@ptrCast([*]allowzero align(1) const u32, str.ptr)[0..(len >> 2)]) |v| { var k1: u32 = v; @@ -72,12 +72,12 @@ pub const Murmur2_32 = struct { const len: u32 = 8; var h1: u32 = seed ^ len; var k1: u32 = undefined; - k1 = @intCast(u32, v & 0xffffffff) *% m; + k1 = @truncate(u32, v) *% m; k1 ^= k1 >> 24; k1 *%= m; h1 *%= m; h1 ^= k1; - k1 = @intCast(u32, v >> 32) *% m; + k1 = @truncate(u32, v >> 32) *% m; k1 ^= k1 >> 24; k1 *%= m; h1 *%= m; @@ -98,7 +98,7 @@ pub const Murmur2_64 = struct { pub fn hashWithSeed(str: []const u8, seed: u64) u64 { const m: u64 = 0xc6a4a7935bd1e995; - const len = @intCast(u64, str.len); + const len = @truncate(u64, str.len); var h1: u64 = seed ^ (len *% m); for (@ptrCast([*]allowzero align(1) const u64, str.ptr)[0..(len >> 3)]) |v| { var k1: u64 = v; @@ -178,7 +178,7 @@ pub const Murmur3_32 = struct { pub fn hashWithSeed(str: []const u8, seed: u32) u32 { const c1: u32 = 0xcc9e2d51; const c2: u32 = 0x1b873593; - const len = @intCast(u32, str.len); + const len = @truncate(u32, str.len); var h1: u32 = seed; for (@ptrCast([*]allowzero align(1) const u32, str.ptr)[0..(len >> 2)]) |v| { var k1: u32 = v; @@ -255,14 +255,14 @@ pub const Murmur3_32 = struct { const len: u32 = 8; var h1: u32 = seed; var k1: u32 = undefined; - k1 = @intCast(u32, v & 0xffffffff) *% c1; + k1 = @truncate(u32, v) *% c1; k1 = rotl32(k1, 15); k1 *%= c2; h1 ^= k1; h1 = rotl32(h1, 13); h1 *%= 5; h1 +%= 0xe6546b64; - k1 = @intCast(u32, v >> 32) *% c1; + k1 = @truncate(u32, v >> 32) *% c1; k1 = rotl32(k1, 15); k1 *%= c2; h1 ^= k1; @@ -291,7 +291,7 @@ fn SMHasherTest(comptime hash_fn: var, comptime hashbits: u32) u32 { var i: u32 = 0; while (i < 256) : (i += 1) { - key[i] = @intCast(u8, i); + key[i] = @truncate(u8, i); var h = hash_fn(key[0..i], 256 - i); if (builtin.endian == builtin.Endian.Big) @@ -299,7 +299,7 @@ fn SMHasherTest(comptime hash_fn: var, comptime hashbits: u32) u32 { @memcpy(@ptrCast([*]u8, &hashes[i * hashbytes]), @ptrCast([*]u8, &h), hashbytes); } - return @intCast(u32, hash_fn(hashes, 0) & 0xffffffff); + return @truncate(u32, hash_fn(hashes, 0)); } test "murmur2_32" { From 69129c2e932bd51dff02578e67ea79799380a3cc Mon Sep 17 00:00:00 2001 From: andersfr Date: Sat, 13 Jul 2019 13:57:32 +0200 Subject: [PATCH 4/4] Intentional wraparound used + --- CMakeLists.txt | 2 ++ std/hash/cityhash.zig | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 52552cc3ee..d0fca21fbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -523,6 +523,8 @@ set(ZIG_STD_FILES "hash/crc.zig" "hash/fnv.zig" "hash/siphash.zig" + "hash/murmur.zig" + "hash/cityhash.zig" "hash_map.zig" "heap.zig" "heap/logging_allocator.zig" diff --git a/std/hash/cityhash.zig b/std/hash/cityhash.zig index 1562676271..a4d7fc8218 100644 --- a/std/hash/cityhash.zig +++ b/std/hash/cityhash.zig @@ -217,7 +217,7 @@ pub const CityHash64 = struct { const len: u64 = @truncate(u64, str.len); if (len >= 8) { const mul: u64 = k2 +% len *% 2; - const a: u64 = fetch64(str.ptr) + k2; + const a: u64 = fetch64(str.ptr) +% k2; const b: u64 = fetch64(str.ptr + str.len - 8); const c: u64 = rotr64(b, 37) *% mul +% a; const d: u64 = (rotr64(a, 25) +% b) *% mul;