From aee6f7d7eed535597e4b506132e211b9dce311dd Mon Sep 17 00:00:00 2001 From: Francesco Alemanno <50984334+francescoalemanno@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:14:12 +0100 Subject: [PATCH 1/5] std.hash: improve simple hashing of unsigned integers Before, the default bit mixer was very biased, and after a lot of searching it turns out that selecting a better solution is hard. I wrote a custom statistical analysis tailored for bit mixers in order to select the best one at each size (u64/u32/u16), compared a lot of mixers, and packaged the best ones in this commit. --- lib/std/hash.zig | 73 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/lib/std/hash.zig b/lib/std/hash.zig index 061744ac97..68adb1afdd 100644 --- a/lib/std/hash.zig +++ b/lib/std/hash.zig @@ -37,20 +37,79 @@ pub const XxHash3 = xxhash.XxHash3; pub const XxHash64 = xxhash.XxHash64; pub const XxHash32 = xxhash.XxHash32; +/// Deprecated: use std.hash.int(comptime T, input T) T where T is an unsigned integer type. /// This is handy if you have a u32 and want a u32 and don't want to take a /// detour through many layers of abstraction elsewhere in the std.hash /// namespace. -/// Copied from https://nullprogram.com/blog/2018/07/31/ pub fn uint32(input: u32) u32 { - var x: u32 = input; - x ^= x >> 16; - x *%= 0x7feb352d; - x ^= x >> 15; - x *%= 0x846ca68b; - x ^= x >> 16; + return int(u32, input); +} + +/// Applies a bit-mangling transformation to an unsigned integer type `T`. +/// Optimized per type: for `u16` and `u32`, Skeeto's xorshift-multiply; for `u64`, Maiga's mx3. +/// Falls back on an avalanche pattern for other unsigned types, ensuring high entropy. +/// Only unsigned types are accepted; signed types will raise a compile-time error. 
+pub fn int(comptime T: type, input: T) T { + const tInfo = @typeInfo(T).int; + if (tInfo.signedness != .unsigned) @compileError("type has to be unsigned integer"); + var x = input; + switch (T) { + u16 => { + //https://github.com/skeeto/hash-prospector + // 3-round xorshift-multiply (-Xn3) + // bias = 0.0045976709018820602 + x = (x ^ (x >> 7)) *% 0x2993; + x = (x ^ (x >> 5)) *% 0xe877; + x = (x ^ (x >> 9)) *% 0x0235; + x = x ^ (x >> 10); + }, + u32 => { + // https://github.com/skeeto/hash-prospector + x = (x ^ (x >> 17)) *% 0xed5ad4bb; + x = (x ^ (x >> 11)) *% 0xac4c1b51; + x = (x ^ (x >> 15)) *% 0x31848bab; + x = x ^ (x >> 14); + }, + u64 => { + // https://github.com/jonmaiga/mx3 + // https://github.com/jonmaiga/mx3/blob/48924ee743d724aea2cafd2b4249ef8df57fa8b9/mx3.h#L17 + const C = 0xbea225f9eb34556d; + x = (x ^ (x >> 32)) *% C; + x = (x ^ (x >> 29)) *% C; + x = (x ^ (x >> 32)) *% C; + x = x ^ (x >> 29); + }, + else => { + // this construction provides robust avalanche properties, but it is not optimal for any given size. 
+ const Tsize = @bitSizeOf(T); + if (Tsize < 4) @compileError("not implemented."); + const hsize = Tsize >> 1; + const C = comptime blk: { + const max = (1 << Tsize) - 1; + var mul = 1; + while (mul * 3 < max) mul *= 3; + break :blk ((mul ^ (mul >> hsize)) | 1); + }; + inline for (0..2) |_| { + x = (x ^ (x >> hsize + 1)) *% C; + x = (x ^ (x >> hsize - 1)) *% C; + } + x ^= (x >> hsize); + }, + } return x; } +test "bit manglers" { + const expect = @import("std").testing.expect; + try expect(int(u4, 1) == 0xC); + try expect(int(u8, 1) == 0x4F); + try expect(int(u16, 1) == 0x2880); + try expect(int(u32, 1) == 0x42741D6); + try expect(int(u64, 1) == 0x71894DE00D9981F); + try expect(int(u128, 1) == 0x50BC2BB18910C3DE0BAA2CE0D0C5B83E); +} + test { _ = adler; _ = auto_hash; From 5ad44c14b0c1f8e55856a8f048a570dad564c231 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 1 Nov 2024 11:57:39 -0700 Subject: [PATCH 2/5] std.hash.int: use anytype instead of explicit type parameter also * allow signed ints, simply bitcast them to unsigned * handle odd bit sizes by upcasting and then truncating * naming conventions * remove redundant code * better use of testing API --- lib/std/hash.zig | 75 ++++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/lib/std/hash.zig b/lib/std/hash.zig index 68adb1afdd..b768a85861 100644 --- a/lib/std/hash.zig +++ b/lib/std/hash.zig @@ -37,25 +37,31 @@ pub const XxHash3 = xxhash.XxHash3; pub const XxHash64 = xxhash.XxHash64; pub const XxHash32 = xxhash.XxHash32; -/// Deprecated: use std.hash.int(comptime T, input T) T where T is an unsigned integer type. -/// This is handy if you have a u32 and want a u32 and don't want to take a -/// detour through many layers of abstraction elsewhere in the std.hash -/// namespace. +/// Deprecated in favor of `int`. 
pub fn uint32(input: u32) u32 { - return int(u32, input); + return int(input); } /// Applies a bit-mangling transformation to an unsigned integer type `T`. /// Optimized per type: for `u16` and `u32`, Skeeto's xorshift-multiply; for `u64`, Maiga's mx3. -/// Falls back on an avalanche pattern for other unsigned types, ensuring high entropy. -/// Only unsigned types are accepted; signed types will raise a compile-time error. -pub fn int(comptime T: type, input: T) T { - const tInfo = @typeInfo(T).int; - if (tInfo.signedness != .unsigned) @compileError("type has to be unsigned integer"); +/// Falls back on an avalanche pattern for other integer types, ensuring high entropy. +pub fn int(input: anytype) @TypeOf(input) { + const info = @typeInfo(@TypeOf(input)).int; + if (info.signedness == .signed) { + const Unsigned = @Type(.{ .int = .{ .signedness = .unsigned, .bits = info.bits } }); + const casted: Unsigned = @bitCast(input); + return @bitCast(int(casted)); + } else if (info.bits < 16) { + return @truncate(int(@as(u16, input))); + } else if (info.bits < 32) { + return @truncate(int(@as(u32, input))); + } else if (info.bits < 64) { + return @truncate(int(@as(u64, input))); + } var x = input; - switch (T) { - u16 => { - //https://github.com/skeeto/hash-prospector + switch (info.bits) { + 16 => { + // https://github.com/skeeto/hash-prospector // 3-round xorshift-multiply (-Xn3) // bias = 0.0045976709018820602 x = (x ^ (x >> 7)) *% 0x2993; @@ -63,36 +69,34 @@ pub fn int(comptime T: type, input: T) T { x = (x ^ (x >> 9)) *% 0x0235; x = x ^ (x >> 10); }, - u32 => { + 32 => { // https://github.com/skeeto/hash-prospector x = (x ^ (x >> 17)) *% 0xed5ad4bb; x = (x ^ (x >> 11)) *% 0xac4c1b51; x = (x ^ (x >> 15)) *% 0x31848bab; x = x ^ (x >> 14); }, - u64 => { + 64 => { // https://github.com/jonmaiga/mx3 // https://github.com/jonmaiga/mx3/blob/48924ee743d724aea2cafd2b4249ef8df57fa8b9/mx3.h#L17 - const C = 0xbea225f9eb34556d; - x = (x ^ (x >> 32)) *% C; - x = (x ^ (x >> 29)) *% 
C; - x = (x ^ (x >> 32)) *% C; + const c = 0xbea225f9eb34556d; + x = (x ^ (x >> 32)) *% c; + x = (x ^ (x >> 29)) *% c; + x = (x ^ (x >> 32)) *% c; x = x ^ (x >> 29); }, else => { - // this construction provides robust avalanche properties, but it is not optimal for any given size. - const Tsize = @bitSizeOf(T); - if (Tsize < 4) @compileError("not implemented."); - const hsize = Tsize >> 1; - const C = comptime blk: { - const max = (1 << Tsize) - 1; + // This construction provides robust avalanche properties, but it is not optimal for any given size. + const hsize = info.bits >> 1; + const c = comptime blk: { + const max = (1 << info.bits) - 1; var mul = 1; while (mul * 3 < max) mul *= 3; break :blk ((mul ^ (mul >> hsize)) | 1); }; inline for (0..2) |_| { - x = (x ^ (x >> hsize + 1)) *% C; - x = (x ^ (x >> hsize - 1)) *% C; + x = (x ^ (x >> hsize + 1)) *% c; + x = (x ^ (x >> hsize - 1)) *% c; } x ^= (x >> hsize); }, @@ -100,14 +104,15 @@ pub fn int(comptime T: type, input: T) T { return x; } -test "bit manglers" { - const expect = @import("std").testing.expect; - try expect(int(u4, 1) == 0xC); - try expect(int(u8, 1) == 0x4F); - try expect(int(u16, 1) == 0x2880); - try expect(int(u32, 1) == 0x42741D6); - try expect(int(u64, 1) == 0x71894DE00D9981F); - try expect(int(u128, 1) == 0x50BC2BB18910C3DE0BAA2CE0D0C5B83E); +test int { + const expectEqual = @import("std").testing.expectEqual; + try expectEqual(0xC, int(@as(u4, 1))); + try expectEqual(0x4F, int(@as(u8, 1))); + try expectEqual(0x4F, int(@as(i8, 1))); + try expectEqual(0x2880, int(@as(u16, 1))); + try expectEqual(0x42741D6, int(@as(u32, 1))); + try expectEqual(0x71894DE00D9981F, int(@as(u64, 1))); + try expectEqual(0x50BC2BB18910C3DE0BAA2CE0D0C5B83E, int(@as(u128, 1))); } test { From d09fd249c090c68f6bba9731572e52f51afa7536 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 1 Nov 2024 12:03:26 -0700 Subject: [PATCH 3/5] std.hash.int: restore previous behavior In the parent commit, I handled odd bit sizes by 
upcasting and truncating. However, it seems the else branch is intended to handle those cases instead, so this commit reverts that behavior. --- lib/std/hash.zig | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/std/hash.zig b/lib/std/hash.zig index b768a85861..7e0afb5d9b 100644 --- a/lib/std/hash.zig +++ b/lib/std/hash.zig @@ -51,12 +51,8 @@ pub fn int(input: anytype) @TypeOf(input) { const Unsigned = @Type(.{ .int = .{ .signedness = .unsigned, .bits = info.bits } }); const casted: Unsigned = @bitCast(input); return @bitCast(int(casted)); - } else if (info.bits < 16) { - return @truncate(int(@as(u16, input))); - } else if (info.bits < 32) { - return @truncate(int(@as(u32, input))); - } else if (info.bits < 64) { - return @truncate(int(@as(u64, input))); + } else if (info.bits < 4) { + return @truncate(int(@as(u4, input))); } var x = input; switch (info.bits) { From ae6c24b49053853686699913297764361a6d6303 Mon Sep 17 00:00:00 2001 From: Francesco Alemanno <50984334+francescoalemanno@users.noreply.github.com> Date: Sat, 2 Nov 2024 00:11:44 +0100 Subject: [PATCH 4/5] std.hash.int: better handle odd bit sizes Uses the non-rational solution of a quadratic; I made it work up to 256 bits and added Mathematica code in case anyone wants to verify the magic constant. Integers with sizes 3...15 were affected by fatal bias, so it is best to make them pass through the generic solution. Thanks to RetroDev256 & Andrew for their feedback. --- lib/std/hash.zig | 129 ++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 62 deletions(-) diff --git a/lib/std/hash.zig b/lib/std/hash.zig index 7e0afb5d9b..63bf972bf5 100644 --- a/lib/std/hash.zig +++ b/lib/std/hash.zig @@ -37,78 +37,83 @@ pub const XxHash3 = xxhash.XxHash3; pub const XxHash64 = xxhash.XxHash64; pub const XxHash32 = xxhash.XxHash32; -/// Deprecated in favor of `int`. 
-pub fn uint32(input: u32) u32 { - return int(input); +/// Easy & fast hash function for integer types +pub fn int(input: anytype) @TypeOf(input) { + // This function is only intended for integer types + const info = @typeInfo(@TypeOf(input)).int; + const bits = info.bits; + // Convert input to unsigned integer (easier to deal with) + const Uint = @Type(.{ .int = .{ .bits = bits, .signedness = .unsigned } }); + const u_input: Uint = @bitCast(input); + if (bits > 256) @compileError("bit widths > 256 are unsupported, use std.hash.autoHash functionality."); + // For bit widths that don't have a dedicated function, use a heuristic + // construction with a multiplier suited to diffusion - + // a mod 2^bits where a^2 - 46 * a + 1 = 0 mod 2^(bits + 4), + // on Mathematica: bits = 256; BaseForm[Solve[1 - 46 a + a^2 == 0, a, Modulus -> 2^(bits + 4)][[-1]][[1]][[2]], 16] + const mult: Uint = @truncate(0xfac2e27ed2036860a062b5f264d80a512b00aa459b448bf1eca24d41c96f59e5b); + // The bit width of the input integer determines how to hash it + const output = switch (bits) { + 0...2 => u_input *% mult, + 16 => uint16(u_input), + 32 => uint32(u_input), + 64 => uint64(u_input), + else => blk: { + var x: Uint = u_input; + inline for (0..4) |_| { + x ^= x >> (bits / 2); + x *%= mult; + } + break :blk x; + }, + }; + return @bitCast(output); } -/// Applies a bit-mangling transformation to an unsigned integer type `T`. -/// Optimized per type: for `u16` and `u32`, Skeeto's xorshift-multiply; for `u64`, Maiga's mx3. -/// Falls back on an avalanche pattern for other integer types, ensuring high entropy. 
-pub fn int(input: anytype) @TypeOf(input) { - const info = @typeInfo(@TypeOf(input)).int; - if (info.signedness == .signed) { - const Unsigned = @Type(.{ .int = .{ .signedness = .unsigned, .bits = info.bits } }); - const casted: Unsigned = @bitCast(input); - return @bitCast(int(casted)); - } else if (info.bits < 4) { - return @truncate(int(@as(u4, input))); - } - var x = input; - switch (info.bits) { - 16 => { - // https://github.com/skeeto/hash-prospector - // 3-round xorshift-multiply (-Xn3) - // bias = 0.0045976709018820602 - x = (x ^ (x >> 7)) *% 0x2993; - x = (x ^ (x >> 5)) *% 0xe877; - x = (x ^ (x >> 9)) *% 0x0235; - x = x ^ (x >> 10); - }, - 32 => { - // https://github.com/skeeto/hash-prospector - x = (x ^ (x >> 17)) *% 0xed5ad4bb; - x = (x ^ (x >> 11)) *% 0xac4c1b51; - x = (x ^ (x >> 15)) *% 0x31848bab; - x = x ^ (x >> 14); - }, - 64 => { - // https://github.com/jonmaiga/mx3 - // https://github.com/jonmaiga/mx3/blob/48924ee743d724aea2cafd2b4249ef8df57fa8b9/mx3.h#L17 - const c = 0xbea225f9eb34556d; - x = (x ^ (x >> 32)) *% c; - x = (x ^ (x >> 29)) *% c; - x = (x ^ (x >> 32)) *% c; - x = x ^ (x >> 29); - }, - else => { - // This construction provides robust avalanche properties, but it is not optimal for any given size. 
- const hsize = info.bits >> 1; - const c = comptime blk: { - const max = (1 << info.bits) - 1; - var mul = 1; - while (mul * 3 < max) mul *= 3; - break :blk ((mul ^ (mul >> hsize)) | 1); - }; - inline for (0..2) |_| { - x = (x ^ (x >> hsize + 1)) *% c; - x = (x ^ (x >> hsize - 1)) *% c; - } - x ^= (x >> hsize); - }, - } +/// Source: https://github.com/skeeto/hash-prospector +fn uint16(input: u16) u16 { + var x: u16 = input; + x = (x ^ (x >> 7)) *% 0x2993; + x = (x ^ (x >> 5)) *% 0xe877; + x = (x ^ (x >> 9)) *% 0x0235; + x = x ^ (x >> 10); + return x; +} + +/// DEPRECATED: use std.hash.int() +/// Source: https://github.com/skeeto/hash-prospector +pub fn uint32(input: u32) u32 { + var x: u32 = input; + x = (x ^ (x >> 17)) *% 0xed5ad4bb; + x = (x ^ (x >> 11)) *% 0xac4c1b51; + x = (x ^ (x >> 15)) *% 0x31848bab; + x = x ^ (x >> 14); + return x; +} + +/// Source: https://github.com/jonmaiga/mx3 +fn uint64(input: u64) u64 { + var x: u64 = input; + const c = 0xbea225f9eb34556d; + x = (x ^ (x >> 32)) *% c; + x = (x ^ (x >> 29)) *% c; + x = (x ^ (x >> 32)) *% c; + x = x ^ (x >> 29); return x; } test int { const expectEqual = @import("std").testing.expectEqual; - try expectEqual(0xC, int(@as(u4, 1))); - try expectEqual(0x4F, int(@as(u8, 1))); - try expectEqual(0x4F, int(@as(i8, 1))); + try expectEqual(0x1, int(@as(u1, 1))); + try expectEqual(0x3, int(@as(u2, 1))); + try expectEqual(0x4, int(@as(u3, 1))); + try expectEqual(0xD6, int(@as(u8, 1))); try expectEqual(0x2880, int(@as(u16, 1))); + try expectEqual(0x2880, int(@as(i16, 1))); + try expectEqual(0x838380, int(@as(u24, 1))); try expectEqual(0x42741D6, int(@as(u32, 1))); + try expectEqual(0x42741D6, int(@as(i32, 1))); try expectEqual(0x71894DE00D9981F, int(@as(u64, 1))); - try expectEqual(0x50BC2BB18910C3DE0BAA2CE0D0C5B83E, int(@as(u128, 1))); + try expectEqual(0x71894DE00D9981F, int(@as(i64, 1))); } test { From ca67f80b6e9599246441e0f3b016fabc5c3315aa Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Sun, 24 Nov 2024 
15:30:52 -0800 Subject: [PATCH 5/5] std.hash.int: avoid words like "easy" and "fast" in doc comments --- lib/std/hash.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/hash.zig b/lib/std/hash.zig index 63bf972bf5..c4af9011b8 100644 --- a/lib/std/hash.zig +++ b/lib/std/hash.zig @@ -37,7 +37,7 @@ pub const XxHash3 = xxhash.XxHash3; pub const XxHash64 = xxhash.XxHash64; pub const XxHash32 = xxhash.XxHash32; -/// Easy & fast hash function for integer types +/// Integer-to-integer hashing for bit widths <= 256. pub fn int(input: anytype) @TypeOf(input) { // This function is only intended for integer types const info = @typeInfo(@TypeOf(input)).int;