cache_hash: hash function change

This makes the `cache_hash` hash function easier to replace.

BLAKE3 would be a natural fit for hashing large files, but:
- second preimage resistance is not necessary for the cache_hash use cases
- our BLAKE3 implementation is currently very slow

Switch to SipHash128, which gives us an immediate speed boost.
This commit is contained in:
Frank Denis 2020-08-21 15:08:15 +02:00 committed by Andrew Kelley
parent 0fa3cfdb4a
commit f540dc1b7e

View File

@ -4,7 +4,8 @@
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const std = @import("std.zig");
const Blake3 = std.crypto.hash.Blake3;
const crypto = std.crypto;
const Hasher = crypto.auth.siphash.SipHash128(1, 3); // provides enough collision resistance for the CacheHash use cases, while being one of our fastest options right now
const fs = std.fs;
const base64 = std.base64;
const ArrayList = std.ArrayList;
@ -16,9 +17,8 @@ const Allocator = std.mem.Allocator;
const base64_encoder = fs.base64_encoder;
const base64_decoder = fs.base64_decoder;
/// This is 70 more bits than UUIDs. For an analysis of probability of collisions, see:
/// https://en.wikipedia.org/wiki/Universally_unique_identifier#Collisions
const BIN_DIGEST_LEN = 24;
/// This is 128 bits - Even with 2^54 cache entries, the probably of a collision would be under 10^-6
const BIN_DIGEST_LEN = 16;
const BASE64_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
const MANIFEST_FILE_SIZE_MAX = 50 * 1024 * 1024;
@ -43,9 +43,13 @@ pub const File = struct {
}
};
/// CacheHash manages project-local `zig-cache` directories.
/// This is not a general-purpose cache.
/// It was designed to be fast and simple, not to withstand attacks using specially-crafted input.
pub const CacheHash = struct {
allocator: *Allocator,
blake3: Blake3,
hasher_init: Hasher, // initial state, that can be copied
hasher: Hasher, // current state for incremental hashing
manifest_dir: fs.Dir,
manifest_file: ?fs.File,
manifest_dirty: bool,
@ -54,9 +58,11 @@ pub const CacheHash = struct {
/// Be sure to call release after successful initialization.
pub fn init(allocator: *Allocator, dir: fs.Dir, manifest_dir_path: []const u8) !CacheHash {
const hasher_init = Hasher.init(&[_]u8{0} ** Hasher.minimum_key_length);
return CacheHash{
.allocator = allocator,
.blake3 = Blake3.init(.{}),
.hasher_init = hasher_init,
.hasher = hasher_init,
.manifest_dir = try dir.makeOpenPath(manifest_dir_path, .{}),
.manifest_file = null,
.manifest_dirty = false,
@ -69,8 +75,8 @@ pub const CacheHash = struct {
pub fn addSlice(self: *CacheHash, val: []const u8) void {
assert(self.manifest_file == null);
self.blake3.update(val);
self.blake3.update(&[_]u8{0});
self.hasher.update(val);
self.hasher.update(&[_]u8{0});
}
/// Convert the input value into bytes and record it as a dependency of the
@ -133,12 +139,12 @@ pub const CacheHash = struct {
assert(self.manifest_file == null);
var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
self.blake3.final(&bin_digest);
self.hasher.final(&bin_digest);
base64_encoder.encode(self.b64_digest[0..], &bin_digest);
self.blake3 = Blake3.init(.{});
self.blake3.update(&bin_digest);
self.hasher = self.hasher_init;
self.hasher.update(&bin_digest);
const manifest_file_path = try fmt.allocPrint(self.allocator, "{}.txt", .{self.b64_digest});
defer self.allocator.free(manifest_file_path);
@ -238,7 +244,7 @@ pub const CacheHash = struct {
}
var actual_digest: [BIN_DIGEST_LEN]u8 = undefined;
try hashFile(this_file, &actual_digest);
try hashFile(this_file, &actual_digest, self.hasher_init);
if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
cache_hash_file.bin_digest = actual_digest;
@ -248,7 +254,7 @@ pub const CacheHash = struct {
}
if (!any_file_changed) {
self.blake3.update(&cache_hash_file.bin_digest);
self.hasher.update(&cache_hash_file.bin_digest);
}
}
@ -256,8 +262,8 @@ pub const CacheHash = struct {
// cache miss
// keep the manifest file open
// reset the hash
self.blake3 = Blake3.init(.{});
self.blake3.update(&bin_digest);
self.hasher = self.hasher_init;
self.hasher.update(&bin_digest);
// Remove files not in the initial hash
for (self.files.items[input_file_count..]) |*file| {
@ -266,7 +272,7 @@ pub const CacheHash = struct {
self.files.shrink(input_file_count);
for (self.files.items) |file| {
self.blake3.update(&file.bin_digest);
self.hasher.update(&file.bin_digest);
}
return null;
}
@ -304,23 +310,23 @@ pub const CacheHash = struct {
// Hash while reading from disk, to keep the contents in the cpu cache while
// doing hashing.
var blake3 = Blake3.init(.{});
var hasher = self.hasher_init;
var off: usize = 0;
while (true) {
// give me everything you've got, captain
const bytes_read = try file.read(contents[off..]);
if (bytes_read == 0) break;
blake3.update(contents[off..][0..bytes_read]);
hasher.update(contents[off..][0..bytes_read]);
off += bytes_read;
}
blake3.final(&ch_file.bin_digest);
hasher.final(&ch_file.bin_digest);
ch_file.contents = contents;
} else {
try hashFile(file, &ch_file.bin_digest);
try hashFile(file, &ch_file.bin_digest, self.hasher_init);
}
self.blake3.update(&ch_file.bin_digest);
self.hasher.update(&ch_file.bin_digest);
}
/// Add a file as a dependency of process being cached, after the initial hash has been
@ -382,7 +388,7 @@ pub const CacheHash = struct {
// the artifacts to cache.
var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
self.blake3.final(&bin_digest);
self.hasher.final(&bin_digest);
var out_digest: [BASE64_DIGEST_LEN]u8 = undefined;
base64_encoder.encode(&out_digest, &bin_digest);
@ -433,17 +439,17 @@ pub const CacheHash = struct {
}
};
fn hashFile(file: fs.File, bin_digest: []u8) !void {
var blake3 = Blake3.init(.{});
fn hashFile(file: fs.File, bin_digest: []u8, hasher_init: anytype) !void {
var buf: [1024]u8 = undefined;
var hasher = hasher_init;
while (true) {
const bytes_read = try file.read(&buf);
if (bytes_read == 0) break;
blake3.update(buf[0..bytes_read]);
hasher.update(buf[0..bytes_read]);
}
blake3.final(bin_digest);
hasher.final(bin_digest);
}
/// If the wall clock time, rounded to the same precision as the
@ -507,7 +513,7 @@ test "cache file and then recall it" {
_ = try ch.addFile(temp_file, null);
// There should be nothing in the cache
testing.expectEqual(@as(?[32]u8, null), try ch.hit());
testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());
digest1 = ch.final();
}
@ -575,7 +581,7 @@ test "check that changing a file makes cache fail" {
const temp_file_idx = try ch.addFile(temp_file, 100);
// There should be nothing in the cache
testing.expectEqual(@as(?[32]u8, null), try ch.hit());
testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());
testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.items[temp_file_idx].contents.?));
@ -592,7 +598,7 @@ test "check that changing a file makes cache fail" {
const temp_file_idx = try ch.addFile(temp_file, 100);
// A file that we depend on has been updated, so the cache should not contain an entry for it
testing.expectEqual(@as(?[32]u8, null), try ch.hit());
testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());
// The cache system does not keep the contents of re-hashed input files.
testing.expect(ch.files.items[temp_file_idx].contents == null);
@ -625,7 +631,7 @@ test "no file inputs" {
ch.add("1234");
// There should be nothing in the cache
testing.expectEqual(@as(?[32]u8, null), try ch.hit());
testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());
digest1 = ch.final();
}
@ -672,7 +678,7 @@ test "CacheHashes with files added after initial hash work" {
_ = try ch.addFile(temp_file1, null);
// There should be nothing in the cache
testing.expectEqual(@as(?[32]u8, null), try ch.hit());
testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());
_ = try ch.addFilePost(temp_file2);
@ -705,7 +711,7 @@ test "CacheHashes with files added after initial hash work" {
_ = try ch.addFile(temp_file1, null);
// A file that we depend on has been updated, so the cache should not contain an entry for it
testing.expectEqual(@as(?[32]u8, null), try ch.hit());
testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());
_ = try ch.addFilePost(temp_file2);