zig/lib/std/cache_hash.zig
LeRoyce Pearson 0fa89dc51d Fix read from null pointer in CacheHash.hit
It occurred when the manifest file was manually edited to include an extra
file. Now it will simply copy the file name in the manifest file.
2020-05-25 13:48:43 -04:00

512 lines
18 KiB
Zig

const Blake3 = @import("crypto.zig").Blake3;
const fs = @import("fs.zig");
const base64 = @import("base64.zig");
const ArrayList = @import("array_list.zig").ArrayList;
const debug = @import("debug.zig");
const testing = @import("testing.zig");
const mem = @import("mem.zig");
const fmt = @import("fmt.zig");
const Allocator = mem.Allocator;
const os = @import("os.zig");
const time = @import("time.zig");
const base64_encoder = fs.base64_encoder;
const base64_decoder = fs.base64_decoder;
const BIN_DIGEST_LEN = 48;
const BASE64_DIGEST_LEN = base64.Base64Encoder.calcSize(BIN_DIGEST_LEN);
/// One file dependency tracked by a `CacheHash`.
pub const File = struct {
    /// Path of the file; freed by `deinit` when non-null.
    path: ?[]const u8,
    /// Stat information recorded for the file.
    stat: fs.File.Stat,
    /// Digest of the file's contents.
    bin_digest: [BIN_DIGEST_LEN]u8,
    /// Contents of the file when they have been read; freed by `deinit` when non-null.
    contents: ?[]const u8 = null,

    /// Free the memory referenced by `path` and `contents` using `alloc`
    /// and reset both fields to null.
    pub fn deinit(self: *@This(), alloc: *Allocator) void {
        if (self.path) |path_bytes| alloc.free(path_bytes);
        self.path = null;

        if (self.contents) |content_bytes| alloc.free(content_bytes);
        self.contents = null;
    }
};
/// Hashes the inputs of a process whose result is being cached and tracks the
/// files it depends on in a manifest file stored inside `manifest_dir`.
///
/// Usage order: `init`, then `add`/`addSlice`/`addFile`, then `hit`, then
/// optionally `addFilePost`/`addFilePostFetch` and `final`, and finally `release`.
pub const CacheHash = struct {
    alloc: *Allocator,
    blake3: Blake3,
    manifest_dir: fs.Dir,
    /// Null until `hit` has opened (and locked) the manifest file.
    manifest_file: ?fs.File,
    /// When true, `release` rewrites the manifest file.
    manifest_dirty: bool,
    files: ArrayList(File),
    /// Base64 digest of the inputs added before `hit`; names the manifest file.
    b64_digest: [BASE64_DIGEST_LEN]u8,

    /// Create the manifest directory if needed, open it, and return an empty
    /// CacheHash that allocates with `alloc`.
    pub fn init(alloc: *Allocator, manifest_dir_path: []const u8) !@This() {
        try fs.cwd().makePath(manifest_dir_path);
        const manifest_dir = try fs.cwd().openDir(manifest_dir_path, .{ .iterate = true });

        return CacheHash{
            .alloc = alloc,
            .blake3 = Blake3.init(),
            .manifest_dir = manifest_dir,
            .manifest_file = null,
            .manifest_dirty = false,
            .files = ArrayList(File).init(alloc),
            .b64_digest = undefined,
        };
    }

    /// Record a slice of bytes as a dependency of the process being cached
    pub fn addSlice(self: *@This(), val: []const u8) void {
        debug.assert(self.manifest_file == null);

        self.blake3.update(val);
        // Separator byte appended after every slice.
        self.blake3.update(&[_]u8{0});
    }

    /// Convert the input value into bytes and record it as a dependency of the
    /// process being cached. Pointers are hashed through their pointee; any
    /// other value is hashed through its own bytes.
    pub fn add(self: *@This(), val: var) void {
        debug.assert(self.manifest_file == null);

        const valPtr = switch (@typeInfo(@TypeOf(val))) {
            .Pointer => val,
            else => &val,
        };

        self.addSlice(mem.asBytes(valPtr));
    }

    /// Add a file as a dependency of process being cached. When `CacheHash.hit` is
    /// called, the file's contents will be checked to ensure that it matches
    /// the contents from previous times.
    ///
    /// Returns the index of the entry in the `CacheHash.files` ArrayList. You can use it
    /// to inspect the entry after calling `CacheHash.hit()`. Note that `contents` is only
    /// populated when `hit` had to read the file (its recorded inode/mtime did not match,
    /// or it was missing from the manifest); otherwise it stays null:
    ///
    /// ```
    /// var file_contents = cache_hash.files.items[file_index].contents;
    /// ```
    pub fn addFile(self: *@This(), file_path: []const u8) !usize {
        debug.assert(self.manifest_file == null);

        // Resolve before touching the list so a failure cannot leave a
        // half-initialized entry behind for `release` to free.
        const resolved_path = try fs.path.resolve(self.alloc, &[_][]const u8{file_path});
        errdefer self.alloc.free(resolved_path);

        const idx = self.files.items.len;
        try self.files.append(File{
            .path = resolved_path,
            .stat = undefined,
            .bin_digest = undefined,
            .contents = null,
        });

        self.addSlice(resolved_path);

        return idx;
    }

    /// Check the cache to see if the input exists in it. If it exists, a base64 encoding
    /// of its hash will be returned; otherwise, null will be returned.
    ///
    /// This function will also acquire an exclusive lock to the manifest file. This means
    /// that a process holding a CacheHash will block any other process attempting to
    /// acquire the lock.
    ///
    /// The lock on the manifest file is released when `CacheHash.release` is called.
    ///
    /// Returns `error.InvalidFormat` when a manifest line cannot be parsed or does not
    /// match the corresponding input file, and `error.CacheUnavailable` when a listed
    /// file cannot be opened or hashed.
    pub fn hit(self: *@This()) !?[BASE64_DIGEST_LEN]u8 {
        debug.assert(self.manifest_file == null);

        var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
        self.blake3.final(&bin_digest);

        // The digest of everything added so far names the manifest file.
        base64_encoder.encode(self.b64_digest[0..], &bin_digest);

        // Restart the hasher, seeded with that digest; file digests are added on top.
        self.blake3 = Blake3.init();
        self.blake3.update(&bin_digest);

        const manifest_file_path = try fmt.allocPrint(self.alloc, "{}.txt", .{self.b64_digest});
        defer self.alloc.free(manifest_file_path);

        if (self.files.items.len != 0) {
            self.manifest_file = try self.manifest_dir.createFile(manifest_file_path, .{
                .read = true,
                .truncate = false,
                .lock = .Exclusive,
            });
        } else {
            // If there are no file inputs, we check if the manifest file exists instead of
            // comparing the hashes on the files used for the cached item
            self.manifest_file = self.manifest_dir.openFile(manifest_file_path, .{
                .read = true,
                .write = true,
                .lock = .Exclusive,
            }) catch |err| switch (err) {
                error.FileNotFound => {
                    self.manifest_dirty = true;
                    self.manifest_file = try self.manifest_dir.createFile(manifest_file_path, .{
                        .read = true,
                        .truncate = false,
                        .lock = .Exclusive,
                    });
                    return null;
                },
                else => |e| return e,
            };
        }

        // TODO: Figure out a good max value?
        const file_contents = try self.manifest_file.?.inStream().readAllAlloc(self.alloc, 16 * 1024);
        defer self.alloc.free(file_contents);

        const input_file_count = self.files.items.len;

        var any_file_changed = false;
        var line_iter = mem.tokenize(file_contents, "\n");
        var idx: usize = 0;
        while (line_iter.next()) |line| {
            defer idx += 1;

            var cache_hash_file: *File = undefined;
            if (idx < input_file_count) {
                cache_hash_file = &self.files.items[idx];
            } else {
                // The manifest lists a file that was not an initial input
                // (e.g. one added with `addFilePost` on an earlier run).
                cache_hash_file = try self.files.addOne();
                // `addOne` hands back uninitialized memory: set every field so
                // that `File.deinit` is safe even if we bail out below.
                cache_hash_file.* = File{
                    .path = null,
                    .stat = undefined,
                    .bin_digest = undefined,
                    .contents = null,
                };
            }

            // Line format: "<inode> <mtime> <base64 digest> <path>"
            var iter = mem.tokenize(line, " ");
            const inode = iter.next() orelse return error.InvalidFormat;
            const mtime_nsec_str = iter.next() orelse return error.InvalidFormat;
            const digest_str = iter.next() orelse return error.InvalidFormat;
            const file_path = iter.rest();

            cache_hash_file.stat.inode = fmt.parseInt(os.ino_t, inode, 10) catch return error.InvalidFormat;
            cache_hash_file.stat.mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;

            // NOTE(review): the decoder presumably requires the source length to match
            // the destination buffer; reject wrong-sized digests up front rather than
            // relying on how it reacts to a hand-edited manifest.
            if (digest_str.len != BASE64_DIGEST_LEN) return error.InvalidFormat;
            base64_decoder.decode(&cache_hash_file.bin_digest, digest_str) catch return error.InvalidFormat;

            if (file_path.len == 0) {
                return error.InvalidFormat;
            }
            if (cache_hash_file.path) |known_path| {
                if (!mem.eql(u8, file_path, known_path)) {
                    return error.InvalidFormat;
                }
            } else {
                cache_hash_file.path = try mem.dupe(self.alloc, u8, file_path);
            }

            const this_file = fs.cwd().openFile(cache_hash_file.path.?, .{ .read = true }) catch {
                return error.CacheUnavailable;
            };
            defer this_file.close();

            const actual_stat = try this_file.stat();
            const mtime_match = actual_stat.mtime == cache_hash_file.stat.mtime;
            const inode_match = actual_stat.inode == cache_hash_file.stat.inode;

            if (!mtime_match or !inode_match) {
                self.manifest_dirty = true;

                cache_hash_file.stat = actual_stat;

                if (is_problematic_timestamp(cache_hash_file.stat.mtime)) {
                    // Store a stat that cannot match, so the next run re-hashes the file.
                    cache_hash_file.stat.mtime = 0;
                    cache_hash_file.stat.inode = 0;
                }

                var actual_digest: [BIN_DIGEST_LEN]u8 = undefined;
                cache_hash_file.contents = try hash_file(self.alloc, &actual_digest, &this_file);

                if (!mem.eql(u8, &cache_hash_file.bin_digest, &actual_digest)) {
                    mem.copy(u8, &cache_hash_file.bin_digest, &actual_digest);
                    // keep going until we have the input file digests
                    any_file_changed = true;
                }
            }

            if (!any_file_changed) {
                self.blake3.update(&cache_hash_file.bin_digest);
            }
        }

        if (any_file_changed) {
            // cache miss
            // keep the manifest file open
            // reset the hash
            self.blake3 = Blake3.init();
            self.blake3.update(&bin_digest);

            // Drop the entries that came only from the manifest, releasing the
            // path/contents they own before the list forgets about them.
            for (self.files.items[input_file_count..]) |*file| {
                file.deinit(self.alloc);
            }
            try self.files.resize(input_file_count);

            // Inputs covered by a manifest line already carry a valid digest.
            const verified_count = if (idx < input_file_count) idx else input_file_count;
            for (self.files.items[0..verified_count]) |file| {
                self.blake3.update(&file.bin_digest);
            }

            // Inputs the manifest did not reach have no digest yet: hash them now
            // instead of feeding undefined bytes into the hasher.
            var missing_idx = verified_count;
            while (missing_idx < input_file_count) : (missing_idx += 1) {
                self.populate_file_hash(&self.files.items[missing_idx]) catch return error.CacheUnavailable;
            }

            return null;
        }

        if (idx < input_file_count) {
            // The manifest has fewer entries than there are inputs (e.g. it was
            // just created): hash the remaining inputs and report a miss.
            self.manifest_dirty = true;
            while (idx < input_file_count) : (idx += 1) {
                self.populate_file_hash(&self.files.items[idx]) catch return error.CacheUnavailable;
            }
            return null;
        }

        return self.final();
    }

    /// Open the file at `cache_hash_file.path`, record its stat and digest in
    /// `cache_hash_file`, and add the digest to the running hash.
    ///
    /// Returns the file's contents, allocated with `otherAlloc`.
    fn populate_file_hash_fetch(self: *@This(), otherAlloc: *mem.Allocator, cache_hash_file: *File) ![]u8 {
        debug.assert(cache_hash_file.path != null);

        const this_file = try fs.cwd().openFile(cache_hash_file.path.?, .{});
        defer this_file.close();

        cache_hash_file.stat = try this_file.stat();

        if (is_problematic_timestamp(cache_hash_file.stat.mtime)) {
            // Store a stat that cannot match, so the next run re-hashes the file.
            cache_hash_file.stat.mtime = 0;
            cache_hash_file.stat.inode = 0;
        }

        const contents = try hash_file(otherAlloc, &cache_hash_file.bin_digest, &this_file);
        self.blake3.update(&cache_hash_file.bin_digest);

        return contents;
    }

    /// Same as `populate_file_hash_fetch`, but the contents are allocated with
    /// `self.alloc` and stored in `cache_hash_file.contents`.
    fn populate_file_hash(self: *@This(), cache_hash_file: *File) !void {
        cache_hash_file.contents = try self.populate_file_hash_fetch(self.alloc, cache_hash_file);
    }

    /// Add a file as a dependency of process being cached, after the initial hash has been
    /// calculated. This is useful for processes that don't know all the files that
    /// are depended on ahead of time. For example, a source file that can import other files
    /// will need to be recompiled if the imported file is changed.
    ///
    /// Returns the contents of the file, allocated with the given allocator.
    /// If this fails, no entry is left behind in `files`.
    pub fn addFilePostFetch(self: *@This(), otherAlloc: *mem.Allocator, file_path: []const u8) ![]u8 {
        debug.assert(self.manifest_file != null);

        const resolved_path = try fs.path.resolve(self.alloc, &[_][]const u8{file_path});
        errdefer self.alloc.free(resolved_path);

        const new_file = try self.files.addOne();
        // On failure remove the entry again so that neither `release` nor
        // `write_manifest` ever sees an entry with undefined stat/digest.
        errdefer _ = self.files.pop();
        new_file.* = File{
            .path = resolved_path,
            .stat = undefined,
            .bin_digest = undefined,
            .contents = null,
        };

        return try self.populate_file_hash_fetch(otherAlloc, new_file);
    }

    /// Add a file as a dependency of process being cached, after the initial hash has been
    /// calculated. This is useful for processes that don't know all the files that
    /// are depended on ahead of time. For example, a source file that can import other files
    /// will need to be recompiled if the imported file is changed.
    pub fn addFilePost(self: *@This(), file_path: []const u8) !void {
        const contents = try self.addFilePostFetch(self.alloc, file_path);
        self.alloc.free(contents);
    }

    /// Returns a base64 encoded hash of the inputs.
    pub fn final(self: *@This()) [BASE64_DIGEST_LEN]u8 {
        debug.assert(self.manifest_file != null);

        // We don't close the manifest file yet, because we want to
        // keep it locked until the API user is done using it.
        // We also don't write out the manifest yet, because until
        // cache_release is called we still might be working on creating
        // the artifacts to cache.

        var bin_digest: [BIN_DIGEST_LEN]u8 = undefined;
        self.blake3.final(&bin_digest);

        var out_digest: [BASE64_DIGEST_LEN]u8 = undefined;
        base64_encoder.encode(&out_digest, &bin_digest);

        return out_digest;
    }

    /// Rewrite the manifest file with one line per entry in `files`:
    /// "<inode> <mtime> <base64 digest> <path>".
    pub fn write_manifest(self: *@This()) !void {
        debug.assert(self.manifest_file != null);

        var encoded_digest: [BASE64_DIGEST_LEN]u8 = undefined;
        var contents = ArrayList(u8).init(self.alloc);
        var outStream = contents.outStream();
        defer contents.deinit();

        for (self.files.items) |file| {
            base64_encoder.encode(encoded_digest[0..], &file.bin_digest);
            try outStream.print("{} {} {} {}\n", .{ file.stat.inode, file.stat.mtime, encoded_digest[0..], file.path });
        }

        try self.manifest_file.?.seekTo(0);
        try self.manifest_file.?.writeAll(contents.items);
        // The file is overwritten in place; cut off whatever is left of a
        // previously longer manifest so it is not parsed as extra lines later.
        try self.manifest_file.?.setEndPos(contents.items.len);
    }

    /// Releases the manifest file and frees any memory the CacheHash was using.
    /// `CacheHash.hit` must be called first.
    ///
    /// Will also attempt to write to the manifest file if the manifest is dirty.
    /// Writing to the manifest file is the only way that this function can return an
    /// error. The manifest file, the manifest directory and all memory are released
    /// even when that write fails.
    pub fn release(self: *@This()) !void {
        debug.assert(self.manifest_file != null);

        defer {
            self.manifest_file.?.close();

            for (self.files.items) |*file| {
                file.deinit(self.alloc);
            }
            self.files.deinit();
            self.manifest_dir.close();
        }

        if (self.manifest_dirty) {
            try self.write_manifest();
        }
    }
};
/// Hash the file, and return the contents as an array
/// Read the whole file behind `handle` (at most 64 KiB, otherwise the read
/// fails) using `alloc`, and write the Blake3 digest of what was read into
/// `bin_digest`.
///
/// Returns the bytes that were read.
fn hash_file(alloc: *Allocator, bin_digest: []u8, handle: *const fs.File) ![]u8 {
    const max_bytes = 64 * 1024;
    const file_bytes = try handle.inStream().readAllAlloc(alloc, max_bytes);

    var hasher = Blake3.init();
    hasher.update(file_bytes);
    hasher.final(bin_digest);

    return file_bytes;
}
/// If the wall clock time, rounded to the same precision as the
/// mtime, is equal to the mtime, then we cannot rely on this mtime
/// yet. We will instead save an mtime value that indicates the hash
/// must be unconditionally computed.
/// Returns true when `file_mtime_ns`, floored to milliseconds, equals the
/// current `time.milliTimestamp()`. Callers treat such an mtime as unreliable
/// and store a stat of zero instead.
fn is_problematic_timestamp(file_mtime_ns: i64) bool {
    return @divFloor(file_mtime_ns, time.millisecond) == time.milliTimestamp();
}
// Caches a set of inputs (including one file), then checks that repeating the
// same inputs is reported as a hit with the same digest.
test "cache file and then recall it" {
    const cwd = fs.cwd();

    const temp_file = "test.txt";
    const temp_manifest_dir = "temp_manifest_dir";

    try cwd.writeFile(temp_file, "Hello, world!\n");
    // Clean up even when the test bails out early. Defers run in reverse
    // order: the manifest directory is removed first, then the input file.
    defer cwd.deleteFile(temp_file) catch unreachable;
    defer cwd.deleteTree(temp_manifest_dir) catch unreachable;

    var digest1: [BASE64_DIGEST_LEN]u8 = undefined;
    var digest2: [BASE64_DIGEST_LEN]u8 = undefined;

    {
        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
        defer ch.release() catch unreachable;

        ch.add(true);
        ch.add(@as(u16, 1234));
        ch.add("1234");
        _ = try ch.addFile(temp_file);

        // There should be nothing in the cache
        testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());

        digest1 = ch.final();
    }
    {
        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
        defer ch.release() catch unreachable;

        ch.add(true);
        ch.add(@as(u16, 1234));
        ch.add("1234");
        _ = try ch.addFile(temp_file);

        // Cache hit! We just "built" the same file
        digest2 = (try ch.hit()).?;
    }

    testing.expectEqual(digest1, digest2);
}
// An mtime equal to the current millisecond must be reported as problematic.
// NOTE(review): this reads the clock twice (here and inside
// is_problematic_timestamp), so it assumes both reads land in the same millisecond.
test "give problematic timestamp" {
    const now_ms = time.milliTimestamp();
    const now_ns = @intCast(i64, now_ms * time.millisecond);
    testing.expect(is_problematic_timestamp(now_ns));
}
// An mtime slightly before the start of the current millisecond falls into an
// earlier millisecond and must not be reported as problematic.
test "give nonproblematic timestamp" {
    const now_ns = @intCast(i64, time.milliTimestamp() * time.millisecond);
    const earlier_ns = now_ns - 1000;
    testing.expect(!is_problematic_timestamp(earlier_ns));
}
// Caches an input file, rewrites it with different contents, and checks that
// the second lookup is a miss that produces a different digest.
test "check that changing a file makes cache fail" {
    const cwd = fs.cwd();

    const temp_file = "cache_hash_change_file_test.txt";
    const temp_manifest_dir = "cache_hash_change_file_manifest_dir";

    try cwd.writeFile(temp_file, "Hello, world!\n");
    // Clean up even when the test bails out early. Defers run in reverse
    // order: the manifest directory is removed first, then the input file.
    defer cwd.deleteFile(temp_file) catch unreachable;
    defer cwd.deleteTree(temp_manifest_dir) catch unreachable;

    var digest1: [BASE64_DIGEST_LEN]u8 = undefined;
    var digest2: [BASE64_DIGEST_LEN]u8 = undefined;

    {
        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
        defer ch.release() catch unreachable;

        ch.add("1234");
        _ = try ch.addFile(temp_file);

        // There should be nothing in the cache
        testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());

        digest1 = ch.final();
    }

    try cwd.writeFile(temp_file, "Hello, world; but updated!\n");

    {
        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
        defer ch.release() catch unreachable;

        ch.add("1234");
        _ = try ch.addFile(temp_file);

        // A file that we depend on has been updated, so the cache should not contain an entry for it
        testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());

        digest2 = ch.final();
    }

    testing.expect(!mem.eql(u8, digest1[0..], digest2[0..]));
}
// With no file inputs, the first lookup is a miss and a second lookup with
// the same inputs is a hit that yields the same digest.
test "no file inputs" {
    const cwd = fs.cwd();
    const temp_manifest_dir = "no_file_inputs_manifest_dir";
    defer cwd.deleteTree(temp_manifest_dir) catch unreachable;

    var digest1: [BASE64_DIGEST_LEN]u8 = undefined;
    var digest2: [BASE64_DIGEST_LEN]u8 = undefined;

    {
        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
        defer ch.release() catch unreachable;

        ch.add("1234");

        // There should be nothing in the cache
        testing.expectEqual(@as(?[BASE64_DIGEST_LEN]u8, null), try ch.hit());

        digest1 = ch.final();
    }
    {
        var ch = try CacheHash.init(testing.allocator, temp_manifest_dir);
        defer ch.release() catch unreachable;

        ch.add("1234");

        digest2 = (try ch.hit()).?;
    }

    testing.expectEqual(digest1, digest2);
}