std.Build.Cache: use an array hash map for files

Rather than an ArrayList. Provides deduplication.
2025-12-16 03:03:09 +00:00 · 2024-03-21 19:53:24 -07:00 · 2024-03-21 19:53:24 -07:00 · 2f4bbd6c63
commit 2f4bbd6c63
parent ebec7336e2
4 changed files with 152 additions and 72 deletions
--- a/lib/std/Build/Cache.zig
+++ b/lib/std/Build/Cache.zig
@ -55,7 +55,15 @@ pub fn prefixes(cache: *const Cache) []const Directory {
 const PrefixedPath = struct {
    prefix: u8,
-    sub_path: []u8,
+    sub_path: []const u8,
    fn eql(a: PrefixedPath, b: PrefixedPath) bool {
        return a.prefix == b.prefix and std.mem.eql(u8, a.sub_path, b.sub_path);
    }
    fn hash(pp: PrefixedPath) u32 {
        return @truncate(std.hash.Wyhash.hash(pp.prefix, pp.sub_path));
    }
 };
 fn findPrefix(cache: *const Cache, file_path: []const u8) !PrefixedPath {
@ -132,7 +140,7 @@ pub const hasher_init: Hasher = Hasher.init(&[_]u8{
 });
 pub const File = struct {
-    prefixed_path: ?PrefixedPath,
+    prefixed_path: PrefixedPath,
    max_file_size: ?usize,
    stat: Stat,
    bin_digest: BinDigest,
@ -145,16 +153,18 @@ pub const File = struct {
    };
    pub fn deinit(self: *File, gpa: Allocator) void {
-        if (self.prefixed_path) |pp| {
+        gpa.free(self.prefixed_path.sub_path);
            gpa.free(pp.sub_path);
            self.prefixed_path = null;
        }
        if (self.contents) |contents| {
            gpa.free(contents);
            self.contents = null;
        }
        self.* = undefined;
    }
    pub fn updateMaxSize(file: *File, new_max_size: ?usize) void {
        const new = new_max_size orelse return;
        file.max_file_size = if (file.max_file_size) |old| @max(old, new) else new;
    }
 };
 pub const HashHelper = struct {
@ -296,7 +306,7 @@ pub const Manifest = struct {
    // order to obtain a problematic timestamp for the next call. Calls after that
    // will then use the same timestamp, to avoid unnecessary filesystem writes.
    want_refresh_timestamp: bool = true,
-    files: std.ArrayListUnmanaged(File) = .{},
+    files: Files = .{},
    hex_digest: HexDigest,
    /// Populated when hit() returns an error because of one
    /// of the files listed in the manifest.
@ -305,6 +315,34 @@ pub const Manifest = struct {
    /// what time the file system thinks it is, according to its own granularity.
    recent_problematic_timestamp: i128 = 0,
    pub const Files = std.ArrayHashMapUnmanaged(File, void, FilesContext, false);
    pub const FilesContext = struct {
        pub fn hash(fc: FilesContext, file: File) u32 {
            _ = fc;
            return file.prefixed_path.hash();
        }
        pub fn eql(fc: FilesContext, a: File, b: File, b_index: usize) bool {
            _ = fc;
            _ = b_index;
            return a.prefixed_path.eql(b.prefixed_path);
        }
    };
    const FilesAdapter = struct {
        pub fn eql(context: @This(), a: PrefixedPath, b: File, b_index: usize) bool {
            _ = context;
            _ = b_index;
            return a.eql(b.prefixed_path);
        }
        pub fn hash(context: @This(), key: PrefixedPath) u32 {
            _ = context;
            return key.hash();
        }
    };
    /// Add a file as a dependency of process being cached. When `hit` is
    /// called, the file's contents will be checked to ensure that it matches
    /// the contents from previous times.
@ -317,7 +355,7 @@ pub const Manifest = struct {
    /// to access the contents of the file after calling `hit()` like so:
    ///
    /// ```
-    /// var file_contents = cache_hash.files.items[file_index].contents.?;
+    /// var file_contents = cache_hash.files.keys()[file_index].contents.?;
    /// ```
    pub fn addFile(self: *Manifest, file_path: []const u8, max_file_size: ?usize) !usize {
        assert(self.manifest_file == null);
@ -327,7 +365,12 @@ pub const Manifest = struct {
        const prefixed_path = try self.cache.findPrefix(file_path);
        errdefer gpa.free(prefixed_path.sub_path);
-        self.files.addOneAssumeCapacity().* = .{
+        const gop = self.files.getOrPutAssumeCapacityAdapted(prefixed_path, FilesAdapter{});
        if (gop.found_existing) {
            gop.key_ptr.updateMaxSize(max_file_size);
            return gop.index;
        }
        gop.key_ptr.* = .{
            .prefixed_path = prefixed_path,
            .contents = null,
            .max_file_size = max_file_size,
@ -338,7 +381,7 @@ pub const Manifest = struct {
        self.hash.add(prefixed_path.prefix);
        self.hash.addBytes(prefixed_path.sub_path);
-        return self.files.items.len - 1;
+        return gop.index;
    }
    pub fn addOptionalFile(self: *Manifest, optional_file_path: ?[]const u8) !void {
@ -418,7 +461,7 @@ pub const Manifest = struct {
        self.want_refresh_timestamp = true;
-        const input_file_count = self.files.items.len;
+        const input_file_count = self.files.entries.len;
        while (true) : (self.unhit(bin_digest, input_file_count)) {
            const file_contents = try self.manifest_file.?.reader().readAllAlloc(gpa, manifest_file_size_max);
            defer gpa.free(file_contents);
@ -430,7 +473,7 @@ pub const Manifest = struct {
                if (try self.upgradeToExclusiveLock()) continue;
                self.manifest_dirty = true;
                while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.items[idx];
+                    const ch_file = &self.files.keys()[idx];
                    self.populateFileHash(ch_file) catch |err| {
                        self.failed_file_index = idx;
                        return err;
@ -441,18 +484,6 @@ pub const Manifest = struct {
            while (line_iter.next()) |line| {
                defer idx += 1;
                const cache_hash_file = if (idx < input_file_count) &self.files.items[idx] else blk: {
                    const new = try self.files.addOne(gpa);
                    new.* = .{
                        .prefixed_path = null,
                        .contents = null,
                        .max_file_size = null,
                        .stat = undefined,
                        .bin_digest = undefined,
                    };
                    break :blk new;
                };
                var iter = mem.tokenizeScalar(u8, line, ' ');
                const size = iter.next() orelse return error.InvalidFormat;
                const inode = iter.next() orelse return error.InvalidFormat;
@ -461,30 +492,61 @@ pub const Manifest = struct {
                const prefix_str = iter.next() orelse return error.InvalidFormat;
                const file_path = iter.rest();
-                cache_hash_file.stat.size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
+                const stat_size = fmt.parseInt(u64, size, 10) catch return error.InvalidFormat;
-                cache_hash_file.stat.inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
+                const stat_inode = fmt.parseInt(fs.File.INode, inode, 10) catch return error.InvalidFormat;
-                cache_hash_file.stat.mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
+                const stat_mtime = fmt.parseInt(i64, mtime_nsec_str, 10) catch return error.InvalidFormat;
-                _ = fmt.hexToBytes(&cache_hash_file.bin_digest, digest_str) catch return error.InvalidFormat;
+                const file_bin_digest = b: {
                    if (digest_str.len != hex_digest_len) return error.InvalidFormat;
                    var bd: BinDigest = undefined;
                    _ = fmt.hexToBytes(&bd, digest_str) catch return error.InvalidFormat;
                    break :b bd;
                };
                const prefix = fmt.parseInt(u8, prefix_str, 10) catch return error.InvalidFormat;
                if (prefix >= self.cache.prefixes_len) return error.InvalidFormat;
-                if (file_path.len == 0) {
+                if (file_path.len == 0) return error.InvalidFormat;
                    return error.InvalidFormat;
                }
                if (cache_hash_file.prefixed_path) |pp| {
                    if (pp.prefix != prefix or !mem.eql(u8, file_path, pp.sub_path)) {
                        return error.InvalidFormat;
                    }
                }
-                if (cache_hash_file.prefixed_path == null) {
+                const cache_hash_file = f: {
-                    cache_hash_file.prefixed_path = .{
+                    const prefixed_path: PrefixedPath = .{
                        .prefix = prefix,
-                        .sub_path = try gpa.dupe(u8, file_path),
+                        .sub_path = file_path, // expires with file_contents
                    };
-                }
+                    if (idx < input_file_count) {
                        const file = &self.files.keys()[idx];
                        if (!file.prefixed_path.eql(prefixed_path))
                            return error.InvalidFormat;
-                const pp = cache_hash_file.prefixed_path.?;
+                        file.stat = .{
                            .size = stat_size,
                            .inode = stat_inode,
                            .mtime = stat_mtime,
                        };
                        file.bin_digest = file_bin_digest;
                        break :f file;
                    }
                    const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
                    errdefer assert(self.files.popOrNull() != null);
                    if (!gop.found_existing) {
                        gop.key_ptr.* = .{
                            .prefixed_path = .{
                                .prefix = prefix,
                                .sub_path = try gpa.dupe(u8, file_path),
                            },
                            .contents = null,
                            .max_file_size = null,
                            .stat = .{
                                .size = stat_size,
                                .inode = stat_inode,
                                .mtime = stat_mtime,
                            },
                            .bin_digest = file_bin_digest,
                        };
                    }
                    break :f gop.key_ptr;
                };
                const pp = cache_hash_file.prefixed_path;
                const dir = self.cache.prefixes()[pp.prefix].handle;
                const this_file = dir.openFile(pp.sub_path, .{ .mode = .read_only }) catch |err| switch (err) {
                    error.FileNotFound => {
@ -548,7 +610,7 @@ pub const Manifest = struct {
                if (try self.upgradeToExclusiveLock()) continue;
                self.manifest_dirty = true;
                while (idx < input_file_count) : (idx += 1) {
-                    const ch_file = &self.files.items[idx];
+                    const ch_file = &self.files.keys()[idx];
                    self.populateFileHash(ch_file) catch |err| {
                        self.failed_file_index = idx;
                        return err;
@ -571,12 +633,12 @@ pub const Manifest = struct {
        self.hash.hasher.update(&bin_digest);
        // Remove files not in the initial hash.
-        for (self.files.items[input_file_count..]) |*file| {
+        for (self.files.keys()[input_file_count..]) |*file| {
            file.deinit(self.cache.gpa);
        }
        self.files.shrinkRetainingCapacity(input_file_count);
-        for (self.files.items) |file| {
+        for (self.files.keys()) |file| {
            self.hash.hasher.update(&file.bin_digest);
        }
    }
@ -616,7 +678,7 @@ pub const Manifest = struct {
    }
    fn populateFileHash(self: *Manifest, ch_file: *File) !void {
-        const pp = ch_file.prefixed_path.?;
+        const pp = ch_file.prefixed_path;
        const dir = self.cache.prefixes()[pp.prefix].handle;
        const file = try dir.openFile(pp.sub_path, .{});
        defer file.close();
@ -682,7 +744,7 @@ pub const Manifest = struct {
            .bin_digest = undefined,
            .contents = null,
        };
-        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
+        errdefer self.files.shrinkRetainingCapacity(self.files.entries.len - 1);
        try self.populateFileHash(new_ch_file);
@ -690,9 +752,11 @@ pub const Manifest = struct {
    }
    /// Add a file as a dependency of process being cached, after the initial hash has been
-    /// calculated. This is useful for processes that don't know the all the files that
+    /// calculated.
-    /// are depended on ahead of time. For example, a source file that can import other files
+    ///
-    /// will need to be recompiled if the imported file is changed.
+    /// This is useful for processes that don't know the all the files that are
    /// depended on ahead of time. For example, a source file that can import
    /// other files will need to be recompiled if the imported file is changed.
    pub fn addFilePost(self: *Manifest, file_path: []const u8) !void {
        assert(self.manifest_file != null);
@ -700,17 +764,26 @@ pub const Manifest = struct {
        const prefixed_path = try self.cache.findPrefix(file_path);
        errdefer gpa.free(prefixed_path.sub_path);
-        const new_ch_file = try self.files.addOne(gpa);
+        const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
-        new_ch_file.* = .{
+        errdefer assert(self.files.popOrNull() != null);
        if (gop.found_existing) {
            gpa.free(prefixed_path.sub_path);
            return;
        }
        gop.key_ptr.* = .{
            .prefixed_path = prefixed_path,
            .max_file_size = null,
            .stat = undefined,
            .bin_digest = undefined,
            .contents = null,
        };
        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
-        try self.populateFileHash(new_ch_file);
+        self.files.lockPointers();
        defer self.files.unlockPointers();
        try self.populateFileHash(gop.key_ptr);
    }
    /// Like `addFilePost` but when the file contents have already been loaded from disk.
@ -724,13 +797,20 @@ pub const Manifest = struct {
        assert(self.manifest_file != null);
        const gpa = self.cache.gpa;
        const ch_file = try self.files.addOne(gpa);
        errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
        const prefixed_path = try self.cache.findPrefixResolved(resolved_path);
        errdefer gpa.free(prefixed_path.sub_path);
-        ch_file.* = .{
+        const gop = try self.files.getOrPutAdapted(gpa, prefixed_path, FilesAdapter{});
        errdefer assert(self.files.popOrNull() != null);
        if (gop.found_existing) {
            gpa.free(prefixed_path.sub_path);
            return;
        }
        const new_file = gop.key_ptr;
        new_file.* = .{
            .prefixed_path = prefixed_path,
            .max_file_size = null,
            .stat = stat,
@ -738,19 +818,19 @@ pub const Manifest = struct {
            .contents = null,
        };
-        if (self.isProblematicTimestamp(ch_file.stat.mtime)) {
+        if (self.isProblematicTimestamp(new_file.stat.mtime)) {
            // The actual file has an unreliable timestamp, force it to be hashed
-            ch_file.stat.mtime = 0;
+            new_file.stat.mtime = 0;
-            ch_file.stat.inode = 0;
+            new_file.stat.inode = 0;
        }
        {
            var hasher = hasher_init;
            hasher.update(bytes);
-            hasher.final(&ch_file.bin_digest);
+            hasher.final(&new_file.bin_digest);
        }
-        self.hash.hasher.update(&ch_file.bin_digest);
+        self.hash.hasher.update(&new_file.bin_digest);
    }
    pub fn addDepFilePost(self: *Manifest, dir: fs.Dir, dep_file_basename: []const u8) !void {
@ -816,14 +896,14 @@ pub const Manifest = struct {
            const writer = contents.writer();
            try writer.writeAll(manifest_header ++ "\n");
-            for (self.files.items) |file| {
+            for (self.files.keys()) |file| {
                try writer.print("{d} {d} {d} {} {d} {s}\n", .{
                    file.stat.size,
                    file.stat.inode,
                    file.stat.mtime,
                    fmt.fmtSliceHexLower(&file.bin_digest),
-                    file.prefixed_path.?.prefix,
+                    file.prefixed_path.prefix,
-                    file.prefixed_path.?.sub_path,
+                    file.prefixed_path.sub_path,
                });
            }
@ -892,7 +972,7 @@ pub const Manifest = struct {
            file.close();
        }
-        for (self.files.items) |*file| {
+        for (self.files.keys()) |*file| {
            file.deinit(self.cache.gpa);
        }
        self.files.deinit(self.cache.gpa);
@ -1061,7 +1141,7 @@ test "check that changing a file makes cache fail" {
            // There should be nothing in the cache
            try testing.expectEqual(false, try ch.hit());
-            try testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.items[temp_file_idx].contents.?));
+            try testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.keys()[temp_file_idx].contents.?));
            digest1 = ch.final();
@ -1081,7 +1161,7 @@ test "check that changing a file makes cache fail" {
            try testing.expectEqual(false, try ch.hit());
            // The cache system does not keep the contents of re-hashed input files.
-            try testing.expect(ch.files.items[temp_file_idx].contents == null);
+            try testing.expect(ch.files.keys()[temp_file_idx].contents == null);
            digest2 = ch.final();
--- a/lib/std/Build/Step.zig
+++ b/lib/std/Build/Step.zig
@ -544,7 +544,7 @@ pub fn cacheHit(s: *Step, man: *std.Build.Cache.Manifest) !bool {
 fn failWithCacheError(s: *Step, man: *const std.Build.Cache.Manifest, err: anyerror) anyerror {
    const i = man.failed_file_index orelse return err;
-    const pp = man.files.items[i].prefixed_path orelse return err;
+    const pp = man.files.keys()[i].prefixed_path;
    const prefix = man.cache.prefixes()[pp.prefix].path orelse "";
    return s.fail("{s}: {s}/{s}", .{ @errorName(err), prefix, pp.sub_path });
 }
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@ -1999,7 +1999,7 @@ pub fn update(comp: *Compilation, main_progress_node: *std.Progress.Node) !void
            const is_hit = man.hit() catch |err| {
                const i = man.failed_file_index orelse return err;
-                const pp = man.files.items[i].prefixed_path orelse return err;
+                const pp = man.files.keys()[i].prefixed_path;
                const prefix = man.cache.prefixes()[pp.prefix];
                return comp.setMiscFailure(
                    .check_whole_cache,
@ -4147,7 +4147,7 @@ pub fn cImport(comp: *Compilation, c_src: []const u8, owner_mod: *Package.Module
    const prev_hash_state = man.hash.peekBin();
    const actual_hit = hit: {
        _ = try man.hit();
-        if (man.files.items.len == 0) {
+        if (man.files.entries.len == 0) {
            man.unhit(prev_hash_state, 0);
            break :hit false;
        }
--- a/src/glibc.zig
+++ b/src/glibc.zig
@ -713,7 +713,7 @@ pub fn buildSharedObjects(comp: *Compilation, prog_node: *std.Progress.Node) !vo
    };
    defer o_directory.handle.close();
-    const abilists_contents = man.files.items[abilists_index].contents.?;
+    const abilists_contents = man.files.keys()[abilists_index].contents.?;
    const metadata = try loadMetaData(comp.gpa, abilists_contents);
    defer metadata.destroy(comp.gpa);