stage2: implement @embedFile

This commit is contained in:
Andrew Kelley 2021-10-17 18:57:54 -07:00
parent ad17108bdd
commit e5dac0a0b3
3 changed files with 275 additions and 8 deletions

View File

@ -55,6 +55,10 @@ c_object_work_queue: std.fifo.LinearFifo(*CObject, .Dynamic),
/// since the last compilation, as well as scan for `@import` and queue up
/// additional jobs corresponding to those new files.
astgen_work_queue: std.fifo.LinearFifo(*Module.File, .Dynamic),
/// These jobs are to inspect the file system stat() and if the embedded file has changed
/// on disk, mark the corresponding Decl outdated and queue up an `analyze_decl`
/// task for it.
embed_file_work_queue: std.fifo.LinearFifo(*Module.EmbedFile, .Dynamic),
/// The ErrorMsg memory is owned by the `CObject`, using Compilation's general purpose allocator.
/// This data is accessed by multiple threads and is protected by `mutex`.
@ -181,6 +185,10 @@ const Job = union(enum) {
/// It may have already been analyzed, or it may have been determined
/// to be outdated; in this case perform semantic analysis again.
analyze_decl: *Module.Decl,
/// The file that was loaded with `@embedFile` has changed on disk
/// and has been re-loaded into memory. All Decls that depend on it
/// need to be re-analyzed.
update_embed_file: *Module.EmbedFile,
/// The source file containing the Decl has been updated, and so the
/// Decl may need its line number information updated in the debug info.
update_line_number: *Module.Decl,
@ -1447,6 +1455,7 @@ pub fn create(gpa: *Allocator, options: InitOptions) !*Compilation {
.work_queue = std.fifo.LinearFifo(Job, .Dynamic).init(gpa),
.c_object_work_queue = std.fifo.LinearFifo(*CObject, .Dynamic).init(gpa),
.astgen_work_queue = std.fifo.LinearFifo(*Module.File, .Dynamic).init(gpa),
.embed_file_work_queue = std.fifo.LinearFifo(*Module.EmbedFile, .Dynamic).init(gpa),
.keep_source_files_loaded = options.keep_source_files_loaded,
.use_clang = use_clang,
.clang_argv = options.clang_argv,
@ -1632,6 +1641,7 @@ pub fn destroy(self: *Compilation) void {
self.work_queue.deinit();
self.c_object_work_queue.deinit();
self.astgen_work_queue.deinit();
self.embed_file_work_queue.deinit();
{
var it = self.crt_files.iterator();
@ -1747,6 +1757,16 @@ pub fn update(self: *Compilation) !void {
}
if (!use_stage1) {
// Put a work item in for checking if any files used with `@embedFile` changed.
{
try self.embed_file_work_queue.ensureUnusedCapacity(module.embed_table.count());
var it = module.embed_table.iterator();
while (it.next()) |entry| {
const embed_file = entry.value_ptr.*;
self.embed_file_work_queue.writeItemAssumeCapacity(embed_file);
}
}
try self.work_queue.writeItem(.{ .analyze_pkg = std_pkg });
if (self.bin_file.options.is_test) {
try self.work_queue.writeItem(.{ .analyze_pkg = module.main_pkg });
@ -1870,6 +1890,7 @@ pub fn totalErrorCount(self: *Compilation) usize {
if (self.bin_file.options.module) |module| {
total += module.failed_exports.count();
total += module.failed_embed_files.count();
{
var it = module.failed_files.iterator();
@ -1966,6 +1987,13 @@ pub fn getAllErrorsAlloc(self: *Compilation) !AllErrors {
}
}
}
{
var it = module.failed_embed_files.iterator();
while (it.next()) |entry| {
const msg = entry.value_ptr.*;
try AllErrors.add(module, &arena, &errors, msg.*);
}
}
{
var it = module.failed_decls.iterator();
while (it.next()) |entry| {
@ -2065,6 +2093,9 @@ pub fn performAllTheWork(self: *Compilation) error{ TimerUnsupported, OutOfMemor
var c_obj_prog_node = main_progress_node.start("Compile C Objects", self.c_source_files.len);
defer c_obj_prog_node.end();
var embed_file_prog_node = main_progress_node.start("Detect @embedFile updates", self.embed_file_work_queue.count);
defer embed_file_prog_node.end();
self.work_queue_wait_group.reset();
defer self.work_queue_wait_group.wait();
@ -2079,6 +2110,13 @@ pub fn performAllTheWork(self: *Compilation) error{ TimerUnsupported, OutOfMemor
});
}
while (self.embed_file_work_queue.readItem()) |embed_file| {
self.astgen_wait_group.start();
try self.thread_pool.spawn(workerCheckEmbedFile, .{
self, embed_file, &embed_file_prog_node, &self.astgen_wait_group,
});
}
while (self.c_object_work_queue.readItem()) |c_object| {
self.work_queue_wait_group.start();
try self.thread_pool.spawn(workerUpdateCObject, .{
@ -2260,6 +2298,15 @@ pub fn performAllTheWork(self: *Compilation) error{ TimerUnsupported, OutOfMemor
error.AnalysisFail => continue,
};
},
.update_embed_file => |embed_file| {
if (build_options.omit_stage2)
@panic("sadly stage2 is omitted from this build to save memory on the CI server");
const module = self.bin_file.options.module.?;
module.updateEmbedFile(embed_file) catch |err| switch (err) {
error.OutOfMemory => return error.OutOfMemory,
error.AnalysisFail => continue,
};
},
.update_line_number => |decl| {
if (build_options.omit_stage2)
@panic("sadly stage2 is omitted from this build to save memory on the CI server");
@ -2542,6 +2589,29 @@ fn workerAstGenFile(
}
}
/// Thread-pool worker: checks a single `@embedFile`'d file for on-disk
/// changes by delegating to `Module.detectEmbedFileUpdate`. Any failure is
/// recorded as a retryable error message on the embed file rather than
/// propagated, since workers cannot return errors.
fn workerCheckEmbedFile(
    comp: *Compilation,
    embed_file: *Module.EmbedFile,
    prog_node: *std.Progress.Node,
    wg: *WaitGroup,
) void {
    defer wg.finish();

    var sub_node = prog_node.start(embed_file.sub_file_path, 0);
    sub_node.activate();
    defer sub_node.end();

    const module = comp.bin_file.options.module.?;
    if (module.detectEmbedFileUpdate(embed_file)) |_| {
        // Success: either the file is unchanged or an update job was queued.
    } else |err| {
        comp.reportRetryableEmbedFileError(embed_file, err) catch |oom| switch (oom) {
            // Swallowing this error is OK because it's implied to be OOM when
            // there is a missing `failed_embed_files` error message.
            error.OutOfMemory => {},
        };
    }
}
pub fn obtainCObjectCacheManifest(comp: *const Compilation) Cache.Manifest {
var man = comp.cache_parent.obtain();
@ -2790,6 +2860,36 @@ fn reportRetryableAstGenError(
}
}
/// Records `err` as a retryable error message against `embed_file` in
/// `failed_embed_files`, attributed to the source location of the Decl that
/// owns the embed. Takes the Compilation mutex while mutating the table.
/// The only error this can itself produce is OOM while building the message.
fn reportRetryableEmbedFileError(
    comp: *Compilation,
    embed_file: *Module.EmbedFile,
    err: anyerror,
) error{OutOfMemory}!void {
    const mod = comp.bin_file.options.module.?;
    const gpa = mod.gpa;
    const src_loc: Module.SrcLoc = embed_file.owner_decl.srcLoc();

    // Include the package directory in the message when it is known.
    const err_msg = blk: {
        if (embed_file.pkg.root_src_directory.path) |dir_path| {
            break :blk try Module.ErrorMsg.create(
                gpa,
                src_loc,
                "unable to load '{s}" ++ std.fs.path.sep_str ++ "{s}': {s}",
                .{ dir_path, embed_file.sub_file_path, @errorName(err) },
            );
        }
        break :blk try Module.ErrorMsg.create(gpa, src_loc, "unable to load '{s}': {s}", .{
            embed_file.sub_file_path, @errorName(err),
        });
    };
    errdefer err_msg.destroy(gpa);

    {
        const lock = comp.mutex.acquire();
        defer lock.release();
        try mod.failed_embed_files.putNoClobber(gpa, embed_file, err_msg);
    }
}
fn updateCObject(comp: *Compilation, c_object: *CObject, c_obj_prog_node: *std.Progress.Node) !void {
if (!build_options.have_llvm) {
return comp.failCObj(c_object, "clang not available: compiler built without LLVM extensions", .{});

View File

@ -55,11 +55,17 @@ decl_exports: std.AutoArrayHashMapUnmanaged(*Decl, []*Export) = .{},
/// is performing the export of another Decl.
/// This table owns the Export memory.
export_owners: std.AutoArrayHashMapUnmanaged(*Decl, []*Export) = .{},
/// The set of all the files in the Module. We keep track of this in order to iterate
/// over it and check which source files have been modified on the file system when
/// The set of all the Zig source files in the Module. We keep track of this in order
/// to iterate over it and check which source files have been modified on the file system when
/// an update is requested, as well as to cache `@import` results.
/// Keys are fully resolved file paths. This table owns the keys and values.
import_table: std.StringArrayHashMapUnmanaged(*File) = .{},
/// The set of all the files which have been loaded with `@embedFile` in the Module.
/// We keep track of this in order to iterate over it and check which files have been
/// modified on the file system when an update is requested, as well as to cache
/// `@embedFile` results.
/// Keys are fully resolved file paths. This table owns the keys and values.
embed_table: std.StringHashMapUnmanaged(*EmbedFile) = .{},
/// The set of all the generic function instantiations. This is used so that when a generic
/// function is called twice with the same comptime parameter arguments, both calls dispatch
@ -87,6 +93,8 @@ compile_log_decls: std.AutoArrayHashMapUnmanaged(*Decl, i32) = .{},
/// Using a map here for consistency with the other fields here.
/// The ErrorMsg memory is owned by the `File`, using Module's general purpose allocator.
failed_files: std.AutoArrayHashMapUnmanaged(*File, ?*ErrorMsg) = .{},
/// The ErrorMsg memory is owned by the `EmbedFile`, using Module's general purpose allocator.
failed_embed_files: std.AutoArrayHashMapUnmanaged(*EmbedFile, *ErrorMsg) = .{},
/// Using a map here for consistency with the other fields here.
/// The ErrorMsg memory is owned by the `Export`, using Module's general purpose allocator.
failed_exports: std.AutoArrayHashMapUnmanaged(*Export, *ErrorMsg) = .{},
@ -1534,6 +1542,23 @@ pub const File = struct {
}
};
/// Represents the contents of a file loaded with `@embedFile`.
/// Instances live in `Module.embed_table`, keyed by resolved file path.
pub const EmbedFile = struct {
    /// Relative to the owning package's root_src_dir.
    /// Memory is stored in gpa, owned by EmbedFile.
    sub_file_path: []const u8,
    /// NUL-terminated file contents. Reallocated (and the old buffer freed)
    /// by `detectEmbedFileUpdate` when the file changes on disk.
    bytes: [:0]const u8,
    // The stat_* fields snapshot the file metadata observed at load time so
    // that `detectEmbedFileUpdate` can cheaply decide whether to re-read.
    stat_size: u64,
    stat_inode: std.fs.File.INode,
    stat_mtime: i128,
    /// Package that this file is a part of, managed externally.
    pkg: *Package,
    /// The Decl that was created from the `@embedFile` to own this resource.
    /// This is how zig knows what other Decl objects to invalidate if the file
    /// changes on disk.
    /// NOTE: set by Sema (`zirEmbedFile`) right after `embedFile` returns;
    /// it is `undefined` until then.
    owner_decl: *Decl,
};
/// This struct holds data necessary to construct API-facing `AllErrors.Message`.
/// Its memory is managed with the general purpose allocator so that they
/// can be created and destroyed in response to incremental updates.
@ -2364,6 +2389,11 @@ pub fn deinit(mod: *Module) void {
}
mod.failed_files.deinit(gpa);
for (mod.failed_embed_files.values()) |msg| {
msg.destroy(gpa);
}
mod.failed_embed_files.deinit(gpa);
for (mod.failed_exports.values()) |value| {
value.destroy(gpa);
}
@ -3060,6 +3090,32 @@ pub fn ensureDeclAnalyzed(mod: *Module, decl: *Decl) SemaError!void {
}
}
/// Handles the `update_embed_file` work-queue job: the file backing an
/// `@embedFile` has been re-read from disk (see `detectEmbedFileUpdate`),
/// so every Decl depending on the embed's owner Decl must be re-analyzed.
pub fn updateEmbedFile(mod: *Module, embed_file: *EmbedFile) SemaError!void {
    const tracy = trace(@src());
    defer tracy.end();

    // TODO we can potentially relax this if we store some more information along
    // with decl dependency edges
    for (embed_file.owner_decl.dependants.keys()) |dep| {
        switch (dep.analysis) {
            .unreferenced => unreachable,
            .in_progress => continue, // already doing analysis, ok
            .outdated => continue, // already queued for update

            .file_failure,
            .dependency_failure,
            .sema_failure,
            .sema_failure_retryable,
            .codegen_failure,
            .codegen_failure_retryable,
            .complete,
            // NOTE(review): only deps whose generation differs from the
            // current one are marked — presumably a dep already analyzed in
            // this generation has seen the new contents; confirm against
            // `markOutdatedDecl` semantics.
            => if (dep.generation != mod.generation) {
                try mod.markOutdatedDecl(dep);
            },
        }
    }
}
pub fn semaPkg(mod: *Module, pkg: *Package) !void {
const file = (try mod.importPkg(pkg)).file;
return mod.semaFile(file);
@ -3551,6 +3607,84 @@ pub fn importFile(
};
}
/// Returns the cached `EmbedFile` for `rel_file_path` (interpreted relative
/// to `cur_file`), loading the contents from disk on first use. The resolved
/// path is the cache key so different relative spellings of the same file
/// share one entry. Returns `error.ImportOutsidePkgPath` when the resolved
/// path escapes the package root; file-system errors from open/stat/read
/// propagate to the caller.
pub fn embedFile(mod: *Module, cur_file: *File, rel_file_path: []const u8) !*EmbedFile {
    const gpa = mod.gpa;

    // The resolved path is used as the key in the table, to detect if
    // a file refers to the same as another, despite different relative paths.
    const cur_pkg_dir_path = cur_file.pkg.root_src_directory.path orelse ".";
    const resolved_path = try std.fs.path.resolve(gpa, &[_][]const u8{
        cur_pkg_dir_path, cur_file.sub_file_path, "..", rel_file_path,
    });
    var keep_resolved_path = false;
    defer if (!keep_resolved_path) gpa.free(resolved_path);

    const gop = try mod.embed_table.getOrPut(gpa, resolved_path);
    // BUGFIX: if any error occurs after the key has been inserted (e.g.
    // `openFile` fails with FileNotFound), the table must not keep an entry
    // whose value is still undefined — a later `@embedFile` of the same path
    // would hit `found_existing` and dereference garbage. Remove the entry
    // and let the `defer` above reclaim the key.
    errdefer {
        std.debug.assert(mod.embed_table.remove(resolved_path));
        keep_resolved_path = false;
    }
    if (gop.found_existing) return gop.value_ptr.*;
    keep_resolved_path = true; // It's now owned by embed_table.

    const new_file = try gpa.create(EmbedFile);
    errdefer gpa.destroy(new_file);

    // Reject files outside the package root.
    const resolved_root_path = try std.fs.path.resolve(gpa, &[_][]const u8{cur_pkg_dir_path});
    defer gpa.free(resolved_root_path);
    if (!mem.startsWith(u8, resolved_path, resolved_root_path)) {
        return error.ImportOutsidePkgPath;
    }
    // +1 for the directory separator here.
    const sub_file_path = try gpa.dupe(u8, resolved_path[resolved_root_path.len + 1 ..]);
    errdefer gpa.free(sub_file_path);

    var file = try cur_file.pkg.root_src_directory.handle.openFile(sub_file_path, .{});
    defer file.close();

    const stat = try file.stat();
    // Alignment 1, sentinel 0: produces the NUL-terminated `bytes` slice.
    const bytes = try file.readToEndAllocOptions(gpa, std.math.maxInt(u32), stat.size, 1, 0);

    log.debug("new embedFile. resolved_root_path={s}, resolved_path={s}, sub_file_path={s}, rel_file_path={s}", .{
        resolved_root_path, resolved_path, sub_file_path, rel_file_path,
    });

    gop.value_ptr.* = new_file;
    new_file.* = .{
        .sub_file_path = sub_file_path,
        .bytes = bytes,
        .stat_size = stat.size,
        .stat_inode = stat.inode,
        .stat_mtime = stat.mtime,
        .pkg = cur_file.pkg,
        .owner_decl = undefined, // Set by Sema immediately after this function returns.
    };
    return new_file;
}
/// Stats the file backing `embed_file`; if size/mtime/inode differ from the
/// recorded snapshot, re-reads the contents (freeing the old buffer), updates
/// the snapshot, and queues an `update_embed_file` job under the Compilation
/// mutex. File-system errors propagate to the caller.
pub fn detectEmbedFileUpdate(mod: *Module, embed_file: *EmbedFile) !void {
    var file = try embed_file.pkg.root_src_directory.handle.openFile(embed_file.sub_file_path, .{});
    defer file.close();

    const new_stat = try file.stat();

    const changed = new_stat.size != embed_file.stat_size or
        new_stat.mtime != embed_file.stat_mtime or
        new_stat.inode != embed_file.stat_inode;
    if (!changed) return;

    const gpa = mod.gpa;
    // Alignment 1, sentinel 0: matches the [:0]const u8 type of `bytes`.
    const new_bytes = try file.readToEndAllocOptions(gpa, std.math.maxInt(u32), new_stat.size, 1, 0);
    gpa.free(embed_file.bytes);
    embed_file.bytes = new_bytes;
    embed_file.stat_size = new_stat.size;
    embed_file.stat_mtime = new_stat.mtime;
    embed_file.stat_inode = new_stat.inode;

    const lock = mod.comp.mutex.acquire();
    defer lock.release();
    try mod.comp.work_queue.writeItem(.{ .update_embed_file = embed_file });
}
pub fn scanNamespace(
mod: *Module,
namespace: *Namespace,

View File

@ -6467,6 +6467,45 @@ fn zirImport(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.
return sema.addConstant(file_root_decl.ty, file_root_decl.val);
}
/// Implements the `@embedFile` builtin: resolves the operand to a comptime
/// string, loads (or re-uses) the file via `Module.embedFile`, and materializes
/// the contents as an anonymous Decl of type `[N:0]u8`, returning a reference
/// to that Decl.
fn zirEmbedFile(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
    const tracy = trace(@src());
    defer tracy.end();

    const mod = sema.mod;
    const inst_data = sema.code.instructions.items(.data)[inst].un_node;
    const src = inst_data.src();
    const name = try sema.resolveConstString(block, src, inst_data.operand);

    const embed_file = mod.embedFile(block.getFileScope(), name) catch |err| switch (err) {
        error.ImportOutsidePkgPath => {
            return sema.fail(block, src, "embed of file outside package path: '{s}'", .{name});
        },
        // BUGFIX: OOM is not a property of the user's code; it must propagate
        // as error.OutOfMemory instead of being recorded as a (cached)
        // "unable to open" compile error.
        error.OutOfMemory => return error.OutOfMemory,
        else => {
            // TODO: these errors are file system errors; make sure an update() will
            // retry this and not cache the file system error, which may be transient.
            return sema.fail(block, src, "unable to open '{s}': {s}", .{ name, @errorName(err) });
        },
    };

    var anon_decl = try block.startAnonDecl();
    defer anon_decl.deinit();

    // `bytes` is [:0]const u8, so indexing one past len reads the sentinel.
    const bytes_including_null = embed_file.bytes[0 .. embed_file.bytes.len + 1];

    // TODO instead of using `Value.Tag.bytes`, create a new value tag for pointing at
    // a `*Module.EmbedFile`. The purpose of this would be:
    // - If only the length is read and the bytes are not inspected by comptime code,
    //   there can be an optimization where the codegen backend does a copy_file_range
    //   into the final binary, and never loads the data into memory.
    // - When a Decl is destroyed, it can free the `*Module.EmbedFile`.
    embed_file.owner_decl = try anon_decl.finish(
        try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), embed_file.bytes.len),
        try Value.Tag.bytes.create(anon_decl.arena(), bytes_including_null),
    );

    return sema.analyzeDeclRef(embed_file.owner_decl);
}
fn zirRetErrValueCode(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
_ = block;
_ = inst;
@ -9020,12 +9059,6 @@ fn zirBoolToInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A
return block.addUnOp(.bool_to_int, operand);
}
// NOTE(review): this is the pre-existing placeholder shown as *removed* by
// this diff; it is superseded by the full `zirEmbedFile` implementation
// added earlier in the same change.
fn zirEmbedFile(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
    const inst_data = sema.code.instructions.items(.data)[inst].un_node;
    const src = inst_data.src();
    return sema.fail(block, src, "TODO: Sema.zirEmbedFile", .{});
}
fn zirErrorName(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
const inst_data = sema.code.instructions.items(.data)[inst].un_node;
const src = inst_data.src();