From dbde746f9d95f2dbc8872e2d6283c2db64ac7519 Mon Sep 17 00:00:00 2001
From: Jakub Konka
Date: Wed, 13 Sep 2023 19:05:22 +0200
Subject: [PATCH] elf: parse archives

---
Notes (text between the three-dash line and the diffstat is commentary and
is not part of the commit message):

* Elf.zig: the compiler-rt path, when present, is appended to the positional
  inputs. A positional that is not an object is routed through parseLibrary,
  which accepts only `ar` archives (parseArchive) and otherwise returns
  error.UnknownFileType.
* Elf.zig: resolveSymbols becomes fallible and runs in two passes: resolve,
  markLive, reset globals, drop objects that are not alive, dedup comdat
  groups, resolve again.
* Archive.zig (new file): walks the `ar` member headers and copies every
  member into an Object created with `alive = false`; parseArchive then sets
  `alive = must_link`.
* Object.zig / ZigModule.zig / file.zig: resetGlobals preserves `name_offset`,
  and File gains resolveSymbols / resetGlobals dispatchers.

 src/link/Elf.zig           | 148 +++++++++++++++++++++++++++++++----
 src/link/Elf/Archive.zig   | 153 +++++++++++++++++++++++++++++++++++++
 src/link/Elf/Object.zig    |   6 +-
 src/link/Elf/ZigModule.zig |   9 +++
 src/link/Elf/file.zig      |  13 ++++
 5 files changed, 312 insertions(+), 17 deletions(-)
 create mode 100644 src/link/Elf/Archive.zig

diff --git a/src/link/Elf.zig b/src/link/Elf.zig
index 6b9509d9a3..0867fe017b 100644
--- a/src/link/Elf.zig
+++ b/src/link/Elf.zig
@@ -1052,13 +1052,6 @@ pub fn flushModule(self: *Elf, comp: *Compilation, prog_node: *std.Progress.Node
     const directory = self.base.options.emit.?.directory; // Just an alias to make it shorter to type.
     const full_out_path = try directory.join(arena, &[_][]const u8{self.base.options.emit.?.sub_path});
 
-    const compiler_rt_path: ?[]const u8 = blk: {
-        if (comp.compiler_rt_lib) |x| break :blk x.full_object_path;
-        if (comp.compiler_rt_obj) |x| break :blk x.full_object_path;
-        break :blk null;
-    };
-    _ = compiler_rt_path;
-
     // Here we will parse input positional and library files (if referenced).
     // This will roughly match in any linker backend we support.
     var positionals = std.ArrayList(Compilation.LinkObject).init(arena);
@@ -1084,6 +1077,15 @@ pub fn flushModule(self: *Elf, comp: *Compilation, prog_node: *std.Progress.Node
         try positionals.append(.{ .path = key.status.success.object_path });
     }
 
+    const compiler_rt_path: ?[]const u8 = blk: {
+        if (comp.compiler_rt_lib) |x| break :blk x.full_object_path;
+        if (comp.compiler_rt_obj) |x| break :blk x.full_object_path;
+        break :blk null;
+    };
+    if (compiler_rt_path) |path| {
+        try positionals.append(.{ .path = path });
+    }
+
     for (positionals.items) |obj| {
         const in_file = try std.fs.cwd().openFile(obj.path, .{});
         defer in_file.close();
@@ -1140,7 +1142,7 @@ pub fn flushModule(self: *Elf, comp: *Compilation, prog_node: *std.Progress.Node
     // input Object files.
     // Any qualifing unresolved symbol will be upgraded to an absolute, weak
     // symbol for potential resolution at load-time.
-    self.resolveSymbols();
+    try self.resolveSymbols();
     self.markImportsExports();
     self.claimUnresolved();
 
@@ -1403,6 +1405,7 @@ const ParseError = error{
     EndOfStream,
     FileSystem,
     NotSupported,
+    InvalidCharacter,
 } || std.os.SeekError || std.fs.File.OpenError || std.fs.File.ReadError;
 
 fn parsePositional(
@@ -1414,10 +1417,32 @@ fn parsePositional(
 ) ParseError!void {
     const tracy = trace(@src());
     defer tracy.end();
-    _ = must_link;
 
     if (Object.isObject(in_file)) {
         try self.parseObject(in_file, path, ctx);
+    } else {
+        try self.parseLibrary(in_file, path, .{
+            .path = null,
+            .needed = false,
+            .weak = false,
+        }, must_link, ctx);
+    }
+}
+
+fn parseLibrary(
+    self: *Elf,
+    in_file: std.fs.File,
+    path: []const u8,
+    lib: link.SystemLib,
+    must_link: bool,
+    ctx: *ParseErrorCtx,
+) ParseError!void {
+    const tracy = trace(@src());
+    defer tracy.end();
+    _ = lib;
+
+    if (Archive.isArchive(in_file)) {
+        try self.parseArchive(in_file, path, must_link, ctx);
     } else return error.UnknownFileType;
 }
 
@@ -1442,15 +1467,109 @@ fn parseObject(self: *Elf, in_file: std.fs.File, path: []const u8, ctx: *ParseEr
     if (ctx.detected_cpu_arch != self.base.options.target.cpu.arch) return error.InvalidCpuArch;
 }
 
-fn resolveSymbols(self: *Elf) void {
-    if (self.zig_module_index) |index| {
-        const zig_module = self.file(index).?.zig_module;
-        zig_module.resolveSymbols(self);
+fn parseArchive(
+    self: *Elf,
+    in_file: std.fs.File,
+    path: []const u8,
+    must_link: bool,
+    ctx: *ParseErrorCtx,
+) ParseError!void {
+    const tracy = trace(@src());
+    defer tracy.end();
+
+    const gpa = self.base.allocator;
+    const data = try in_file.readToEndAlloc(gpa, std.math.maxInt(u32));
+    var archive = Archive{ .path = path, .data = data };
+    defer archive.deinit(gpa);
+    try archive.parse(self);
+
+    for (archive.objects.items) |extracted| {
+        const index = @as(File.Index, @intCast(try self.files.addOne(gpa)));
+        self.files.set(index, .{ .object = extracted });
+        const object = &self.files.items(.data)[index].object;
+        object.index = index;
+        object.alive = must_link;
+        try object.parse(self);
+        try self.objects.append(gpa, index);
+
+        ctx.detected_cpu_arch = object.header.?.e_machine.toTargetCpuArch().?;
+        if (ctx.detected_cpu_arch != self.base.options.target.cpu.arch) return error.InvalidCpuArch;
+    }
+}
+
+/// When resolving symbols, we approach the problem similarly to `mold`.
+/// 1. Resolve symbols across all objects (including those preemptively extracted archives).
+/// 2. Resolve symbols across all shared objects.
+/// 3. Mark live objects (see `Elf.markLive`)
+/// 4. Reset state of all resolved globals since we will redo this bit on the pruned set.
+/// 5. Remove references to dead objects/shared objects
+/// 6. Re-run symbol resolution on pruned objects and shared objects sets.
+fn resolveSymbols(self: *Elf) error{Overflow}!void {
+    // Resolve symbols in the ZigModule. For now, we assume that it's always live.
+    if (self.zig_module_index) |index| self.file(index).?.resolveSymbols(self);
+    // Resolve symbols on the set of all objects and shared objects (even if some are unneeded).
+    for (self.objects.items) |index| self.file(index).?.resolveSymbols(self);
+
+    // Mark live objects.
+    self.markLive();
+
+    // Reset state of all globals after marking live objects.
+    if (self.zig_module_index) |index| self.file(index).?.resetGlobals(self);
+    for (self.objects.items) |index| self.file(index).?.resetGlobals(self);
+
+    // Prune dead objects and shared objects.
+    var i: usize = 0;
+    while (i < self.objects.items.len) {
+        const index = self.objects.items[i];
+        if (!self.file(index).?.isAlive()) {
+            _ = self.objects.orderedRemove(i);
+        } else i += 1;
+    }
+
+    // Dedup comdat groups.
+    for (self.objects.items) |index| {
+        const object = self.file(index).?.object;
+        for (object.comdat_groups.items) |cg_index| {
+            const cg = self.comdatGroup(cg_index);
+            const cg_owner = self.comdatGroupOwner(cg.owner);
+            const owner_file_index = if (self.file(cg_owner.file)) |file_ptr|
+                file_ptr.object.index
+            else
+                std.math.maxInt(File.Index);
+            cg_owner.file = @min(owner_file_index, index);
+        }
     }
 
     for (self.objects.items) |index| {
         const object = self.file(index).?.object;
-        object.resolveSymbols(self);
+        for (object.comdat_groups.items) |cg_index| {
+            const cg = self.comdatGroup(cg_index);
+            const cg_owner = self.comdatGroupOwner(cg.owner);
+            if (cg_owner.file != index) {
+                for (try object.comdatGroupMembers(cg.shndx)) |shndx| {
+                    const atom_index = object.atoms.items[shndx];
+                    if (self.atom(atom_index)) |atom_ptr| {
+                        atom_ptr.alive = false;
+                        // atom_ptr.markFdesDead(self);
+                    }
+                }
+            }
+        }
+    }
+
+    // Re-resolve the symbols.
+    if (self.zig_module_index) |index| self.file(index).?.resolveSymbols(self);
+    for (self.objects.items) |index| self.file(index).?.resolveSymbols(self);
+}
+
+/// Traverses all objects and shared objects marking any object referenced by
+/// a live object/shared object as alive itself.
+/// This routine will prune unneeded objects extracted from archives and
+/// unneeded shared objects.
+fn markLive(self: *Elf) void {
+    for (self.objects.items) |index| {
+        const file_ptr = self.file(index).?;
+        if (file_ptr.isAlive()) file_ptr.markLive(self);
     }
 }
 
@@ -4059,6 +4178,7 @@ const synthetic_sections = @import("Elf/synthetic_sections.zig");
 
 const Air = @import("../Air.zig");
 const Allocator = std.mem.Allocator;
+const Archive = @import("Elf/Archive.zig");
 pub const Atom = @import("Elf/Atom.zig");
 const Cache = std.Build.Cache;
 const Compilation = @import("../Compilation.zig");
diff --git a/src/link/Elf/Archive.zig b/src/link/Elf/Archive.zig
new file mode 100644
index 0000000000..f34b323206
--- /dev/null
+++ b/src/link/Elf/Archive.zig
@@ -0,0 +1,153 @@
+path: []const u8,
+data: []const u8,
+
+objects: std.ArrayListUnmanaged(Object) = .{},
+strtab: []const u8 = &[0]u8{},
+
+// Archive files start with the ARMAG identifying string. Then follows a
+// `struct ar_hdr', and as many bytes of member file data as its `ar_size'
+// member indicates, for each member file.
+/// String that begins an archive file.
+pub const ARMAG: *const [SARMAG:0]u8 = "!<arch>\n";
+/// Size of that string.
+pub const SARMAG: u4 = 8;
+
+/// String in ar_fmag at the end of each header.
+const ARFMAG: *const [2:0]u8 = "`\n";
+
+const SYM64NAME: *const [7:0]u8 = "/SYM64/";
+
+const ar_hdr = extern struct {
+    /// Member file name, sometimes / terminated.
+    ar_name: [16]u8,
+
+    /// File date, decimal seconds since Epoch.
+    ar_date: [12]u8,
+
+    /// User ID, in ASCII format.
+    ar_uid: [6]u8,
+
+    /// Group ID, in ASCII format.
+    ar_gid: [6]u8,
+
+    /// File mode, in ASCII octal.
+    ar_mode: [8]u8,
+
+    /// File size, in ASCII decimal.
+    ar_size: [10]u8,
+
+    /// Always contains ARFMAG.
+    ar_fmag: [2]u8,
+
+    fn date(self: ar_hdr) !u64 {
+        const value = getValue(&self.ar_date);
+        return std.fmt.parseInt(u64, value, 10);
+    }
+
+    fn size(self: ar_hdr) !u32 {
+        const value = getValue(&self.ar_size);
+        return std.fmt.parseInt(u32, value, 10);
+    }
+
+    fn getValue(raw: []const u8) []const u8 {
+        return mem.trimRight(u8, raw, &[_]u8{@as(u8, 0x20)});
+    }
+
+    fn isStrtab(self: ar_hdr) bool {
+        return mem.eql(u8, getValue(&self.ar_name), "//");
+    }
+
+    fn isSymtab(self: ar_hdr) bool {
+        return mem.eql(u8, getValue(&self.ar_name), "/");
+    }
+};
+
+pub fn isArchive(file: std.fs.File) bool {
+    const reader = file.reader();
+    const magic = reader.readBytesNoEof(Archive.SARMAG) catch return false;
+    defer file.seekTo(0) catch {};
+    if (!mem.eql(u8, &magic, ARMAG)) return false;
+    return true;
+}
+
+pub fn deinit(self: *Archive, allocator: Allocator) void {
+    allocator.free(self.data);
+    self.objects.deinit(allocator);
+}
+
+pub fn parse(self: *Archive, elf_file: *Elf) !void {
+    const gpa = elf_file.base.allocator;
+
+    var stream = std.io.fixedBufferStream(self.data);
+    const reader = stream.reader();
+    _ = try reader.readBytesNoEof(SARMAG);
+
+    while (true) {
+        if (stream.pos % 2 != 0) {
+            stream.pos += 1;
+        }
+
+        const hdr = reader.readStruct(ar_hdr) catch break;
+
+        if (!mem.eql(u8, &hdr.ar_fmag, ARFMAG)) {
+            // TODO convert into an error
+            log.debug(
+                "{s}: invalid header delimiter: expected '{s}', found '{s}'",
+                .{ self.path, std.fmt.fmtSliceEscapeLower(ARFMAG), std.fmt.fmtSliceEscapeLower(&hdr.ar_fmag) },
+            );
+            return;
+        }
+
+        const size = try hdr.size();
+        defer {
+            _ = stream.seekBy(size) catch {};
+        }
+
+        if (hdr.isSymtab()) continue;
+        if (hdr.isStrtab()) {
+            self.strtab = self.data[stream.pos..][0..size];
+            continue;
+        }
+
+        const name = ar_hdr.getValue(&hdr.ar_name);
+
+        if (mem.eql(u8, name, "__.SYMDEF") or mem.eql(u8, name, "__.SYMDEF SORTED")) continue;
+
+        const object_name = blk: {
+            if (name[0] == '/') {
+                const off = try std.fmt.parseInt(u32, name[1..], 10);
+                break :blk self.getString(off);
+            }
+            break :blk name;
+        };
+
+        const object = Object{
+            .archive = self.path,
+            .path = try gpa.dupe(u8, object_name[0 .. object_name.len - 1]), // To account for trailing '/'
+            .data = try gpa.dupe(u8, self.data[stream.pos..][0..size]),
+            .index = undefined,
+            .alive = false,
+        };
+
+        log.debug("extracting object '{s}' from archive '{s}'", .{ object.path, self.path });
+
+        try self.objects.append(gpa, object);
+    }
+}
+
+fn getString(self: Archive, off: u32) []const u8 {
+    assert(off < self.strtab.len);
+    return mem.sliceTo(@as([*:'\n']const u8, @ptrCast(self.strtab.ptr + off)), 0);
+}
+
+const std = @import("std");
+const assert = std.debug.assert;
+const elf = std.elf;
+const fs = std.fs;
+const log = std.log.scoped(.link);
+const mem = std.mem;
+
+const Allocator = mem.Allocator;
+const Archive = @This();
+const Elf = @import("../Elf.zig");
+const Object = @import("Object.zig");
diff --git a/src/link/Elf/Object.zig b/src/link/Elf/Object.zig
index b0a6ef2a1c..32c96b8d95 100644
--- a/src/link/Elf/Object.zig
+++ b/src/link/Elf/Object.zig
@@ -485,9 +485,9 @@ pub fn claimUnresolved(self: *Object, elf_file: *Elf) void {
 pub fn resetGlobals(self: *Object, elf_file: *Elf) void {
     for (self.globals()) |index| {
         const global = elf_file.symbol(index);
-        const name = global.name;
+        const off = global.name_offset;
         global.* = .{};
-        global.name = name;
+        global.name_offset = off;
     }
 }
 
@@ -499,7 +499,7 @@ pub fn markLive(self: *Object, elf_file: *Elf) void {
         if (sym.st_bind() == elf.STB_WEAK) continue;
 
         const global = elf_file.symbol(index);
-        const file = global.getFile(elf_file) orelse continue;
+        const file = global.file(elf_file) orelse continue;
         const should_keep = sym.st_shndx == elf.SHN_UNDEF or
             (sym.st_shndx == elf.SHN_COMMON and global.elfSym(elf_file).st_shndx != elf.SHN_COMMON);
         if (should_keep and !file.isAlive()) {
diff --git a/src/link/Elf/ZigModule.zig b/src/link/Elf/ZigModule.zig
index 46a382abf9..98496a2c38 100644
--- a/src/link/Elf/ZigModule.zig
+++ b/src/link/Elf/ZigModule.zig
@@ -148,6 +148,15 @@ pub fn scanRelocs(self: *ZigModule, elf_file: *Elf, undefs: anytype) !void {
     }
 }
 
+pub fn resetGlobals(self: *ZigModule, elf_file: *Elf) void {
+    for (self.globals()) |index| {
+        const global = elf_file.symbol(index);
+        const off = global.name_offset;
+        global.* = .{};
+        global.name_offset = off;
+    }
+}
+
 pub fn updateSymtabSize(self: *ZigModule, elf_file: *Elf) void {
     for (self.locals()) |local_index| {
         const local = elf_file.symbol(local_index);
diff --git a/src/link/Elf/file.zig b/src/link/Elf/file.zig
index 2b49f43bf1..cbfc9ade60 100644
--- a/src/link/Elf/file.zig
+++ b/src/link/Elf/file.zig
@@ -62,6 +62,19 @@ pub const File = union(enum) {
         return (@as(u32, base) << 24) + file.index();
     }
 
+    pub fn resolveSymbols(file: File, elf_file: *Elf) void {
+        switch (file) {
+            inline else => |x| x.resolveSymbols(elf_file),
+        }
+    }
+
+    pub fn resetGlobals(file: File, elf_file: *Elf) void {
+        switch (file) {
+            .linker_defined => unreachable,
+            inline else => |x| x.resetGlobals(elf_file),
+        }
+    }
+
     pub fn setAlive(file: File) void {
         switch (file) {
             .zig_module, .linker_defined => {},