From b0105029caafda4668f788387270ee8fcc1ac175 Mon Sep 17 00:00:00 2001 From: Jakub Konka Date: Fri, 26 Mar 2021 21:39:07 +0100 Subject: [PATCH] zld: refactor object and archive parsing --- src/link/MachO/Archive.zig | 197 ++++++++++++++++--------------------- src/link/MachO/Object.zig | 99 ++++++++++--------- src/link/MachO/Zld.zig | 129 ++++++++++++++++++++---- 3 files changed, 241 insertions(+), 184 deletions(-) diff --git a/src/link/MachO/Archive.zig b/src/link/MachO/Archive.zig index debf22dead..be6c4af284 100644 --- a/src/link/MachO/Archive.zig +++ b/src/link/MachO/Archive.zig @@ -9,16 +9,14 @@ const mem = std.mem; const Allocator = mem.Allocator; const Object = @import("Object.zig"); -const parseName = @import("Zld.zig").parseName; usingnamespace @import("commands.zig"); allocator: *Allocator, -file: fs.File, -header: ar_hdr, -name: []u8, - -objects: std.ArrayListUnmanaged(Object) = .{}, +arch: ?std.Target.Cpu.Arch = null, +file: ?fs.File = null, +header: ?ar_hdr = null, +name: ?[]u8 = null, /// Parsed table of contents. /// Each symbol name points to a list of all definition @@ -29,14 +27,14 @@ toc: std.StringArrayHashMapUnmanaged(std.ArrayListUnmanaged(u32)) = .{}, // `struct ar_hdr', and as many bytes of member file data as its `ar_size' // member indicates, for each member file. /// String that begins an archive file. -const ARMAG: *const [SARMAG:0]u8 = "!\n"; +pub const ARMAG: *const [SARMAG:0]u8 = "!\n"; /// Size of that string. -const SARMAG: u4 = 8; +pub const SARMAG: u4 = 8; /// String in ar_fmag at the end of each header. -const ARFMAG: *const [2:0]u8 = "`\n"; +pub const ARFMAG: *const [2:0]u8 = "`\n"; -const ar_hdr = extern struct { +pub const ar_hdr = extern struct { /// Member file name, sometimes / terminated. ar_name: [16]u8, @@ -87,64 +85,91 @@ const ar_hdr = extern struct { } }; +pub fn init(allocator: *Allocator) Archive { + return .{ + .allocator = allocator, + }; +} + pub fn deinit(self: *Archive) void { - self.allocator.free(self.name); - for (self.objects.items) |*object| { - object.deinit(); - } - self.objects.deinit(self.allocator); for (self.toc.items()) |*entry| { self.allocator.free(entry.key); entry.value.deinit(self.allocator); } self.toc.deinit(self.allocator); + + if (self.name) |n| { + self.allocator.free(n); + } } -/// Caller owns the returned Archive instance and is responsible for calling -/// `deinit` to free allocated memory. -pub fn initFromFile(allocator: *Allocator, arch: std.Target.Cpu.Arch, ar_name: []const u8, file: fs.File) !Archive { - var reader = file.reader(); - var magic = try readMagic(allocator, reader); - defer allocator.free(magic); +pub fn closeFile(self: Archive) void { + if (self.file) |f| { + f.close(); + } +} - if (!mem.eql(u8, magic, ARMAG)) { - // Reset file cursor. - try file.seekTo(0); - return error.NotArchive; +pub fn parse(self: *Archive) !void { + var reader = self.file.?.reader(); + const magic = try reader.readBytesNoEof(SARMAG); + + if (!mem.eql(u8, &magic, ARMAG)) { + log.err("invalid magic: expected '{s}', found '{s}'", .{ ARMAG, magic }); + return error.MalformedArchive; } - const header = try reader.readStruct(ar_hdr); + self.header = try reader.readStruct(ar_hdr); - if (!mem.eql(u8, &header.ar_fmag, ARFMAG)) + if (!mem.eql(u8, &self.header.?.ar_fmag, ARFMAG)) { + log.err("invalid header delimiter: expected '{s}', found '{s}'", .{ ARFMAG, self.header.?.ar_fmag }); return error.MalformedArchive; + } - var embedded_name = try getName(allocator, header, reader); - log.debug("parsing archive '{s}' at '{s}'", .{ embedded_name, ar_name }); - defer allocator.free(embedded_name); - - var name = try allocator.dupe(u8, ar_name); - var self = Archive{ - .allocator = allocator, - .file = file, - .header = header, - .name = name, - }; + var embedded_name = try parseName(self.allocator, self.header.?, reader); + log.warn("parsing archive '{s}' at '{s}'", .{ embedded_name, self.name.? }); + defer self.allocator.free(embedded_name); try self.parseTableOfContents(reader); - return self; + try reader.context.seekTo(0); +} + +fn parseName(allocator: *Allocator, header: ar_hdr, reader: anytype) ![]u8 { + const name_or_length = try header.nameOrLength(); + var name: []u8 = undefined; + switch (name_or_length) { + .Name => |n| { + name = try allocator.dupe(u8, n); + }, + .Length => |len| { + var n = try allocator.alloc(u8, len); + defer allocator.free(n); + try reader.readNoEof(n); + const actual_len = mem.indexOfScalar(u8, n, @as(u8, 0)) orelse n.len; + name = try allocator.dupe(u8, n[0..actual_len]); + }, + } + return name; } fn parseTableOfContents(self: *Archive, reader: anytype) !void { const symtab_size = try reader.readIntLittle(u32); var symtab = try self.allocator.alloc(u8, symtab_size); defer self.allocator.free(symtab); - try reader.readNoEof(symtab); + + reader.readNoEof(symtab) catch { + log.err("incomplete symbol table: expected symbol table of length 0x{x}", .{symtab_size}); + return error.MalformedArchive; + }; const strtab_size = try reader.readIntLittle(u32); var strtab = try self.allocator.alloc(u8, strtab_size); defer self.allocator.free(strtab); - try reader.readNoEof(strtab); + + reader.readNoEof(strtab) catch { + log.err("incomplete symbol table: expected string table of length 0x{x}", .{strtab_size}); + return error.MalformedArchive; + }; var symtab_stream = std.io.fixedBufferStream(symtab); var symtab_reader = symtab_stream.reader(); @@ -169,85 +194,29 @@ fn parseTableOfContents(self: *Archive, reader: anytype) !void { } } -fn readObject(self: *Archive, arch: std.Target.Cpu.Arch, ar_name: []const u8, reader: anytype) !void { +/// Caller owns the Object instance. +pub fn parseObject(self: Archive, offset: u32) !Object { + var reader = self.file.?.reader(); + try reader.context.seekTo(offset); + const object_header = try reader.readStruct(ar_hdr); - if (!mem.eql(u8, &object_header.ar_fmag, ARFMAG)) + if (!mem.eql(u8, &object_header.ar_fmag, ARFMAG)) { + log.err("invalid header delimiter: expected '{s}', found '{s}'", .{ ARFMAG, object_header.ar_fmag }); return error.MalformedArchive; - - var object_name = try getName(self.allocator, object_header, reader); - log.debug("extracting object '{s}' from archive '{s}'", .{ object_name, self.name }); - - const offset = @intCast(u32, try reader.context.getPos()); - const header = try reader.readStruct(macho.mach_header_64); - - const this_arch: std.Target.Cpu.Arch = switch (header.cputype) { - macho.CPU_TYPE_ARM64 => .aarch64, - macho.CPU_TYPE_X86_64 => .x86_64, - else => |value| { - log.err("unsupported cpu architecture 0x{x}", .{value}); - return error.UnsupportedCpuArchitecture; - }, - }; - if (this_arch != arch) { - log.err("mismatched cpu architecture: found {s}, expected {s}", .{ this_arch, arch }); - return error.MismatchedCpuArchitecture; } - // TODO Implement std.fs.File.clone() or similar. - var new_file = try fs.cwd().openFile(ar_name, .{}); - var object = Object{ - .allocator = self.allocator, - .name = object_name, - .ar_name = try mem.dupe(self.allocator, u8, ar_name), - .file = new_file, - .header = header, - }; + const object_name = try parseName(self.allocator, object_header, reader); + log.warn("extracting object '{s}' from archive '{s}'", .{ object_name, self.name.? }); - try object.readLoadCommands(reader, .{ .offset = offset }); + var object = Object.init(self.allocator); + object.arch = self.arch.?; + object.file = try fs.cwd().openFile(self.name.?, .{}); + object.name = object_name; + object.file_offset = @intCast(u32, try reader.context.getPos()); + try object.parse(); - if (object.symtab_cmd_index != null) { - try object.readSymtab(); - try object.readStrtab(); - } + try reader.context.seekTo(0); - if (object.data_in_code_cmd_index != null) try object.readDataInCode(); - - log.debug("\n\n", .{}); - log.debug("{s} defines symbols", .{object.name}); - for (object.symtab.items) |sym| { - const symname = object.getString(sym.n_strx); - log.debug("'{s}': {}", .{ symname, sym }); - } - - try self.objects.append(self.allocator, object); -} - -fn readMagic(allocator: *Allocator, reader: anytype) ![]u8 { - var magic = std.ArrayList(u8).init(allocator); - try magic.ensureCapacity(SARMAG); - var i: usize = 0; - while (i < SARMAG) : (i += 1) { - const next = try reader.readByte(); - magic.appendAssumeCapacity(next); - } - return magic.toOwnedSlice(); -} - -fn getName(allocator: *Allocator, header: ar_hdr, reader: anytype) ![]u8 { - const name_or_length = try header.nameOrLength(); - var name: []u8 = undefined; - switch (name_or_length) { - .Name => |n| { - name = try allocator.dupe(u8, n); - }, - .Length => |len| { - var n = try allocator.alloc(u8, len); - defer allocator.free(n); - try reader.readNoEof(n); - const actual_len = mem.indexOfScalar(u8, n, @as(u8, 0)) orelse n.len; - name = try allocator.dupe(u8, n[0..actual_len]); - }, - } - return name; + return object; } diff --git a/src/link/MachO/Object.zig b/src/link/MachO/Object.zig index 45cafb9c2f..ac160288b5 100644 --- a/src/link/MachO/Object.zig +++ b/src/link/MachO/Object.zig @@ -15,11 +15,11 @@ const parseName = @import("Zld.zig").parseName; usingnamespace @import("commands.zig"); allocator: *Allocator, -file: fs.File, -name: []u8, -ar_name: ?[]u8 = null, - -header: macho.mach_header_64, +arch: ?std.Target.Cpu.Arch = null, +header: ?macho.mach_header_64 = null, +file: ?fs.File = null, +file_offset: ?u32 = null, +name: ?[]u8 = null, load_commands: std.ArrayListUnmanaged(LoadCommand) = .{}, @@ -42,6 +42,12 @@ strtab: std.ArrayListUnmanaged(u8) = .{}, data_in_code_entries: std.ArrayListUnmanaged(macho.data_in_code_entry) = .{}, +pub fn init(allocator: *Allocator) Object { + return .{ + .allocator = allocator, + }; +} + pub fn deinit(self: *Object) void { for (self.load_commands.items) |*lc| { lc.deinit(self.allocator); @@ -50,25 +56,32 @@ pub fn deinit(self: *Object) void { self.symtab.deinit(self.allocator); self.strtab.deinit(self.allocator); self.data_in_code_entries.deinit(self.allocator); - self.allocator.free(self.name); - if (self.ar_name) |v| { - self.allocator.free(v); + + if (self.name) |n| { + self.allocator.free(n); } } -/// Caller owns the returned Object instance and is responsible for calling -/// `deinit` to free allocated memory. -pub fn initFromFile(allocator: *Allocator, arch: std.Target.Cpu.Arch, name: []const u8, file: fs.File) !Object { - var reader = file.reader(); - const header = try reader.readStruct(macho.mach_header_64); +pub fn closeFile(self: Object) void { + if (self.file) |f| { + f.close(); + } +} - if (header.filetype != macho.MH_OBJECT) { - // Reset file cursor. - try file.seekTo(0); - return error.NotObject; +pub fn parse(self: *Object) !void { + var reader = self.file.?.reader(); + if (self.file_offset) |offset| { + try reader.context.seekTo(offset); } - const this_arch: std.Target.Cpu.Arch = switch (header.cputype) { + self.header = try reader.readStruct(macho.mach_header_64); + + if (self.header.?.filetype != macho.MH_OBJECT) { + log.err("invalid filetype: expected 0x{x}, found 0x{x}", .{ macho.MH_OBJECT, self.header.?.filetype }); + return error.MalformedObject; + } + + const this_arch: std.Target.Cpu.Arch = switch (self.header.?.cputype) { macho.CPU_TYPE_ARM64 => .aarch64, macho.CPU_TYPE_X86_64 => .x86_64, else => |value| { @@ -76,35 +89,22 @@ pub fn initFromFile(allocator: *Allocator, arch: std.Target.Cpu.Arch, name: []co return error.UnsupportedCpuArchitecture; }, }; - if (this_arch != arch) { - log.err("mismatched cpu architecture: found {s}, expected {s}", .{ this_arch, arch }); + if (this_arch != self.arch.?) { + log.err("mismatched cpu architecture: expected {s}, found {s}", .{ self.arch.?, this_arch }); return error.MismatchedCpuArchitecture; } - var self = Object{ - .allocator = allocator, - .name = try allocator.dupe(u8, name), - .file = file, - .header = header, - }; - - try self.readLoadCommands(reader, .{}); + try self.readLoadCommands(reader); if (self.symtab_cmd_index != null) try self.parseSymtab(); if (self.data_in_code_cmd_index != null) try self.readDataInCode(); - - return self; } -pub const ReadOffset = struct { - offset: ?u32 = null, -}; - -pub fn readLoadCommands(self: *Object, reader: anytype, offset: ReadOffset) !void { - const offset_mod = offset.offset orelse 0; - try self.load_commands.ensureCapacity(self.allocator, self.header.ncmds); +pub fn readLoadCommands(self: *Object, reader: anytype) !void { + const offset = self.file_offset orelse 0; + try self.load_commands.ensureCapacity(self.allocator, self.header.?.ncmds); var i: u16 = 0; - while (i < self.header.ncmds) : (i += 1) { + while (i < self.header.?.ncmds) : (i += 1) { var cmd = try LoadCommand.read(self.allocator, reader); switch (cmd.cmd()) { macho.LC_SEGMENT_64 => { @@ -132,17 +132,18 @@ pub fn readLoadCommands(self: *Object, reader: anytype, offset: ReadOffset) !voi } } - sect.offset += offset_mod; - if (sect.reloff > 0) - sect.reloff += offset_mod; + sect.offset += offset; + if (sect.reloff > 0) { + sect.reloff += offset; + } } - seg.inner.fileoff += offset_mod; + seg.inner.fileoff += offset; }, macho.LC_SYMTAB => { self.symtab_cmd_index = i; - cmd.Symtab.symoff += offset_mod; - cmd.Symtab.stroff += offset_mod; + cmd.Symtab.symoff += offset; + cmd.Symtab.stroff += offset; }, macho.LC_DYSYMTAB => { self.dysymtab_cmd_index = i; @@ -152,7 +153,7 @@ pub fn readLoadCommands(self: *Object, reader: anytype, offset: ReadOffset) !voi }, macho.LC_DATA_IN_CODE => { self.data_in_code_cmd_index = i; - cmd.LinkeditData.dataoff += offset_mod; + cmd.LinkeditData.dataoff += offset; }, else => { log.debug("Unknown load command detected: 0x{x}.", .{cmd.cmd()}); @@ -168,7 +169,7 @@ pub fn parseSymtab(self: *Object) !void { var symtab = try self.allocator.alloc(u8, @sizeOf(macho.nlist_64) * symtab_cmd.nsyms); defer self.allocator.free(symtab); - _ = try self.file.preadAll(symtab, symtab_cmd.symoff); + _ = try self.file.?.preadAll(symtab, symtab_cmd.symoff); try self.symtab.ensureCapacity(self.allocator, symtab_cmd.nsyms); var stream = std.io.fixedBufferStream(symtab); @@ -187,7 +188,7 @@ pub fn parseSymtab(self: *Object) !void { var strtab = try self.allocator.alloc(u8, symtab_cmd.strsize); defer self.allocator.free(strtab); - _ = try self.file.preadAll(strtab, symtab_cmd.stroff); + _ = try self.file.?.preadAll(strtab, symtab_cmd.stroff); try self.strtab.appendSlice(self.allocator, strtab); } @@ -200,7 +201,7 @@ pub fn readSection(self: Object, allocator: *Allocator, index: u16) ![]u8 { const seg = self.load_commands.items[self.segment_cmd_index.?].Segment; const sect = seg.sections.items[index]; var buffer = try allocator.alloc(u8, sect.size); - _ = try self.file.preadAll(buffer, sect.offset); + _ = try self.file.?.preadAll(buffer, sect.offset); return buffer; } @@ -211,7 +212,7 @@ pub fn readDataInCode(self: *Object) !void { var buffer = try self.allocator.alloc(u8, data_in_code.datasize); defer self.allocator.free(buffer); - _ = try self.file.preadAll(buffer, data_in_code.dataoff); + _ = try self.file.?.preadAll(buffer, data_in_code.dataoff); var stream = io.fixedBufferStream(buffer); var reader = stream.reader(); diff --git a/src/link/MachO/Zld.zig b/src/link/MachO/Zld.zig index 30d6b56c4b..b75cdb18cc 100644 --- a/src/link/MachO/Zld.zig +++ b/src/link/MachO/Zld.zig @@ -225,14 +225,14 @@ pub fn deinit(self: *Zld) void { self.undefs.deinit(self.allocator); } -pub fn closeFiles(self: *Zld) void { - for (self.objects.items) |*object| { - object.file.close(); +pub fn closeFiles(self: Zld) void { + for (self.objects.items) |object| { + object.closeFile(); } - for (self.archives.items) |*archive| { - archive.file.close(); + for (self.archives.items) |archive| { + archive.closeFile(); } - if (self.file) |*f| f.close(); + if (self.file) |f| f.close(); } pub fn link(self: *Zld, files: []const []const u8, out_path: []const u8) !void { @@ -272,6 +272,7 @@ pub fn link(self: *Zld, files: []const []const u8, out_path: []const u8) !void { try self.populateMetadata(); try self.parseInputFiles(files); + try self.resolveSymbols(); self.printSymtab(); // try self.sortSections(); // try self.allocateTextSegment(); @@ -284,31 +285,76 @@ pub fn link(self: *Zld, files: []const []const u8, out_path: []const u8) !void { } fn parseInputFiles(self: *Zld, files: []const []const u8) !void { + const Input = struct { + kind: enum { + object, + archive, + }, + file: fs.File, + name: []const u8, + }; + var classified = std.ArrayList(Input).init(self.allocator); + defer classified.deinit(); + + // First, classify input files as either object or archive. for (files) |file_name| { const file = try fs.cwd().openFile(file_name, .{}); try_object: { - var object = Object.initFromFile(self.allocator, self.arch.?, file_name, file) catch |err| switch (err) { - error.NotObject => break :try_object, - else => |e| return e, - }; - const index = @intCast(u16, self.objects.items.len); - try self.objects.append(self.allocator, object); - try self.resolveSymbols(index); + const header = try file.reader().readStruct(macho.mach_header_64); + if (header.filetype != macho.MH_OBJECT) { + try file.seekTo(0); + break :try_object; + } + + try file.seekTo(0); + try classified.append(.{ + .kind = .object, + .file = file, + .name = file_name, + }); continue; } try_archive: { - var archive = Archive.initFromFile(self.allocator, self.arch.?, file_name, file) catch |err| switch (err) { - error.NotArchive => break :try_archive, - else => |e| return e, - }; - try self.archives.append(self.allocator, archive); + const magic = try file.reader().readBytesNoEof(Archive.SARMAG); + if (!mem.eql(u8, &magic, Archive.ARMAG)) { + try file.seekTo(0); + break :try_archive; + } + + try file.seekTo(0); + try classified.append(.{ + .kind = .archive, + .file = file, + .name = file_name, + }); continue; } - log.err("unexpected file type: expected object '.o' or archive '.a': {s}", .{file_name}); - return error.UnexpectedInputFileType; + log.warn("unexpected input file of unknown type '{s}'", .{file_name}); + } + + // Based on our classification, proceed with parsing. + for (classified.items) |input| { + switch (input.kind) { + .object => { + var object = Object.init(self.allocator); + object.arch = self.arch.?; + object.name = try self.allocator.dupe(u8, input.name); + object.file = input.file; + try object.parse(); + try self.objects.append(self.allocator, object); + }, + .archive => { + var archive = Archive.init(self.allocator); + archive.arch = self.arch.?; + archive.name = try self.allocator.dupe(u8, input.name); + archive.file = input.file; + try archive.parse(); + try self.archives.append(self.allocator, archive); + }, + } } } @@ -1153,7 +1199,7 @@ fn writeStubInStubHelper(self: *Zld, index: u32) !void { try self.file.?.pwriteAll(code, stub_off); } -fn resolveSymbols(self: *Zld, object_id: u16) !void { +fn resolveSymbolsInObject(self: *Zld, object_id: u16) !void { const object = self.objects.items[object_id]; log.warn("resolving symbols in '{s}'", .{object.name}); @@ -1200,6 +1246,47 @@ fn resolveSymbols(self: *Zld, object_id: u16) !void { } } +fn resolveSymbols(self: *Zld) !void { + // First pass, resolve symbols in provided objects. + for (self.objects.items) |object, object_id| { + try self.resolveSymbolsInObject(@intCast(u16, object_id)); + } + + var next: usize = 0; + while (true) { + var archive = &self.archives.items[next]; + var hit: bool = false; + + for (self.undefs.items()) |entry| { + const sym_name = entry.key; + + // Check if the entry exists in a static archive. + const offsets = archive.toc.get(sym_name) orelse { + // No hit. + continue; + }; + assert(offsets.items.len > 0); + + const object = try archive.parseObject(offsets.items[0]); + const object_id = @intCast(u16, self.objects.items.len); + try self.objects.append(self.allocator, object); + try self.resolveSymbolsInObject(object_id); + + hit = true; + break; + } + + if (!hit) { + // Next archive. + next += 1; + if (next == self.archives.items.len) { + break; + } + archive = &self.archives.items[next]; + } + } +} + fn doRelocs(self: *Zld) !void { for (self.objects.items) |object, object_id| { log.debug("\n\n", .{});