elf: parse GNU ld script as system lib indirection
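
Many system "libraries" are not ELF files at all but small GNU ld scripts
that redirect the linker to the actual objects. glibc's libc.so is the
canonical example; an illustrative script (paths vary by distribution)
looks like:

    /* GNU ld script */
    OUTPUT_FORMAT(elf64-x86-64)
    GROUP ( /lib/libc.so.6 /lib/libc_nonshared.a  AS_NEEDED ( /lib/ld-linux-x86-64.so.2 ) )

When a positional library is neither an object, an archive, nor a shared
object, try parsing it as an ld script and link against the libraries it
references instead.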

Jakub Konka 2023-10-18 12:38:15 +02:00
parent 533e2671c9
commit 52e0ca1312
6 changed files with 686 additions and 8 deletions

CMakeLists.txt

@@ -589,6 +589,7 @@ set(ZIG_STAGE2_SOURCES
"${CMAKE_SOURCE_DIR}/src/link/Elf.zig"
"${CMAKE_SOURCE_DIR}/src/link/Elf/Archive.zig"
"${CMAKE_SOURCE_DIR}/src/link/Elf/Atom.zig"
"${CMAKE_SOURCE_DIR}/src/link/Elf/LdScript.zig"
"${CMAKE_SOURCE_DIR}/src/link/Elf/LinkerDefined.zig"
"${CMAKE_SOURCE_DIR}/src/link/Elf/Object.zig"
"${CMAKE_SOURCE_DIR}/src/link/Elf/SharedObject.zig"

src/link/Elf.zig

@@ -1656,7 +1656,7 @@ const ParseError = error{
FileSystem,
NotSupported,
InvalidCharacter,
} || std.os.SeekError || std.fs.File.OpenError || std.fs.File.ReadError;
} || LdScript.Error || std.os.AccessError || std.os.SeekError || std.fs.File.OpenError || std.fs.File.ReadError;
fn parsePositional(
self: *Elf,
@@ -1689,7 +1689,13 @@ fn parseLibrary(
try self.parseArchive(in_file, lib.path, must_link, ctx);
} else if (SharedObject.isSharedObject(in_file)) {
try self.parseSharedObject(in_file, lib, ctx);
} else return error.UnknownFileType;
} else {
// TODO if the script has a top-level comment identifying it as GNU ld script,
// then report parse errors. Otherwise return UnknownFileType.
self.parseLdScript(in_file, lib, ctx) catch |err| switch (err) {
else => return error.UnknownFileType,
};
}
}
fn parseObject(self: *Elf, in_file: std.fs.File, path: []const u8, ctx: *ParseErrorCtx) ParseError!void {
@@ -1700,7 +1706,7 @@ fn parseObject(self: *Elf, in_file: std.fs.File, path: []const u8, ctx: *ParseEr
const data = try in_file.readToEndAlloc(gpa, std.math.maxInt(u32));
const index = @as(File.Index, @intCast(try self.files.addOne(gpa)));
self.files.set(index, .{ .object = .{
.path = path,
.path = try gpa.dupe(u8, path),
.data = data,
.index = index,
} });
@@ -1725,11 +1731,14 @@ fn parseArchive(
const gpa = self.base.allocator;
const data = try in_file.readToEndAlloc(gpa, std.math.maxInt(u32));
var archive = Archive{ .path = path, .data = data };
var archive = Archive{ .path = try gpa.dupe(u8, path), .data = data };
defer archive.deinit(gpa);
try archive.parse(self);
for (archive.objects.items) |extracted| {
const objects = try archive.objects.toOwnedSlice(gpa);
defer gpa.free(objects);
for (objects) |extracted| {
const index = @as(File.Index, @intCast(try self.files.addOne(gpa)));
self.files.set(index, .{ .object = extracted });
const object = &self.files.items(.data)[index].object;
@@ -1756,7 +1765,7 @@ fn parseSharedObject(
const data = try in_file.readToEndAlloc(gpa, std.math.maxInt(u32));
const index = @as(File.Index, @intCast(try self.files.addOne(gpa)));
self.files.set(index, .{ .shared_object = .{
.path = lib.path,
.path = try gpa.dupe(u8, lib.path),
.data = data,
.index = index,
.needed = lib.needed,
@@ -1771,6 +1780,123 @@ fn parseSharedObject(
if (ctx.detected_cpu_arch != self.base.options.target.cpu.arch) return error.InvalidCpuArch;
}
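/// Attempts to parse `in_file` as a GNU ld script, then resolves each path
/// the script references (searching the configured lib dirs) and parses the
/// resolved files as libraries in turn.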
fn parseLdScript(self: *Elf, in_file: std.fs.File, lib: SystemLib, ctx: *ParseErrorCtx) ParseError!void {
const tracy = trace(@src());
defer tracy.end();
const gpa = self.base.allocator;
const data = try in_file.readToEndAlloc(gpa, std.math.maxInt(u32));
defer gpa.free(data);
var script = LdScript{};
defer script.deinit(gpa);
try script.parse(data, self);
if (script.cpu_arch) |cpu_arch| {
ctx.detected_cpu_arch = cpu_arch;
if (ctx.detected_cpu_arch != self.base.options.target.cpu.arch) return error.InvalidCpuArch;
}
const lib_dirs = self.base.options.lib_dirs;
var arena_allocator = std.heap.ArenaAllocator.init(gpa);
defer arena_allocator.deinit();
const arena = arena_allocator.allocator();
var test_path = std.ArrayList(u8).init(arena);
var checked_paths = std.ArrayList([]const u8).init(arena);
for (script.args.items) |scr_obj| {
checked_paths.clearRetainingCapacity();
success: {
if (mem.startsWith(u8, scr_obj.path, "-l")) {
const lib_name = scr_obj.path["-l".len..];
// TODO I think technically we should re-use the mechanism used by the frontend here.
// Maybe we should hoist search-strategy all the way here?
for (lib_dirs) |lib_dir| {
if (!self.isStatic()) {
if (try self.accessLibPath(&test_path, &checked_paths, lib_dir, lib_name, .Dynamic))
break :success;
}
if (try self.accessLibPath(&test_path, &checked_paths, lib_dir, lib_name, .Static))
break :success;
}
try self.reportMissingLibraryError(
checked_paths.items,
"missing library dependency: GNU ld script '{s}' requires '{s}', but file not found",
.{
lib.path,
scr_obj.path,
},
);
} else {
var buffer: [fs.MAX_PATH_BYTES]u8 = undefined;
if (fs.realpath(scr_obj.path, &buffer)) |path| {
test_path.clearRetainingCapacity();
try test_path.writer().writeAll(path);
break :success;
} else |_| {}
try checked_paths.append(try gpa.dupe(u8, scr_obj.path));
for (lib_dirs) |lib_dir| {
if (try self.accessLibPath(&test_path, &checked_paths, lib_dir, scr_obj.path, null))
break :success;
}
try self.reportMissingLibraryError(
checked_paths.items,
"missing library dependency: GNU ld script '{s}' requires '{s}', but file not found",
.{
lib.path,
scr_obj.path,
},
);
}
}
const full_path = test_path.items;
const scr_file = try std.fs.cwd().openFile(full_path, .{});
defer scr_file.close();
var scr_ctx: ParseErrorCtx = .{ .detected_cpu_arch = undefined };
self.parseLibrary(scr_file, .{
.needed = scr_obj.needed,
.path = full_path,
}, false, &scr_ctx) catch |err| try self.handleAndReportParseError(full_path, err, &scr_ctx);
}
}
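/// Formats `lib_dir_path/<libprefix><lib_name><suffix>` into `test_path`,
/// records the candidate in `checked_paths`, and returns whether that file
/// exists on disk. A null `link_mode` probes the bare name with no suffix.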
fn accessLibPath(
self: *Elf,
test_path: *std.ArrayList(u8),
checked_paths: *std.ArrayList([]const u8),
lib_dir_path: []const u8,
lib_name: []const u8,
link_mode: ?std.builtin.LinkMode,
) !bool {
const sep = fs.path.sep_str;
const target = self.base.options.target;
test_path.clearRetainingCapacity();
try test_path.writer().print("{s}" ++ sep ++ "{s}{s}{s}", .{
lib_dir_path,
target.libPrefix(),
lib_name,
if (link_mode) |mode| switch (mode) {
.Static => target.staticLibSuffix(),
.Dynamic => target.dynamicLibSuffix(),
} else "",
});
try checked_paths.append(try self.base.allocator.dupe(u8, test_path.items));
fs.cwd().access(test_path.items, .{}) catch |err| switch (err) {
error.FileNotFound => return false,
else => |e| return e,
};
return true;
}
/// When resolving symbols, we approach the problem similarly to `mold`.
/// 1. Resolve symbols across all objects (including those preemptively extracted from archives).
/// 2. Resolve symbols across all shared objects.
@@ -5893,6 +6019,19 @@ fn reportUndefined(self: *Elf, undefs: anytype) !void {
}
}
fn reportMissingLibraryError(
self: *Elf,
checked_paths: []const []const u8,
comptime format: []const u8,
args: anytype,
) error{OutOfMemory}!void {
var err = try self.addErrorWithNotes(checked_paths.len);
try err.addMsg(self, format, args);
for (checked_paths) |path| {
try err.addNote(self, "tried {s}", .{path});
}
}
const ParseErrorCtx = struct {
detected_cpu_arch: std.Target.Cpu.Arch,
};
@@ -6189,7 +6328,7 @@ pub const null_shdr = elf.Elf64_Shdr{
.sh_entsize = 0,
};
const SystemLib = struct {
pub const SystemLib = struct {
needed: bool = false,
path: []const u8,
};
@@ -6235,6 +6374,7 @@ const GnuHashSection = synthetic_sections.GnuHashSection;
const GotSection = synthetic_sections.GotSection;
const GotPltSection = synthetic_sections.GotPltSection;
const HashSection = synthetic_sections.HashSection;
const LdScript = @import("Elf/LdScript.zig");
const LinkerDefined = @import("Elf/LinkerDefined.zig");
const Liveness = @import("../Liveness.zig");
const LlvmObject = @import("../codegen/llvm.zig").Object;

src/link/Elf/Archive.zig

@@ -71,6 +71,7 @@ pub fn isArchive(file: std.fs.File) bool {
}
pub fn deinit(self: *Archive, allocator: Allocator) void {
allocator.free(self.path);
allocator.free(self.data);
self.objects.deinit(allocator);
}
@@ -122,7 +123,7 @@ pub fn parse(self: *Archive, elf_file: *Elf) !void {
};
const object = Object{
.archive = self.path,
.archive = try gpa.dupe(u8, self.path),
.path = try gpa.dupe(u8, object_name[0 .. object_name.len - 1]), // To account for trailing '/'
.data = try gpa.dupe(u8, self.data[stream.pos..][0..size]),
.index = undefined,

src/link/Elf/LdScript.zig (new file, 533 lines)

@@ -0,0 +1,533 @@
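//! Parser for GNU ld scripts that act as indirection for system libraries
//! (e.g. glibc's libc.so). Only the subset needed for that use case is
//! recognized: OUTPUT_FORMAT, GROUP, and AS_NEEDED.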
cpu_arch: ?std.Target.Cpu.Arch = null,
args: std.ArrayListUnmanaged(Elf.SystemLib) = .{},
pub fn deinit(scr: *LdScript, allocator: Allocator) void {
scr.args.deinit(allocator);
}
pub const Error = error{
InvalidScript,
UnexpectedToken,
UnknownCpuArch,
OutOfMemory,
};
pub fn parse(scr: *LdScript, data: []const u8, elf_file: *Elf) Error!void {
const gpa = elf_file.base.allocator;
var tokenizer = Tokenizer{ .source = data };
var tokens = std.ArrayList(Token).init(gpa);
defer tokens.deinit();
var line_col = std.ArrayList(LineColumn).init(gpa);
defer line_col.deinit();
var line: usize = 0;
var prev_line_last_col: usize = 0;
while (true) {
const tok = tokenizer.next();
try tokens.append(tok);
const column = tok.start - prev_line_last_col;
try line_col.append(.{ .line = line, .column = column });
switch (tok.id) {
.invalid => {
// TODO errors
// elf_file.base.fatal("invalid token in ld script: '{s}' ({d}:{d})", .{
// tok.get(data),
// line,
// column,
// });
return error.InvalidScript;
},
.new_line => {
line += 1;
prev_line_last_col = tok.end;
},
.eof => break,
else => {},
}
}
var it = TokenIterator{ .tokens = tokens.items };
var parser = Parser{ .source = data, .it = &it };
var args = std.ArrayList(Elf.SystemLib).init(gpa);
scr.doParse(.{
.parser = &parser,
.args = &args,
}) catch |err| switch (err) {
error.UnexpectedToken => {
// const last_token_id = parser.it.pos - 1;
// const last_token = parser.it.get(last_token_id);
// const lcol = line_col.items[last_token_id];
// TODO errors
// elf_file.base.fatal("unexpected token in ld script: {s} : '{s}' ({d}:{d})", .{
// @tagName(last_token.id),
// last_token.get(data),
// lcol.line,
// lcol.column,
// });
return error.InvalidScript;
},
else => |e| return e,
};
scr.args = args.moveToUnmanaged();
}
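// The accepted input is a flat sequence of top-level commands, e.g.:
//   OUTPUT_FORMAT(elf64-x86-64)
//   GROUP ( /a/b/c.so.6 /a/d/e.a AS_NEEDED ( /f/g/h.so.2 ) )
// Any other command is rejected with error.UnexpectedToken.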
fn doParse(scr: *LdScript, ctx: struct {
parser: *Parser,
args: *std.ArrayList(Elf.SystemLib),
}) !void {
while (true) {
ctx.parser.skipAny(&.{ .comment, .new_line });
if (ctx.parser.maybe(.command)) |cmd_id| {
const cmd = ctx.parser.getCommand(cmd_id);
switch (cmd) {
.output_format => scr.cpu_arch = try ctx.parser.outputFormat(),
.group => try ctx.parser.group(ctx.args),
else => return error.UnexpectedToken,
}
} else break;
}
if (ctx.parser.it.next()) |tok| switch (tok.id) {
.eof => {},
else => return error.UnexpectedToken,
};
}
const LineColumn = struct {
line: usize,
column: usize,
};
const Command = enum {
output_format,
group,
as_needed,
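/// Matches `s` against the upper-cased command names ("GROUP", etc.);
/// the comparison buffers are built at comptime from the enum field names.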
fn fromString(s: []const u8) ?Command {
inline for (@typeInfo(Command).Enum.fields) |field| {
comptime var buf: [field.name.len]u8 = undefined;
inline for (field.name, 0..) |c, i| {
buf[i] = comptime std.ascii.toUpper(c);
}
if (std.mem.eql(u8, &buf, s)) return @field(Command, field.name);
}
return null;
}
};
const Parser = struct {
source: []const u8,
it: *TokenIterator,
fn outputFormat(p: *Parser) !std.Target.Cpu.Arch {
const value = value: {
if (p.skip(&.{.lparen})) {
const value_id = try p.require(.literal);
const value = p.it.get(value_id);
_ = try p.require(.rparen);
break :value value.get(p.source);
} else if (p.skip(&.{ .new_line, .lbrace })) {
const value_id = try p.require(.literal);
const value = p.it.get(value_id);
_ = p.skip(&.{.new_line});
_ = try p.require(.rbrace);
break :value value.get(p.source);
} else return error.UnexpectedToken;
};
if (std.mem.eql(u8, value, "elf64-x86-64")) return .x86_64;
return error.UnknownCpuArch;
}
fn group(p: *Parser, args: *std.ArrayList(Elf.SystemLib)) !void {
if (!p.skip(&.{.lparen})) return error.UnexpectedToken;
while (true) {
if (p.maybe(.literal)) |tok_id| {
const tok = p.it.get(tok_id);
const path = tok.get(p.source);
try args.append(.{ .path = path, .needed = true });
} else if (p.maybe(.command)) |cmd_id| {
const cmd = p.getCommand(cmd_id);
switch (cmd) {
.as_needed => try p.asNeeded(args),
else => return error.UnexpectedToken,
}
} else break;
}
_ = try p.require(.rparen);
}
fn asNeeded(p: *Parser, args: *std.ArrayList(Elf.SystemLib)) !void {
if (!p.skip(&.{.lparen})) return error.UnexpectedToken;
while (p.maybe(.literal)) |tok_id| {
const tok = p.it.get(tok_id);
const path = tok.get(p.source);
try args.append(.{ .path = path, .needed = false });
}
_ = try p.require(.rparen);
}
fn skip(p: *Parser, comptime ids: []const Token.Id) bool {
const pos = p.it.pos;
inline for (ids) |id| {
const tok = p.it.next() orelse return false;
if (tok.id != id) {
p.it.seekTo(pos);
return false;
}
}
return true;
}
fn skipAny(p: *Parser, comptime ids: []const Token.Id) void {
outer: while (p.it.next()) |tok| {
inline for (ids) |id| {
if (id == tok.id) continue :outer;
}
break p.it.seekBy(-1);
}
}
fn maybe(p: *Parser, comptime id: Token.Id) ?Token.Index {
const pos = p.it.pos;
const tok = p.it.next() orelse return null;
if (tok.id == id) return pos;
p.it.seekBy(-1);
return null;
}
fn require(p: *Parser, comptime id: Token.Id) !Token.Index {
return p.maybe(id) orelse return error.UnexpectedToken;
}
fn getCommand(p: *Parser, index: Token.Index) Command {
const tok = p.it.get(index);
assert(tok.id == .command);
return Command.fromString(tok.get(p.source)).?;
}
};
const Token = struct {
id: Id,
start: usize,
end: usize,
const Id = enum {
// zig fmt: off
eof,
invalid,
new_line,
lparen, // (
rparen, // )
lbrace, // {
rbrace, // }
comment, // /* */
command, // literal with special meaning, see Command
literal,
// zig fmt: on
};
const Index = usize;
inline fn get(tok: Token, source: []const u8) []const u8 {
return source[tok.start..tok.end];
}
};
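// A small hand-rolled state machine (start/comment/literal). Literals whose
// text matches a Command name are tagged .command by the tokenizer itself.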
const Tokenizer = struct {
source: []const u8,
index: usize = 0,
fn matchesPattern(comptime pattern: []const u8, slice: []const u8) bool {
comptime var count: usize = 0;
inline while (count < pattern.len) : (count += 1) {
if (count >= slice.len) return false;
const c = slice[count];
if (pattern[count] != c) return false;
}
return true;
}
fn matches(tok: Tokenizer, comptime pattern: []const u8) bool {
return matchesPattern(pattern, tok.source[tok.index..]);
}
fn isCommand(tok: Tokenizer, start: usize, end: usize) bool {
return Command.fromString(tok.source[start..end]) != null;
}
fn next(tok: *Tokenizer) Token {
var result = Token{
.id = .eof,
.start = tok.index,
.end = undefined,
};
var state: enum {
start,
comment,
literal,
} = .start;
while (tok.index < tok.source.len) : (tok.index += 1) {
const c = tok.source[tok.index];
switch (state) {
.start => switch (c) {
' ', '\t' => result.start += 1,
'\n' => {
result.id = .new_line;
tok.index += 1;
break;
},
'\r' => {
if (tok.matches("\r\n")) {
result.id = .new_line;
tok.index += "\r\n".len;
} else {
result.id = .invalid;
tok.index += 1;
}
break;
},
'/' => if (tok.matches("/*")) {
state = .comment;
tok.index += "/*".len;
} else {
state = .literal;
},
'(' => {
result.id = .lparen;
tok.index += 1;
break;
},
')' => {
result.id = .rparen;
tok.index += 1;
break;
},
'{' => {
result.id = .lbrace;
tok.index += 1;
break;
},
'}' => {
result.id = .rbrace;
tok.index += 1;
break;
},
else => state = .literal,
},
.comment => switch (c) {
'*' => if (tok.matches("*/")) {
result.id = .comment;
tok.index += "*/".len;
break;
},
else => {},
},
.literal => switch (c) {
' ', '(', '\n' => {
if (tok.isCommand(result.start, tok.index)) {
result.id = .command;
} else {
result.id = .literal;
}
break;
},
')' => {
result.id = .literal;
break;
},
'\r' => {
if (tok.matches("\r\n")) {
if (tok.isCommand(result.start, tok.index)) {
result.id = .command;
} else {
result.id = .literal;
}
} else {
result.id = .invalid;
tok.index += 1;
}
break;
},
else => {},
},
}
}
result.end = tok.index;
return result;
}
};
const TokenIterator = struct {
tokens: []const Token,
pos: Token.Index = 0,
fn next(it: *TokenIterator) ?Token {
const token = it.peek() orelse return null;
it.pos += 1;
return token;
}
fn peek(it: TokenIterator) ?Token {
if (it.pos >= it.tokens.len) return null;
return it.tokens[it.pos];
}
inline fn reset(it: *TokenIterator) void {
it.pos = 0;
}
inline fn seekTo(it: *TokenIterator, pos: Token.Index) void {
it.pos = pos;
}
fn seekBy(it: *TokenIterator, offset: isize) void {
const new_pos = @as(isize, @bitCast(it.pos)) + offset;
if (new_pos < 0) {
it.pos = 0;
} else {
it.pos = @as(usize, @intCast(new_pos));
}
}
inline fn get(it: *TokenIterator, pos: Token.Index) Token {
assert(pos < it.tokens.len);
return it.tokens[pos];
}
};
const testing = std.testing;
fn testExpectedTokens(input: []const u8, expected: []const Token.Id) !void {
var given = std.ArrayList(Token.Id).init(testing.allocator);
defer given.deinit();
var tokenizer = Tokenizer{ .source = input };
while (true) {
const tok = tokenizer.next();
if (tok.id == .invalid) {
std.debug.print(" {s} => '{s}'\n", .{ @tagName(tok.id), tok.get(input) });
}
try given.append(tok.id);
if (tok.id == .eof) break;
}
try testing.expectEqualSlices(Token.Id, expected, given.items);
}
test "Tokenizer - just comments" {
try testExpectedTokens(
\\/* GNU ld script
\\ Use the shared library, but some functions are only in
\\ the static library, so try that secondarily. */
, &.{ .comment, .eof });
}
test "Tokenizer - comments with a simple command" {
try testExpectedTokens(
\\/* GNU ld script
\\ Use the shared library, but some functions are only in
\\ the static library, so try that secondarily. */
\\OUTPUT_FORMAT(elf64-x86-64)
, &.{ .comment, .new_line, .command, .lparen, .literal, .rparen, .eof });
}
test "Tokenizer - libc.so" {
try testExpectedTokens(
\\/* GNU ld script
\\ Use the shared library, but some functions are only in
\\ the static library, so try that secondarily. */
\\OUTPUT_FORMAT(elf64-x86-64)
\\GROUP ( /a/b/c.so.6 /a/d/e.a AS_NEEDED ( /f/g/h.so.2 ) )
, &.{
.comment, .new_line, // GNU comment
.command, .lparen, .literal, .rparen, .new_line, // output format
.command, .lparen, .literal, .literal, // group start
.command, .lparen, .literal, .rparen, // as needed
.rparen, // group end
.eof,
});
}
test "Parser - output format" {
const source =
\\OUTPUT_FORMAT(elf64-x86-64)
;
var tokenizer = Tokenizer{ .source = source };
var tokens = std.ArrayList(Token).init(testing.allocator);
defer tokens.deinit();
while (true) {
const tok = tokenizer.next();
try testing.expect(tok.id != .invalid);
try tokens.append(tok);
if (tok.id == .eof) break;
}
var it = TokenIterator{ .tokens = tokens.items };
var parser = Parser{ .source = source, .it = &it };
const tok_id = try parser.require(.command);
try testing.expectEqual(parser.getCommand(tok_id), .output_format);
const cpu_arch = try parser.outputFormat();
try testing.expectEqual(cpu_arch, .x86_64);
}
test "Parser - group with as-needed" {
const source =
\\GROUP ( /a/b/c.so.6 /a/d/e.a AS_NEEDED ( /f/g/h.so.2 ) )
;
var tokenizer = Tokenizer{ .source = source };
var tokens = std.ArrayList(Token).init(testing.allocator);
defer tokens.deinit();
while (true) {
const tok = tokenizer.next();
try testing.expect(tok.id != .invalid);
try tokens.append(tok);
if (tok.id == .eof) break;
}
var it = TokenIterator{ .tokens = tokens.items };
var parser = Parser{ .source = source, .it = &it };
var args = std.ArrayList(Elf.SystemLib).init(testing.allocator);
defer args.deinit();
const tok_id = try parser.require(.command);
try testing.expectEqual(parser.getCommand(tok_id), .group);
try parser.group(&args);
try testing.expectEqualStrings("/a/b/c.so.6", args.items[0].path);
try testing.expect(args.items[0].needed);
try testing.expectEqualStrings("/a/d/e.a", args.items[1].path);
try testing.expect(args.items[1].needed);
try testing.expectEqualStrings("/f/g/h.so.2", args.items[2].path);
try testing.expect(!args.items[2].needed);
}
const LdScript = @This();
const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const Elf = @import("../Elf.zig");

src/link/Elf/Object.zig

@@ -34,6 +34,8 @@ pub fn isObject(file: std.fs.File) bool {
}
pub fn deinit(self: *Object, allocator: Allocator) void {
if (self.archive) |path| allocator.free(path);
allocator.free(self.path);
allocator.free(self.data);
self.shdrs.deinit(allocator);
self.strings.deinit(allocator);

src/link/Elf/SharedObject.zig

@@ -33,6 +33,7 @@ pub fn isSharedObject(file: std.fs.File) bool {
}
pub fn deinit(self: *SharedObject, allocator: Allocator) void {
allocator.free(self.path);
allocator.free(self.data);
self.versyms.deinit(allocator);
self.verstrings.deinit(allocator);