Move trie structure into its own file-module

Signed-off-by: Jakub Konka <kubkon@jakubkonka.com>
This commit is contained in:
Jakub Konka 2020-10-07 19:36:50 +02:00
parent b13b36a71d
commit bdab4f53c1
No known key found for this signature in database
GPG Key ID: 3AEF55DD984C8344
2 changed files with 268 additions and 200 deletions

View File

@ -20,6 +20,8 @@ const File = link.File;
const Cache = @import("../Cache.zig");
const target_util = @import("../target.zig");
const Trie = @import("MachO/Trie.zig");
pub const base_tag: File.Tag = File.Tag.macho;
const LoadCommand = union(enum) {
@ -64,156 +66,6 @@ const LoadCommand = union(enum) {
}
};
/// Represents export trie used in MachO executables and dynamic libraries.
/// The purpose of an export trie is to encode as compactly as possible all
/// export symbols for the loader `dyld`.
/// The export trie encodes offset and other information using ULEB128
/// encoding, and is part of the __LINKEDIT segment.
const Trie = struct {
const Node = struct {
const Edge = struct {
from: *Node,
to: *Node,
label: []const u8,
pub fn deinit(self: *Edge, alloc: *Allocator) void {
self.to.deinit(alloc);
alloc.destroy(self.to);
self.from = undefined;
self.to = undefined;
}
};
export_flags: ?u64 = null,
offset: ?u64 = null,
edges: std.ArrayListUnmanaged(Edge) = .{},
pub fn deinit(self: *Node, alloc: *Allocator) void {
for (self.edges.items) |*edge| {
edge.deinit(alloc);
}
self.edges.deinit(alloc);
}
pub fn put(self: *Node, alloc: *Allocator, fromEdge: ?*Edge, prefix: usize, label: []const u8) !*Node {
// Traverse all edges.
for (self.edges.items) |*edge| {
const match = mem.indexOfDiff(u8, edge.label, label) orelse return self; // Got a full match, don't do anything.
if (match - prefix > 0) {
// If we match, we advance further down the trie.
return edge.to.put(alloc, edge, match, label);
}
}
if (fromEdge) |from| {
if (mem.eql(u8, from.label, label[0..prefix])) {
if (prefix == label.len) return self;
} else {
// Fixup nodes. We need to insert an intermediate node between
// from.to and self.
const mid = try alloc.create(Node);
mid.* = .{};
const to_label = from.label;
from.to = mid;
from.label = label[0..prefix];
try mid.edges.append(alloc, .{
.from = mid,
.to = self,
.label = to_label,
});
if (prefix == label.len) return self; // We're done.
const new_node = try alloc.create(Node);
new_node.* = .{};
try mid.edges.append(alloc, .{
.from = mid,
.to = new_node,
.label = label,
});
return new_node;
}
}
// Add a new edge.
const node = try alloc.create(Node);
node.* = .{};
try self.edges.append(alloc, .{
.from = self,
.to = node,
.label = label,
});
return node;
}
pub fn write(self: Node, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) Trie.WriteError!void {
if (self.offset) |off| {
// Terminal node info: encode export flags and vmaddr offset of this symbol.
var info_buf_len: usize = 0;
var info_buf: [@sizeOf(u64) * 2]u8 = undefined;
info_buf_len += try std.debug.leb.writeULEB128Mem(info_buf[0..], self.export_flags.?);
info_buf_len += try std.debug.leb.writeULEB128Mem(info_buf[info_buf_len..], off);
// Encode the size of the terminal node info.
var size_buf: [@sizeOf(u64)]u8 = undefined;
const size_buf_len = try std.debug.leb.writeULEB128Mem(size_buf[0..], info_buf_len);
// Now, write them to the output buffer.
try buffer.ensureCapacity(alloc, buffer.items.len + info_buf_len + size_buf_len);
buffer.appendSliceAssumeCapacity(size_buf[0..size_buf_len]);
buffer.appendSliceAssumeCapacity(info_buf[0..info_buf_len]);
} else {
// Non-terminal node is delimited by 0 byte.
try buffer.append(alloc, 0);
}
// Write number of edges (max legal number of edges is 256).
try buffer.append(alloc, @intCast(u8, self.edges.items.len));
var node_offset_info: [@sizeOf(u8)]u64 = undefined;
for (self.edges.items) |edge, i| {
// Write edges labels leaving out space in-between to later populate
// with offsets to each node.
try buffer.ensureCapacity(alloc, buffer.items.len + edge.label.len + 1 + @sizeOf(u64)); // +1 to account for null-byte
buffer.appendSliceAssumeCapacity(edge.label);
buffer.appendAssumeCapacity(0);
node_offset_info[i] = buffer.items.len;
const padding = [_]u8{0} ** @sizeOf(u64);
buffer.appendSliceAssumeCapacity(padding[0..]);
}
for (self.edges.items) |edge, i| {
const offset = buffer.items.len;
try edge.to.write(alloc, buffer);
// We can now populate the offset to the node pointed by this edge.
var offset_buf: [@sizeOf(u64)]u8 = undefined;
const offset_buf_len = try std.debug.leb.writeULEB128Mem(offset_buf[0..], offset);
mem.copy(u8, buffer.items[node_offset_info[i]..], offset_buf[0..offset_buf_len]);
}
}
};
root: Node,
pub fn put(self: *Trie, alloc: *Allocator, word: []const u8) !*Node {
return self.root.put(alloc, null, 0, word);
}
pub const WriteError = error{ OutOfMemory, NoSpaceLeft };
pub fn write(self: Trie, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) WriteError!void {
return self.root.write(alloc, buffer);
}
pub fn deinit(self: *Trie, alloc: *Allocator) void {
self.root.deinit(alloc);
}
};
base: File,
/// Table of all load commands
@ -1541,19 +1393,21 @@ fn writeExportTrie(self: *MachO) !void {
defer trie.deinit(self.base.allocator);
const text_segment = self.load_commands.items[self.text_segment_cmd_index.?].Segment;
for (self.global_symbols.items) |symbol| {
// TODO figure out if we should put all global symbols into the export trie
const name = self.getString(symbol.n_strx);
const node = try trie.put(self.base.allocator, name);
node.offset = symbol.n_value - text_segment.vmaddr;
node.export_flags = 0; // TODO workout creation of export flags
assert(symbol.n_value >= text_segment.vmaddr);
try trie.put(self.base.allocator, .{
.name = name,
.offset = symbol.n_value - text_segment.vmaddr,
.export_flags = 0, // TODO workout creation of export flags
});
}
var buffer: std.ArrayListUnmanaged(u8) = .{};
defer buffer.deinit(self.base.allocator);
try trie.write(self.base.allocator, &buffer);
try trie.writeULEB128Mem(self.base.allocator, &buffer);
const dyld_info = &self.load_commands.items[self.dyld_info_cmd_index.?].DyldInfo;
try self.base.file.?.pwriteAll(buffer.items, dyld_info.export_off);
@ -1688,48 +1542,3 @@ fn satMul(a: anytype, b: anytype) @TypeOf(a, b) {
const T = @TypeOf(a, b);
return std.math.mul(T, a, b) catch std.math.maxInt(T);
}
test "Trie basic" {
const testing = @import("std").testing;
var gpa = testing.allocator;
var trie: Trie = .{
.root = .{},
};
defer trie.deinit(gpa);
// root
testing.expect(trie.root.edges.items.len == 0);
// root --- _st ---> node
try trie.put(gpa, "_st");
testing.expect(trie.root.edges.items.len == 1);
testing.expect(mem.eql(u8, trie.root.edges.items[0].label, "_st"));
{
// root --- _st ---> node --- _start ---> node
try trie.put(gpa, "_start");
testing.expect(trie.root.edges.items.len == 1);
const nextEdge = &trie.root.edges.items[0];
testing.expect(mem.eql(u8, nextEdge.label, "_st"));
testing.expect(nextEdge.to.edges.items.len == 1);
testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_start"));
}
{
// root --- _ ---> node --- _st ---> node --- _start ---> node
// |
// | --- _main ---> node
try trie.put(gpa, "_main");
testing.expect(trie.root.edges.items.len == 1);
const nextEdge = &trie.root.edges.items[0];
testing.expect(mem.eql(u8, nextEdge.label, "_"));
testing.expect(nextEdge.to.edges.items.len == 2);
testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_st"));
testing.expect(mem.eql(u8, nextEdge.to.edges.items[1].label, "_main"));
const nextNextEdge = &nextEdge.to.edges.items[0];
testing.expect(mem.eql(u8, nextNextEdge.to.edges.items[0].label, "_start"));
}
}

259
src/link/MachO/Trie.zig Normal file
View File

@ -0,0 +1,259 @@
/// Represents export trie used in MachO executables and dynamic libraries.
/// The purpose of an export trie is to encode as compactly as possible all
/// export symbols for the loader `dyld`.
/// The export trie encodes offset and other information using ULEB128
/// encoding, and is part of the __LINKEDIT segment.
///
/// Description from loader.h:
///
/// The symbols exported by a dylib are encoded in a trie. This is a compact
/// representation that factors out common prefixes. It also reduces LINKEDIT pages
/// in RAM because it encodes all information (name, address, flags) in one small,
/// contiguous range. The export area is a stream of nodes. The first node sequentially
/// is the start node for the trie.
///
/// Nodes for a symbol start with a uleb128 that is the length of the exported symbol
/// information for the string so far. If there is no exported symbol, the node starts
/// with a zero byte. If there is exported info, it follows the length.
///
/// First is a uleb128 containing flags. Normally, it is followed by a uleb128 encoded
/// offset which is location of the content named by the symbol from the mach_header
/// for the image. If the flags is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags
/// is a uleb128 encoded library ordinal, then a zero terminated UTF8 string. If the string
/// is zero length, then the symbol is re-export from the specified dylib with the same name.
/// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following the flags is two
/// uleb128s: the stub offset and the resolver offset. The stub is used by non-lazy pointers.
/// The resolver is used by lazy pointers and must be called to get the actual address to use.
///
/// After the optional exported symbol information is a byte of how many edges (0-255) that
/// this node has leaving it, followed by each edge. Each edge is a zero terminated UTF8 of
/// the addition chars in the symbol, followed by a uleb128 offset for the node that edge points to.
const Trie = @This();
const std = @import("std");
const mem = std.mem;
const leb = std.debug.leb;
const log = std.log.scoped(.link);
const Allocator = mem.Allocator;
pub const Symbol = struct {
name: []const u8,
offset: u64,
export_flags: u64,
};
const Edge = struct {
from: *Node,
to: *Node,
label: []const u8,
fn deinit(self: *Edge, alloc: *Allocator) void {
self.to.deinit(alloc);
alloc.destroy(self.to);
self.from = undefined;
self.to = undefined;
}
};
const Node = struct {
export_flags: ?u64 = null,
offset: ?u64 = null,
edges: std.ArrayListUnmanaged(Edge) = .{},
fn deinit(self: *Node, alloc: *Allocator) void {
for (self.edges.items) |*edge| {
edge.deinit(alloc);
}
self.edges.deinit(alloc);
}
fn put(self: *Node, alloc: *Allocator, fromEdge: ?*Edge, prefix: usize, label: []const u8) !*Node {
// Traverse all edges.
for (self.edges.items) |*edge| {
const match = mem.indexOfDiff(u8, edge.label, label) orelse return self; // Got a full match, don't do anything.
if (match - prefix > 0) {
// If we match, we advance further down the trie.
return edge.to.put(alloc, edge, match, label);
}
}
if (fromEdge) |from| {
if (mem.eql(u8, from.label, label[0..prefix])) {
if (prefix == label.len) return self;
} else {
// Fixup nodes. We need to insert an intermediate node between
// from.to and self.
// Is: A -> B
// Should be: A -> C -> B
const mid = try alloc.create(Node);
mid.* = .{};
const to_label = from.label;
from.to = mid;
from.label = label[0..prefix];
try mid.edges.append(alloc, .{
.from = mid,
.to = self,
.label = to_label,
});
if (prefix == label.len) return self; // We're done.
const new_node = try alloc.create(Node);
new_node.* = .{};
try mid.edges.append(alloc, .{
.from = mid,
.to = new_node,
.label = label,
});
return new_node;
}
}
// Add a new edge.
const node = try alloc.create(Node);
node.* = .{};
try self.edges.append(alloc, .{
.from = self,
.to = node,
.label = label,
});
return node;
}
fn writeULEB128Mem(self: Node, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) Trie.WriteError!void {
if (self.offset) |offset| {
// Terminal node info: encode export flags and vmaddr offset of this symbol.
var info_buf_len: usize = 0;
var info_buf: [@sizeOf(u64) * 2]u8 = undefined;
info_buf_len += try leb.writeULEB128Mem(info_buf[0..], self.export_flags.?);
info_buf_len += try leb.writeULEB128Mem(info_buf[info_buf_len..], offset);
// Encode the size of the terminal node info.
var size_buf: [@sizeOf(u64)]u8 = undefined;
const size_buf_len = try leb.writeULEB128Mem(size_buf[0..], info_buf_len);
// Now, write them to the output buffer.
try buffer.ensureCapacity(alloc, buffer.items.len + info_buf_len + size_buf_len);
buffer.appendSliceAssumeCapacity(size_buf[0..size_buf_len]);
buffer.appendSliceAssumeCapacity(info_buf[0..info_buf_len]);
} else {
// Non-terminal node is delimited by 0 byte.
try buffer.append(alloc, 0);
}
// Write number of edges (max legal number of edges is 256).
try buffer.append(alloc, @intCast(u8, self.edges.items.len));
var node_offset_info: [@sizeOf(u8)]u64 = undefined;
for (self.edges.items) |edge, i| {
// Write edges labels leaving out space in-between to later populate
// with offsets to each node.
try buffer.ensureCapacity(alloc, buffer.items.len + edge.label.len + 1 + @sizeOf(u64)); // +1 to account for null-byte
buffer.appendSliceAssumeCapacity(edge.label);
buffer.appendAssumeCapacity(0);
node_offset_info[i] = buffer.items.len;
const padding = [_]u8{0} ** @sizeOf(u64);
buffer.appendSliceAssumeCapacity(padding[0..]);
}
for (self.edges.items) |edge, i| {
const offset = buffer.items.len;
try edge.to.writeULEB128Mem(alloc, buffer);
// We can now populate the offset to the node pointed by this edge.
// TODO this is not the approach taken by `ld64` which does several iterations
// to close the gap between the space encoding the offset to the node pointed
// by this edge. However, it seems that as long as we are contiguous, the padding
// introduced here should not influence the performance of `dyld`. I'm leaving
// this TODO here though as a reminder to re-investigate in the future and especially
// when we start working on dylibs in case `dyld` refuses to cooperate and/or the
// performance is noticably sufferring.
// Link to official impl: https://opensource.apple.com/source/ld64/ld64-123.2.1/src/abstraction/MachOTrie.hpp
var offset_buf: [@sizeOf(u64)]u8 = undefined;
const offset_buf_len = try leb.writeULEB128Mem(offset_buf[0..], offset);
mem.copy(u8, buffer.items[node_offset_info[i]..], offset_buf[0..offset_buf_len]);
}
}
};
root: Node,
/// Insert a symbol into the trie, updating the prefixes in the process.
/// This operation may change the layout of the trie by splicing edges in
/// certain circumstances.
pub fn put(self: *Trie, alloc: *Allocator, symbol: Symbol) !void {
const node = try self.root.put(alloc, null, 0, symbol.name);
node.offset = symbol.offset;
node.export_flags = symbol.export_flags;
}
pub const WriteError = error{ OutOfMemory, NoSpaceLeft };
/// Write the trie to a buffer ULEB128 encoded.
pub fn writeULEB128Mem(self: Trie, alloc: *Allocator, buffer: *std.ArrayListUnmanaged(u8)) WriteError!void {
return self.root.writeULEB128Mem(alloc, buffer);
}
pub fn deinit(self: *Trie, alloc: *Allocator) void {
self.root.deinit(alloc);
}
test "Trie basic" {
const testing = @import("std").testing;
var gpa = testing.allocator;
var trie: Trie = .{
.root = .{},
};
defer trie.deinit(gpa);
// root
testing.expect(trie.root.edges.items.len == 0);
// root --- _st ---> node
try trie.put(gpa, .{
.name = "_st",
.offset = 0,
.export_flags = 0,
});
testing.expect(trie.root.edges.items.len == 1);
testing.expect(mem.eql(u8, trie.root.edges.items[0].label, "_st"));
{
// root --- _st ---> node --- _start ---> node
try trie.put(gpa, .{
.name = "_start",
.offset = 0,
.export_flags = 0,
});
testing.expect(trie.root.edges.items.len == 1);
const nextEdge = &trie.root.edges.items[0];
testing.expect(mem.eql(u8, nextEdge.label, "_st"));
testing.expect(nextEdge.to.edges.items.len == 1);
testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_start"));
}
{
// root --- _ ---> node --- _st ---> node --- _start ---> node
// |
// | --- _main ---> node
try trie.put(gpa, .{
.name = "_main",
.offset = 0,
.export_flags = 0,
});
testing.expect(trie.root.edges.items.len == 1);
const nextEdge = &trie.root.edges.items[0];
testing.expect(mem.eql(u8, nextEdge.label, "_"));
testing.expect(nextEdge.to.edges.items.len == 2);
testing.expect(mem.eql(u8, nextEdge.to.edges.items[0].label, "_st"));
testing.expect(mem.eql(u8, nextEdge.to.edges.items[1].label, "_main"));
const nextNextEdge = &nextEdge.to.edges.items[0];
testing.expect(mem.eql(u8, nextNextEdge.to.edges.items[0].label, "_start"));
}
}