const std = @import("std");
const os = std.os;
const mem = std.mem;
const elf = std.elf;
const math = std.math;
const assert = std.debug.assert;
const native_arch = @import("builtin").cpu.arch;

// This file implements the two TLS variants [1] used by ELF-based systems.
//
// Variant I has the following layout in memory:
// -------------------------------------------------------
// |   DTV   |     Zig     |   DTV   | Alignment |  TLS  |
// | storage | thread data | pointer |           | block |
// ------------------------^------------------------------
//                         `-- The thread pointer register points here
//
// In this case we allocate additional space for our control structure that's
// placed _before_ the DTV pointer, together with the DTV itself.
//
// NOTE: Some systems such as powerpc64 or mips use this variant with a twist:
// the alignment gap is not present and the tp and DTV addresses are offset by
// a constant.
//
// Variant II, on the other hand, has the following layout in memory:
// ---------------------------------------
// |  TLS  | TCB |     Zig     |   DTV   |
// | block |     | thread data | storage |
// --------^------------------------------
//         `-- The thread pointer register points here
//
// The structure of the TCB is not defined by the ABI, so we reserve enough
// space for a single pointer: some architectures such as x86 and x86_64 need a
// pointer to the TCB itself at the address the tp points to.
//
// In this case the control structure and the DTV are placed one after another,
// right after the TLS block data.
//
// At the moment the DTV is very simple since we only support static TLS: all
// we need is a two-word vector holding the number of entries (1) and the
// address of the first TLS block.
//
// [1] https://www.akkadia.org/drepper/tls.pdf

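// As a concrete illustration (the segment sizes here are assumed purely for
// the sake of the example): on x86_64 (variant II, 64-bit words) a PT_TLS
// segment with p_memsz = 40 and p_align = 16 is laid out by initTLS (below) as:
//
//   data_offset = 0    TLS block, rounded up to 48 bytes for alignment
//   tcb_offset  = 48   one usize reserved for the TCB self-pointer
//   CustomData  = 56   8 bytes of per-thread Zig data
//   dtv_offset  = 64   entry count plus one block pointer, 16 bytes
//   alloc_size  = 80
//
// prepareTLS (also below) then points the thread pointer at offset 48 and
// stores that very address in the first TCB word, so that `%fs:0` reads back
// the address of the TCB.
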
const TLSVariant = enum {
    VariantI,
    VariantII,
};

const tls_variant = switch (native_arch) {
    .arm, .armeb, .thumb, .aarch64, .aarch64_be, .riscv32, .riscv64, .mips, .mipsel, .mips64, .mips64el, .powerpc, .powerpc64, .powerpc64le => TLSVariant.VariantI,
    .x86_64, .x86, .sparc64 => TLSVariant.VariantII,
    else => @compileError("undefined tls_variant for this architecture"),
};

// Controls how many bytes are reserved for the Thread Control Block
const tls_tcb_size = switch (native_arch) {
    // ARM EABI mandates enough space for two pointers: the first one points to
    // the DTV while the second one is unspecified but reserved
    .arm, .armeb, .thumb, .aarch64, .aarch64_be => 2 * @sizeOf(usize),
    // One pointer-sized word that points either to the DTV or the TCB itself
    else => @sizeOf(usize),
};

// Controls if the TP points to the end of the TCB instead of its beginning
const tls_tp_points_past_tcb = switch (native_arch) {
    .riscv32, .riscv64, .mips, .mipsel, .mips64, .mips64el, .powerpc, .powerpc64, .powerpc64le => true,
    else => false,
};

// Some architectures add some offset to the tp and dtv addresses in order to
// make the generated code more efficient

const tls_tp_offset = switch (native_arch) {
    .mips, .mipsel, .mips64, .mips64el, .powerpc, .powerpc64, .powerpc64le => 0x7000,
    else => 0,
};

const tls_dtv_offset = switch (native_arch) {
    .mips, .mipsel, .mips64, .mips64el, .powerpc, .powerpc64, .powerpc64le => 0x8000,
    .riscv32, .riscv64 => 0x800,
    else => 0,
};

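// For example, on powerpc the first byte of the TLS block ends up at
// tp - 0x7000, so the 16-bit signed displacements used by tp-relative code
// reach roughly 4 KiB of control data below the block and 60 KiB of TLS data
// above it. (Illustrative reasoning only; the psABI TLS documents are the
// authoritative reference.)
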
// Per-thread storage for Zig's use
const CustomData = struct {
    dummy: usize,
};

// Dynamic Thread Vector
const DTV = extern struct {
    entries: usize,
    tls_block: [1][*]u8,
};

// Holds all the information about the process TLS image
const TLSImage = struct {
    init_data: []const u8,
    alloc_size: usize,
    alloc_align: usize,
    tcb_offset: usize,
    dtv_offset: usize,
    data_offset: usize,
    data_size: usize,
    // Only used on the x86 architecture
    gdt_entry_number: usize,
};

pub var tls_image: TLSImage = undefined;

pub fn setThreadPointer(addr: usize) void {
    switch (native_arch) {
        .x86 => {
            var user_desc = std.os.linux.user_desc{
                .entry_number = tls_image.gdt_entry_number,
                .base_addr = addr,
                .limit = 0xfffff,
                .flags = .{
                    .seg_32bit = 1,
                    .contents = 0, // Data
                    .read_exec_only = 0,
                    .limit_in_pages = 1,
                    .seg_not_present = 0,
                    .useable = 1,
                },
            };
            const rc = std.os.linux.syscall1(.set_thread_area, @intFromPtr(&user_desc));
            assert(rc == 0);

            const gdt_entry_number = user_desc.entry_number;
            // We have to keep track of our slot as it's also needed for clone()
            tls_image.gdt_entry_number = gdt_entry_number;
            // Update the %gs selector
            asm volatile ("movl %[gs_val], %%gs"
                :
                : [gs_val] "r" (gdt_entry_number << 3 | 3),
            );
        },
        .x86_64 => {
            const rc = std.os.linux.syscall2(.arch_prctl, std.os.linux.ARCH.SET_FS, addr);
            assert(rc == 0);
        },
        .aarch64 => {
            asm volatile (
                \\ msr tpidr_el0, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .arm, .thumb => {
            const rc = std.os.linux.syscall1(.set_tls, addr);
            assert(rc == 0);
        },
        .riscv64 => {
            asm volatile (
                \\ mv tp, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .mips, .mipsel, .mips64, .mips64el => {
            const rc = std.os.linux.syscall1(.set_thread_area, addr);
            assert(rc == 0);
        },
        .powerpc => {
            asm volatile (
                \\ mr 2, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .powerpc64, .powerpc64le => {
            asm volatile (
                \\ mr 13, %[addr]
                :
                : [addr] "r" (addr),
            );
        },
        .sparc64 => {
            asm volatile (
                \\ mov %[addr], %%g7
                :
                : [addr] "r" (addr),
            );
        },
        else => @compileError("Unsupported architecture"),
    }
}

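// Note on the x86 family: the kernel call above only installs the segment
// base. Code that needs the TLS area's address (e.g. when taking the address
// of a threadlocal variable) reads it back through the segment register, such
// as `mov %fs:0x0, %rax` on x86_64, which is why prepareTLS below makes the
// first TCB word point at the TCB itself.
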
fn initTLS(phdrs: []elf.Phdr) void {
    var tls_phdr: ?*elf.Phdr = null;
    var img_base: usize = 0;

    for (phdrs) |*phdr| {
        switch (phdr.p_type) {
            elf.PT_PHDR => img_base = @intFromPtr(phdrs.ptr) - phdr.p_vaddr,
            elf.PT_TLS => tls_phdr = phdr,
            else => {},
        }
    }

    var tls_align_factor: usize = undefined;
    var tls_data: []const u8 = undefined;
    var tls_data_alloc_size: usize = undefined;
    if (tls_phdr) |phdr| {
        // The effective size in memory is represented by p_memsz; the length
        // of the data stored in the PT_TLS segment is p_filesz and may be less
        // than the former
        tls_align_factor = phdr.p_align;
        tls_data = @as([*]u8, @ptrFromInt(img_base + phdr.p_vaddr))[0..phdr.p_filesz];
        tls_data_alloc_size = phdr.p_memsz;
    } else {
        tls_align_factor = @alignOf(usize);
        tls_data = &[_]u8{};
        tls_data_alloc_size = 0;
    }

    // Offsets into the allocated TLS area
    var tcb_offset: usize = undefined;
    var dtv_offset: usize = undefined;
    var data_offset: usize = undefined;
    // Compute the total size of the ABI-specific data plus our own control
    // structures. All the offsets calculated here assume a well-aligned base
    // address.
    const alloc_size = switch (tls_variant) {
        .VariantI => blk: {
            var l: usize = 0;
            dtv_offset = l;
            l += @sizeOf(DTV);
            // Add some padding here so that the thread pointer (tcb_offset) is
            // aligned to p_align and the CustomData structure can be found by
            // simply subtracting its @sizeOf from the tp value
            const delta = (l + @sizeOf(CustomData)) & (tls_align_factor - 1);
            if (delta > 0)
                l += tls_align_factor - delta;
            l += @sizeOf(CustomData);
            tcb_offset = l;
            l += mem.alignForward(usize, tls_tcb_size, tls_align_factor);
            data_offset = l;
            l += tls_data_alloc_size;
            break :blk l;
        },
        .VariantII => blk: {
            var l: usize = 0;
            data_offset = l;
            l += mem.alignForward(usize, tls_data_alloc_size, tls_align_factor);
            // The thread pointer is aligned to p_align
            tcb_offset = l;
            l += tls_tcb_size;
            // The CustomData structure is right after the TCB with no padding
            // in between so it can be easily found
            l += @sizeOf(CustomData);
            l = mem.alignForward(usize, l, @alignOf(DTV));
            dtv_offset = l;
            l += @sizeOf(DTV);
            break :blk l;
        },
    };

    tls_image = TLSImage{
        .init_data = tls_data,
        .alloc_size = alloc_size,
        .alloc_align = tls_align_factor,
        .tcb_offset = tcb_offset,
        .dtv_offset = dtv_offset,
        .data_offset = data_offset,
        .data_size = tls_data_alloc_size,
        .gdt_entry_number = @as(usize, @bitCast(@as(isize, -1))),
    };
}

inline fn alignPtrCast(comptime T: type, ptr: [*]u8) *T {
    return @ptrCast(@alignCast(ptr));
}

/// Initializes all the fields of the static TLS area and returns the computed
/// architecture-specific value of the thread-pointer register
pub fn prepareTLS(area: []u8) usize {
    // Clear the area we're going to use, just to be safe
    @memset(area, 0);
    // Prepare the DTV
    const dtv = alignPtrCast(DTV, area.ptr + tls_image.dtv_offset);
    dtv.entries = 1;
    dtv.tls_block[0] = area.ptr + tls_dtv_offset + tls_image.data_offset;
    // Prepare the TCB
    const tcb_ptr = alignPtrCast([*]u8, area.ptr + tls_image.tcb_offset);
    tcb_ptr.* = switch (tls_variant) {
        .VariantI => area.ptr + tls_image.dtv_offset,
        .VariantII => area.ptr + tls_image.tcb_offset,
    };
    // Copy the data
    @memcpy(area[tls_image.data_offset..][0..tls_image.init_data.len], tls_image.init_data);

    // Return the corrected value (if needed) for the tp register.
    // Overflow here is not a problem; the pointer arithmetic involving the tp
    // is done with wrapping semantics.
    return @intFromPtr(area.ptr) +% tls_tp_offset +%
        if (tls_tp_points_past_tcb) tls_image.data_offset else tls_image.tcb_offset;
}

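// Threads other than the main thread go through the same steps: their spawner
// maps an area of tls_image.alloc_size bytes (aligned to alloc_align), calls
// prepareTLS on it, and installs the returned value as the new thread's
// thread pointer, e.g. by handing it to clone() together with CLONE_SETTLS
// rather than calling setThreadPointer from the child. (Sketch of the
// intended usage; the actual spawning logic lives in std.Thread.)
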
// The main motivation for the size chosen here is that this is how much ends
// up being requested for the thread local variables of the std.crypto.random
// implementation. I'm not sure why it ends up being so much; the struct itself
// is only 64 bytes. I think it has to do with being page aligned and LLVM or
// LLD is not smart enough to lay out the TLS data in a space conserving way.
// Anyway I think it's fine because it's less than 3 pages of memory, and
// putting it in the ELF like this is equivalent to moving the mmap call below
// into the kernel, avoiding syscall overhead.
var main_thread_tls_buffer: [0x2100]u8 align(mem.page_size) = undefined;

pub fn initStaticTLS(phdrs: []elf.Phdr) void {
    initTLS(phdrs);

    const tls_area = blk: {
        // Fast path for the common case where the TLS data is really small:
        // avoid an allocation and use our local buffer.
        if (tls_image.alloc_align <= mem.page_size and
            tls_image.alloc_size <= main_thread_tls_buffer.len)
        {
            break :blk main_thread_tls_buffer[0..tls_image.alloc_size];
        }

        const alloc_tls_area = os.mmap(
            null,
            tls_image.alloc_size + tls_image.alloc_align - 1,
            os.PROT.READ | os.PROT.WRITE,
            os.MAP.PRIVATE | os.MAP.ANONYMOUS,
            -1,
            0,
        ) catch os.abort();

        // Make sure the slice is correctly aligned.
        const begin_addr = @intFromPtr(alloc_tls_area.ptr);
        const begin_aligned_addr = mem.alignForward(usize, begin_addr, tls_image.alloc_align);
        const start = begin_aligned_addr - begin_addr;
        break :blk alloc_tls_area[start .. start + tls_image.alloc_size];
    };

    const tp_value = prepareTLS(tls_area);
    setThreadPointer(tp_value);
}
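
// Illustrative sketch (not part of this file): the startup code in
// std/start.zig builds the `phdrs` slice from the auxiliary vector before any
// threadlocal variable is touched, roughly along these lines:
//
//     const at_phdr = std.os.linux.getauxval(std.elf.AT_PHDR);
//     const at_phnum = std.os.linux.getauxval(std.elf.AT_PHNUM);
//     const phdrs = @as([*]std.elf.Phdr, @ptrFromInt(at_phdr))[0..at_phnum];
//     std.os.linux.tls.initStaticTLS(phdrs);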