From d8ab301aa81758a7918de446271201c044e7835f Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Sat, 4 May 2019 12:02:55 +0200 Subject: [PATCH] std: Implement TLS support for Linux Tested on x86_64, i386, ARM, AARCH64 --- CMakeLists.txt | 1 + std/os.zig | 17 +-- std/os/linux.zig | 1 + std/os/linux/tls.zig | 242 ++++++++++++++++++++++++++++++++++++++ std/special/bootstrap.zig | 68 ++--------- 5 files changed, 257 insertions(+), 72 deletions(-) create mode 100644 std/os/linux/tls.zig diff --git a/CMakeLists.txt b/CMakeLists.txt index d39ad1ac61..c55ced8927 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -611,6 +611,7 @@ set(ZIG_STD_FILES "os/linux.zig" "os/linux/arm64.zig" "os/linux/errno.zig" + "os/linux/tls.zig" "os/linux/vdso.zig" "os/linux/x86_64.zig" "os/netbsd.zig" diff --git a/std/os.zig b/std/os.zig index 3276ac2b1d..bc9d0ad4d2 100644 --- a/std/os.zig +++ b/std/os.zig @@ -3126,9 +3126,6 @@ pub const SpawnThreadError = error{ Unexpected, }; -pub var linux_tls_phdr: ?*std.elf.Phdr = null; -pub var linux_tls_img_src: [*]const u8 = undefined; // defined if linux_tls_phdr is - /// caller must call wait on the returned thread /// fn startFn(@typeOf(context)) T /// where T is u8, noreturn, void, or !void @@ -3238,12 +3235,10 @@ pub fn spawnThread(context: var, comptime startFn: var) SpawnThreadError!*Thread } // Finally, the Thread Local Storage, if any. 
if (!Thread.use_pthreads) { - if (linux_tls_phdr) |tls_phdr| { - l = mem.alignForward(l, tls_phdr.p_align); + if (linux.tls.tls_image) |tls_img| { + l = mem.alignForward(l, @alignOf(usize)); tls_start_offset = l; - l += tls_phdr.p_memsz; - // the fs register address - l += @sizeOf(usize); + l += tls_img.alloc_size; } } break :blk l; @@ -3284,10 +3279,8 @@ pub fn spawnThread(context: var, comptime startFn: var) SpawnThreadError!*Thread posix.CLONE_THREAD | posix.CLONE_SYSVSEM | posix.CLONE_PARENT_SETTID | posix.CLONE_CHILD_CLEARTID | posix.CLONE_DETACHED; var newtls: usize = undefined; - if (linux_tls_phdr) |tls_phdr| { - @memcpy(@intToPtr([*]u8, mmap_addr + tls_start_offset), linux_tls_img_src, tls_phdr.p_filesz); - newtls = mmap_addr + mmap_len - @sizeOf(usize); - @intToPtr(*usize, newtls).* = newtls; + if (linux.tls.tls_image) |tls_img| { + newtls = linux.tls.copyTLS(mmap_addr + tls_start_offset); flags |= posix.CLONE_SETTLS; } const rc = posix.clone(MainFuncs.linuxThreadMain, mmap_addr + stack_end_offset, flags, arg, &thread_ptr.data.handle, newtls, &thread_ptr.data.handle); diff --git a/std/os/linux.zig b/std/os/linux.zig index 4a3e53176f..49091529a7 100644 --- a/std/os/linux.zig +++ b/std/os/linux.zig @@ -3,6 +3,7 @@ const assert = std.debug.assert; const builtin = @import("builtin"); const maxInt = std.math.maxInt; const elf = std.elf; +pub const tls = @import("linux/tls.zig"); const vdso = @import("linux/vdso.zig"); const dl = @import("../dynamic_library.zig"); pub use switch (builtin.arch) { diff --git a/std/os/linux/tls.zig b/std/os/linux/tls.zig new file mode 100644 index 0000000000..d26a12a69d --- /dev/null +++ b/std/os/linux/tls.zig @@ -0,0 +1,242 @@ +const std = @import("std"); +const mem = std.mem; +const posix = std.posix; +const elf = std.elf; +const builtin = @import("builtin"); +const assert = std.debug.assert; + +// This file implements the two TLS variants [1] used by ELF-based systems. 
+//
+// Variant I has the following layout in memory:
+// -------------------------------------------------------
+// |   DTV   |     Zig     |   DTV   | Alignment |  TLS  |
+// | storage | thread data | pointer |           | block |
+// ------------------------^------------------------------
+//                         `-- The thread pointer register points here
+//
+// In this case we allocate additional space for our control structure that's
+// placed _before_ the DTV pointer together with the DTV.
+//
+// NOTE: Some systems such as powerpc64 or mips use this variant with a twist:
+// the alignment is not present and the tp and DTV addresses are offset by a
+// constant.
+//
+// Variant II, on the other hand, has the following layout in memory:
+// ---------------------------------------
+// |  TLS  | TCB |     Zig     |   DTV   |
+// | block |     | thread data | storage |
+// --------^------------------------------
+//         `-- The thread pointer register points here
+//
+// The structure of the TCB is not defined by the ABI so we reserve enough space
+// for a single pointer as some architectures such as i386 and x86_64 need a
+// pointer to the TCB block itself at the address pointed to by the tp.
+//
+// In this case the control structure and DTV are placed one after another right
+// after the TLS block data.
+//
+// At the moment the DTV is very simple since we only support static TLS: all
+// we need is a two-word vector to hold the number of entries (1) and the
+// address of the first TLS block.
+// +// [1] https://www.akkadia.org/drepper/tls.pdf + +const TLSVariant = enum { + VariantI, + VariantII, +}; + +const tls_variant = switch (builtin.arch) { + .arm, .armeb, .aarch64, .aarch64_be => TLSVariant.VariantI, + .x86_64, .i386 => TLSVariant.VariantII, + else => @compileError("undefined tls_variant for this architecture"), +}; + +// Controls how many bytes are reserved for the Thread Control Block +const tls_tcb_size = switch (builtin.arch) { + // ARM EABI mandates enough space for two pointers: the first one points to + // the DTV while the second one is unspecified but reserved + .arm, .armeb, .aarch64, .aarch64_be => 2 * @sizeOf(usize), + .i386, .x86_64 => @sizeOf(usize), + else => 0, +}; + +// Controls if the TCB should be aligned according to the TLS segment p_align +const tls_tcb_align_size = switch (builtin.arch) { + .arm, .armeb, .aarch64, .aarch64_be => true, + else => false, +}; + +// Check if the architecture-specific parameters look correct +comptime { + if (tls_tcb_align_size and tls_variant != TLSVariant.VariantI) { + @compileError("tls_tcb_align_size is only meaningful for variant I TLS"); + } +} + +// Some architectures add some offset to the tp and dtv addresses in order to +// make the generated code more efficient + +const tls_tp_offset = switch (builtin.arch) { + else => 0, +}; + +const tls_dtv_offset = switch (builtin.arch) { + else => 0, +}; + +// Per-thread storage for Zig's use +const CustomData = packed struct { +}; + +// Dynamic Thread Vector +const DTV = packed struct { + entries: usize, + tls_block: [1]usize, +}; + +// Holds all the information about the process TLS image +const TLSImage = struct { + data_src: []u8, + alloc_size: usize, + tcb_offset: usize, + dtv_offset: usize, + data_offset: usize, +}; + +pub var tls_image: ?TLSImage = null; + +pub fn setThreadPointer(addr: usize) void { + switch (builtin.arch) { + .x86_64 => { + const ARCH_SET_FS = 0x1002; + const rc = std.os.linux.syscall2(std.os.linux.SYS_arch_prctl, 
ARCH_SET_FS, addr); + // arch_prctl is documented to never fail + assert(rc == 0); + }, + .aarch64 => { + asm volatile ( + \\ msr tpidr_el0, %[addr] + : : [addr] "r" (addr) + ); + }, + else => @compileError("Unsupported architecture"), + } +} + +pub fn initTLS() void { + var tls_phdr: ?*elf.Phdr = null; + var img_base: usize = 0; + + if (std.os.linux_elf_aux_maybe) |auxv| { + var at_phent: usize = undefined; + var at_phnum: usize = undefined; + var at_phdr: usize = undefined; + + var i: usize = 0; + while (auxv[i].a_type != std.elf.AT_NULL) : (i += 1) { + switch (auxv[i].a_type) { + elf.AT_PHENT => at_phent = auxv[i].a_un.a_val, + elf.AT_PHNUM => at_phnum = auxv[i].a_un.a_val, + elf.AT_PHDR => at_phdr = auxv[i].a_un.a_val, + else => continue, + } + } + + // Sanity check + assert(at_phent == @sizeOf(elf.Phdr)); + + // Search the TLS section + const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum]; + + for (phdrs) |*phdr| { + switch (phdr.p_type) { + elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr, + elf.PT_TLS => tls_phdr = phdr, + else => continue, + } + } + } else { + @panic("no auxv vector available!"); + } + + if (tls_phdr) |phdr| { + // Offsets into the allocated TLS area + var tcb_offset: usize = undefined; + var dtv_offset: usize = undefined; + var data_offset: usize = undefined; + var thread_data_offset: usize = undefined; + // Compute the total size of the ABI-specific data plus our own control + // structures + const alloc_size = switch (tls_variant) { + .VariantI => blk: { + var l: usize = 0; + dtv_offset = l; + l += @sizeOf(DTV); + thread_data_offset = l; + l += @sizeOf(CustomData); + l = mem.alignForward(l, phdr.p_align); + tcb_offset = l; + if (tls_tcb_align_size) { + l += mem.alignForward(tls_tcb_size, phdr.p_align); + } else { + l += tls_tcb_size; + } + data_offset = l; + l += phdr.p_memsz; + break :blk l; + }, + .VariantII => blk: { + var l: usize = 0; + data_offset = l; + l += phdr.p_memsz; + l = mem.alignForward(l, phdr.p_align); + 
tcb_offset = l; + l += tls_tcb_size; + thread_data_offset = l; + l += @sizeOf(CustomData); + dtv_offset = l; + l += @sizeOf(DTV); + break :blk l; + } + }; + + tls_image = TLSImage{ + .data_src = @intToPtr([*]u8, phdr.p_vaddr + img_base)[0..phdr.p_filesz], + .alloc_size = alloc_size, + .tcb_offset = tcb_offset, + .dtv_offset = dtv_offset, + .data_offset = data_offset, + }; + } +} + +pub fn copyTLS(addr: usize) usize { + const tls_img = tls_image orelse @panic("copyTLS called with no TLS section!"); + + // Be paranoid, clear the area we're going to use + @memset(@intToPtr([*]u8, addr), 0, tls_img.alloc_size); + // Prepare the DTV + const dtv = @intToPtr(*DTV, addr + tls_img.dtv_offset); + dtv.entries = 1; + dtv.tls_block[0] = addr + tls_img.data_offset + tls_dtv_offset; + // Set-up the TCB + const tcb_ptr = @intToPtr(*usize, addr + tls_img.tcb_offset); + if (tls_variant == TLSVariant.VariantI) { + tcb_ptr.* = addr + tls_img.dtv_offset; + } else { + tcb_ptr.* = addr + tls_img.tcb_offset; + } + // Copy the data + @memcpy(@intToPtr([*]u8, addr + tls_img.data_offset), tls_img.data_src.ptr, tls_img.data_src.len); + + // Return the corrected (if needed) value for the tp register + return addr + tls_img.tcb_offset + tls_tp_offset; +} + +var main_thread_tls_buffer: [64]u8 align(32) = undefined; + +pub fn allocateTLS(size: usize) usize { + assert(size < main_thread_tls_buffer.len); + return @ptrToInt(&main_thread_tls_buffer); +} diff --git a/std/special/bootstrap.zig b/std/special/bootstrap.zig index d0f8a3a7bd..23e81d2ab1 100644 --- a/std/special/bootstrap.zig +++ b/std/special/bootstrap.zig @@ -67,24 +67,19 @@ fn posixCallMainAndExit() noreturn { var envp_count: usize = 0; while (envp_optional[envp_count]) |_| : (envp_count += 1) {} const envp = @ptrCast([*][*]u8, envp_optional)[0..envp_count]; + if (builtin.os == builtin.Os.linux) { - // Scan auxiliary vector. 
const auxv = @ptrCast([*]std.elf.Auxv, envp.ptr + envp_count + 1); std.os.linux_elf_aux_maybe = auxv; - var i: usize = 0; - var at_phdr: usize = 0; - var at_phnum: usize = 0; - var at_phent: usize = 0; - while (auxv[i].a_un.a_val != 0) : (i += 1) { - switch (auxv[i].a_type) { - std.elf.AT_PAGESZ => assert(auxv[i].a_un.a_val == std.os.page_size), - std.elf.AT_PHDR => at_phdr = auxv[i].a_un.a_val, - std.elf.AT_PHNUM => at_phnum = auxv[i].a_un.a_val, - std.elf.AT_PHENT => at_phent = auxv[i].a_un.a_val, - else => {}, + + std.os.linux.tls.initTLS(); + if (!builtin.single_threaded) { + if (std.os.linux.tls.tls_image) |tls_img| { + const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size); + const tp = std.os.linux.tls.copyTLS(tls_addr); + std.os.linux.tls.setThreadPointer(tp); } } - if (!builtin.single_threaded) linuxInitializeThreadLocalStorage(at_phdr, at_phnum, at_phent); } std.os.posix.exit(callMainWithArgs(argc, argv, envp)); @@ -140,50 +135,3 @@ inline fn callMain() u8 { const main_thread_tls_align = 32; var main_thread_tls_bytes: [64]u8 align(main_thread_tls_align) = [1]u8{0} ** 64; - -fn linuxInitializeThreadLocalStorage(at_phdr: usize, at_phnum: usize, at_phent: usize) void { - var phdr_addr = at_phdr; - var n = at_phnum; - var base: usize = 0; - while (n != 0) : ({ - n -= 1; - phdr_addr += at_phent; - }) { - const phdr = @intToPtr(*std.elf.Phdr, phdr_addr); - // TODO look for PT_DYNAMIC when we have https://github.com/ziglang/zig/issues/1917 - switch (phdr.p_type) { - std.elf.PT_PHDR => base = at_phdr - phdr.p_vaddr, - std.elf.PT_TLS => std.os.linux_tls_phdr = phdr, - else => continue, - } - } - const tls_phdr = std.os.linux_tls_phdr orelse return; - std.os.linux_tls_img_src = @intToPtr([*]const u8, base + tls_phdr.p_vaddr); - const end_addr = @ptrToInt(&main_thread_tls_bytes) + tls_phdr.p_memsz; - const max_end_addr = @ptrToInt(&main_thread_tls_bytes) + main_thread_tls_bytes.len; - assert(max_end_addr >= end_addr + @sizeOf(usize)); // not enough 
preallocated Thread Local Storage - assert(main_thread_tls_align >= tls_phdr.p_align); // preallocated Thread Local Storage not aligned enough - @memcpy(&main_thread_tls_bytes, std.os.linux_tls_img_src, tls_phdr.p_filesz); - const end_ptr = @intToPtr(*usize, end_addr); - end_ptr.* = end_addr; - linuxSetThreadArea(end_addr); -} - -fn linuxSetThreadArea(addr: usize) void { - switch (builtin.arch) { - builtin.Arch.x86_64 => { - const ARCH_SET_FS = 0x1002; - const rc = std.os.linux.syscall2(std.os.linux.SYS_arch_prctl, ARCH_SET_FS, addr); - // acrh_prctl is documented to never fail - assert(rc == 0); - }, - builtin.Arch.aarch64 => { - asm volatile ( - \\ msr tpidr_el0,x0 - \\ mov w0,#0 - \\ ret - ); - }, - else => @compileError("Unsupported architecture"), - } -}