From cf4bccf76566ac112f9142863c3e4dbf81e71d08 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 11 Sep 2019 20:22:49 -0400 Subject: [PATCH] improvements targeted at improving async functions * Reuse bytes of async function frames when non-async functions make `noasync` calls. This prevents explosive stack growth. * Zig now passes a stack size argument to the linker when linking ELF binaries. Linux ignores this value, but it is available as a program header called GNU_STACK. I prototyped some code that memory-maps extra space for the stack using this program header, but there was still a problem when accessing stack memory very far down. Possibly stack probing is needed, or it is present but not working; the root cause is not yet known. I also prototyped using `@newStackCall` to call main, and that does work around the issue, but it brings problems of its own. That code is commented out for now in std/special/start.zig. I'm on a plane with no Internet, but I plan to consult with the musl community for advice when I get a chance. * Added `noasync` to a bunch of function calls in std.debug. It's very messy, but it's a workaround that makes stack traces functional with evented I/O enabled. Eventually these will be cleaned up as the root bugs are found and fixed. Programs built in blocking mode are unaffected. * Lowered the default stack size of std.io.InStream (for the async version) from 4 MiB to 1 MiB. Until choosing a stack size works (see the 2nd bullet point above), 4 MiB tends to cause segfaults, possibly because the stack runs out or because stack accesses land too far apart; the exact cause is not yet known. * Default thread stack size is bumped from 8 MiB to 16 MiB to match the size we give for the main thread. It's planned to eventually remove this hard-coded value and have Zig determine this value during semantic analysis, with call graph analysis and function pointer annotations and extern function annotations. 
--- src/codegen.cpp | 14 +++++++++++-- src/link.cpp | 5 +++++ std/debug.zig | 49 ++++++++++++++++++++++++++++++------------- std/io/in_stream.zig | 2 +- std/os/linux/tls.zig | 7 ++++++- std/special/start.zig | 43 ++++++++++++++++++++++++++----------- std/thread.zig | 2 +- 7 files changed, 90 insertions(+), 32 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index a9542f96d9..200589cd2a 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -7184,6 +7184,9 @@ static void do_code_gen(CodeGen *g) { if (!is_async) { // allocate async frames for noasync calls & awaits to async functions + ZigType *largest_call_frame_type = nullptr; + IrInstruction *all_calls_alloca = ir_create_alloca(g, &fn_table_entry->fndef_scope->base, + fn_table_entry->body_node, fn_table_entry, g->builtin_types.entry_void, "@async_call_frame"); for (size_t i = 0; i < fn_table_entry->call_list.length; i += 1) { IrInstructionCallGen *call = fn_table_entry->call_list.at(i); if (call->fn_entry == nullptr) @@ -7195,8 +7198,15 @@ static void do_code_gen(CodeGen *g) { if (call->frame_result_loc != nullptr) continue; ZigType *callee_frame_type = get_fn_frame_type(g, call->fn_entry); - call->frame_result_loc = ir_create_alloca(g, call->base.scope, call->base.source_node, - fn_table_entry, callee_frame_type, ""); + if (largest_call_frame_type == nullptr || + callee_frame_type->abi_size > largest_call_frame_type->abi_size) + { + largest_call_frame_type = callee_frame_type; + } + call->frame_result_loc = all_calls_alloca; + } + if (largest_call_frame_type != nullptr) { + all_calls_alloca->value.type = get_pointer_to_type(g, largest_call_frame_type, false); } // allocate temporary stack data for (size_t alloca_i = 0; alloca_i < fn_table_entry->alloca_gen_list.length; alloca_i += 1) { diff --git a/src/link.cpp b/src/link.cpp index 1130481dce..b10220d5da 100644 --- a/src/link.cpp +++ b/src/link.cpp @@ -1615,6 +1615,11 @@ static void construct_linker_job_elf(LinkJob *lj) { 
lj->args.append("-error-limit=0"); + if (g->out_type == OutTypeExe) { + lj->args.append("-z"); + lj->args.append("stack-size=16777216"); // default to 16 MiB + } + if (g->linker_script) { lj->args.append("-T"); lj->args.append(g->linker_script); diff --git a/std/debug.zig b/std/debug.zig index 377e6e4845..9fd2c0ff53 100644 --- a/std/debug.zig +++ b/std/debug.zig @@ -1478,10 +1478,11 @@ const LineNumberProgram = struct { } }; +// TODO the noasyncs here are workarounds fn readStringRaw(allocator: *mem.Allocator, in_stream: var) ![]u8 { var buf = ArrayList(u8).init(allocator); while (true) { - const byte = try in_stream.readByte(); + const byte = try noasync in_stream.readByte(); if (byte == 0) break; try buf.append(byte); } @@ -1494,10 +1495,11 @@ fn getString(di: *DwarfInfo, offset: u64) ![]u8 { return di.readString(); } +// TODO the noasyncs here are workarounds fn readAllocBytes(allocator: *mem.Allocator, in_stream: var, size: usize) ![]u8 { const buf = try allocator.alloc(u8, size); errdefer allocator.free(buf); - if ((try in_stream.read(buf)) < size) return error.EndOfFile; + if ((try noasync in_stream.read(buf)) < size) return error.EndOfFile; return buf; } @@ -1506,8 +1508,9 @@ fn parseFormValueBlockLen(allocator: *mem.Allocator, in_stream: var, size: usize return FormValue{ .Block = buf }; } +// TODO the noasyncs here are workarounds fn parseFormValueBlock(allocator: *mem.Allocator, in_stream: var, size: usize) !FormValue { - const block_len = try in_stream.readVarInt(usize, builtin.Endian.Little, size); + const block_len = try noasync in_stream.readVarInt(usize, builtin.Endian.Little, size); return parseFormValueBlockLen(allocator, in_stream, block_len); } @@ -1537,27 +1540,37 @@ fn parseFormValueConstant(allocator: *mem.Allocator, in_stream: var, signed: boo }; } +// TODO the noasyncs here are workarounds fn parseFormValueDwarfOffsetSize(in_stream: var, is_64: bool) !u64 { - return if (is_64) try in_stream.readIntLittle(u64) else u64(try 
in_stream.readIntLittle(u32)); + return if (is_64) try noasync in_stream.readIntLittle(u64) else u64(try noasync in_stream.readIntLittle(u32)); } +// TODO the noasyncs here are workarounds fn parseFormValueTargetAddrSize(in_stream: var) !u64 { - return if (@sizeOf(usize) == 4) u64(try in_stream.readIntLittle(u32)) else if (@sizeOf(usize) == 8) try in_stream.readIntLittle(u64) else unreachable; + if (@sizeOf(usize) == 4) { + return u64(try noasync in_stream.readIntLittle(u32)); + } else if (@sizeOf(usize) == 8) { + return noasync in_stream.readIntLittle(u64); + } else { + unreachable; + } } +// TODO the noasyncs here are workarounds fn parseFormValueRef(allocator: *mem.Allocator, in_stream: var, size: i32) !FormValue { return FormValue{ .Ref = switch (size) { - 1 => try in_stream.readIntLittle(u8), - 2 => try in_stream.readIntLittle(u16), - 4 => try in_stream.readIntLittle(u32), - 8 => try in_stream.readIntLittle(u64), - -1 => try leb.readULEB128(u64, in_stream), + 1 => try noasync in_stream.readIntLittle(u8), + 2 => try noasync in_stream.readIntLittle(u16), + 4 => try noasync in_stream.readIntLittle(u32), + 8 => try noasync in_stream.readIntLittle(u64), + -1 => try noasync leb.readULEB128(u64, in_stream), else => unreachable, }, }; } +// TODO the noasyncs here are workarounds fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64: bool) anyerror!FormValue { return switch (form_id) { DW.FORM_addr => FormValue{ .Address = try parseFormValueTargetAddrSize(in_stream) }, @@ -1565,7 +1578,7 @@ fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64 DW.FORM_block2 => parseFormValueBlock(allocator, in_stream, 2), DW.FORM_block4 => parseFormValueBlock(allocator, in_stream, 4), DW.FORM_block => x: { - const block_len = try leb.readULEB128(usize, in_stream); + const block_len = try noasync leb.readULEB128(usize, in_stream); return parseFormValueBlockLen(allocator, in_stream, block_len); }, DW.FORM_data1 => 
parseFormValueConstant(allocator, in_stream, false, 1), @@ -1577,11 +1590,11 @@ fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64 return parseFormValueConstant(allocator, in_stream, signed, -1); }, DW.FORM_exprloc => { - const size = try leb.readULEB128(usize, in_stream); + const size = try noasync leb.readULEB128(usize, in_stream); const buf = try readAllocBytes(allocator, in_stream, size); return FormValue{ .ExprLoc = buf }; }, - DW.FORM_flag => FormValue{ .Flag = (try in_stream.readByte()) != 0 }, + DW.FORM_flag => FormValue{ .Flag = (try noasync in_stream.readByte()) != 0 }, DW.FORM_flag_present => FormValue{ .Flag = true }, DW.FORM_sec_offset => FormValue{ .SecOffset = try parseFormValueDwarfOffsetSize(in_stream, is_64) }, @@ -1592,12 +1605,12 @@ fn parseFormValue(allocator: *mem.Allocator, in_stream: var, form_id: u64, is_64 DW.FORM_ref_udata => parseFormValueRef(allocator, in_stream, -1), DW.FORM_ref_addr => FormValue{ .RefAddr = try parseFormValueDwarfOffsetSize(in_stream, is_64) }, - DW.FORM_ref_sig8 => FormValue{ .Ref = try in_stream.readIntLittle(u64) }, + DW.FORM_ref_sig8 => FormValue{ .Ref = try noasync in_stream.readIntLittle(u64) }, DW.FORM_string => FormValue{ .String = try readStringRaw(allocator, in_stream) }, DW.FORM_strp => FormValue{ .StrPtr = try parseFormValueDwarfOffsetSize(in_stream, is_64) }, DW.FORM_indirect => { - const child_form_id = try leb.readULEB128(u64, in_stream); + const child_form_id = try noasync leb.readULEB128(u64, in_stream); const F = @typeOf(async parseFormValue(allocator, in_stream, child_form_id, is_64)); var frame = try allocator.create(F); defer allocator.destroy(frame); @@ -2400,3 +2413,9 @@ stdcallcc fn handleSegfaultWindows(info: *windows.EXCEPTION_POINTERS) c_long { else => return windows.EXCEPTION_CONTINUE_SEARCH, } } + +pub fn dumpStackPointerAddr(prefix: []const u8) void { + const sp = asm ("" : [argc] "={rsp}" (-> usize)); + std.debug.warn("{} sp = 0x{x}\n", prefix, sp); +} + 
diff --git a/std/io/in_stream.zig b/std/io/in_stream.zig index c617f10500..44c74fcca4 100644 --- a/std/io/in_stream.zig +++ b/std/io/in_stream.zig @@ -6,7 +6,7 @@ const assert = std.debug.assert; const mem = std.mem; const Buffer = std.Buffer; -pub const default_stack_size = 4 * 1024 * 1024; +pub const default_stack_size = 1 * 1024 * 1024; pub const stack_size: usize = if (@hasDecl(root, "stack_size_std_io_InStream")) root.stack_size_std_io_InStream else diff --git a/std/os/linux/tls.zig b/std/os/linux/tls.zig index 62df870944..8afc751401 100644 --- a/std/os/linux/tls.zig +++ b/std/os/linux/tls.zig @@ -125,7 +125,7 @@ pub fn setThreadPointer(addr: usize) void { } } -pub fn initTLS() void { +pub fn initTLS() ?*elf.Phdr { var tls_phdr: ?*elf.Phdr = null; var img_base: usize = 0; @@ -152,10 +152,13 @@ pub fn initTLS() void { // Search the TLS section const phdrs = (@intToPtr([*]elf.Phdr, at_phdr))[0..at_phnum]; + var gnu_stack: ?*elf.Phdr = null; + for (phdrs) |*phdr| { switch (phdr.p_type) { elf.PT_PHDR => img_base = at_phdr - phdr.p_vaddr, elf.PT_TLS => tls_phdr = phdr, + elf.PT_GNU_STACK => gnu_stack = phdr, else => continue, } } @@ -217,6 +220,8 @@ pub fn initTLS() void { .data_offset = data_offset, }; } + + return gnu_stack; } pub fn copyTLS(addr: usize) usize { diff --git a/std/special/start.zig b/std/special/start.zig index fde79a4baf..31639821b4 100644 --- a/std/special/start.zig +++ b/std/special/start.zig @@ -5,7 +5,7 @@ const std = @import("std"); const builtin = @import("builtin"); const assert = std.debug.assert; -var argc_ptr: [*]usize = undefined; +var starting_stack_ptr: [*]usize = undefined; const is_wasm = switch (builtin.arch) { .wasm32, .wasm64 => true, @@ -35,17 +35,17 @@ nakedcc fn _start() noreturn { switch (builtin.arch) { .x86_64 => { - argc_ptr = asm ("" + starting_stack_ptr = asm ("" : [argc] "={rsp}" (-> [*]usize) ); }, .i386 => { - argc_ptr = asm ("" + starting_stack_ptr = asm ("" : [argc] "={esp}" (-> [*]usize) ); }, .aarch64, 
.aarch64_be, .arm => { - argc_ptr = asm ("mov %[argc], sp" + starting_stack_ptr = asm ("mov %[argc], sp" : [argc] "=r" (-> [*]usize) ); }, @@ -72,8 +72,8 @@ fn posixCallMainAndExit() noreturn { if (builtin.os == builtin.Os.freebsd) { @setAlignStack(16); } - const argc = argc_ptr[0]; - const argv = @ptrCast([*][*]u8, argc_ptr + 1); + const argc = starting_stack_ptr[0]; + const argv = @ptrCast([*][*]u8, starting_stack_ptr + 1); const envp_optional = @ptrCast([*]?[*]u8, argv + argc + 1); var envp_count: usize = 0; @@ -85,21 +85,40 @@ fn posixCallMainAndExit() noreturn { const auxv = @ptrCast([*]std.elf.Auxv, envp.ptr + envp_count + 1); std.os.linux.elf_aux_maybe = auxv; // Initialize the TLS area - std.os.linux.tls.initTLS(); + const gnu_stack_phdr = std.os.linux.tls.initTLS() orelse @panic("ELF missing stack size"); if (std.os.linux.tls.tls_image) |tls_img| { const tls_addr = std.os.linux.tls.allocateTLS(tls_img.alloc_size); const tp = std.os.linux.tls.copyTLS(tls_addr); std.os.linux.tls.setThreadPointer(tp); } + + // TODO This is disabled because what should we do when linking libc and this code + // does not execute? And also it's causing a test failure in stack traces in release modes. + + //// Linux ignores the stack size from the ELF file, and instead always does 8 MiB. A further + //// problem is that it uses PROT_GROWSDOWN which prevents stores to addresses too far down + //// the stack and requires "probing". So here we allocate our own stack. + //const wanted_stack_size = gnu_stack_phdr.p_memsz; + //assert(wanted_stack_size % std.mem.page_size == 0); + //// Allocate an extra page as the guard page. 
+ //const total_size = wanted_stack_size + std.mem.page_size; + //const new_stack = std.os.mmap( + // null, + // total_size, + // std.os.PROT_READ | std.os.PROT_WRITE, + // std.os.MAP_PRIVATE | std.os.MAP_ANONYMOUS, + // -1, + // 0, + //) catch @panic("out of memory"); + //std.os.mprotect(new_stack[0..std.mem.page_size], std.os.PROT_NONE) catch {}; + //std.os.exit(@newStackCall(new_stack, callMainWithArgs, argc, argv, envp)); } - std.os.exit(callMainWithArgs(argc, argv, envp)); + std.os.exit(@inlineCall(callMainWithArgs, argc, argv, envp)); } -// This is marked inline because for some reason LLVM in release mode fails to inline it, -// and we want fewer call frames in stack traces. -inline fn callMainWithArgs(argc: usize, argv: [*][*]u8, envp: [][*]u8) u8 { +fn callMainWithArgs(argc: usize, argv: [*][*]u8, envp: [][*]u8) u8 { std.os.argv = argv[0..argc]; std.os.environ = envp; @@ -112,7 +131,7 @@ extern fn main(c_argc: i32, c_argv: [*][*]u8, c_envp: [*]?[*]u8) i32 { var env_count: usize = 0; while (c_envp[env_count] != null) : (env_count += 1) {} const envp = @ptrCast([*][*]u8, c_envp)[0..env_count]; - return callMainWithArgs(@intCast(usize, c_argc), c_argv, envp); + return @inlineCall(callMainWithArgs, @intCast(usize, c_argc), c_argv, envp); } // General error message for a malformed return type diff --git a/std/thread.zig b/std/thread.zig index abf2f1cae1..278fcc827c 100644 --- a/std/thread.zig +++ b/std/thread.zig @@ -145,7 +145,7 @@ pub const Thread = struct { if (builtin.single_threaded) @compileError("cannot spawn thread when building in single-threaded mode"); // TODO compile-time call graph analysis to determine stack upper bound // https://github.com/ziglang/zig/issues/157 - const default_stack_size = 8 * 1024 * 1024; + const default_stack_size = 16 * 1024 * 1024; const Context = @typeOf(context); comptime assert(@ArgType(@typeOf(startFn), 0) == Context);