diff --git a/lib/std/heap.zig b/lib/std/heap.zig
index d1874c5b00..10e4cec608 100644
--- a/lib/std/heap.zig
+++ b/lib/std/heap.zig
@@ -9,11 +9,12 @@ const Allocator = std.mem.Allocator;
 const windows = std.os.windows;
 
 pub const ArenaAllocator = @import("heap/arena_allocator.zig").ArenaAllocator;
-pub const WasmAllocator = @import("heap/WasmAllocator.zig");
-pub const PageAllocator = @import("heap/PageAllocator.zig");
-pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
-pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
+pub const SmpAllocator = @import("heap/SmpAllocator.zig");
 pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig");
+pub const PageAllocator = @import("heap/PageAllocator.zig");
+pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
+pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
+pub const WasmAllocator = @import("heap/WasmAllocator.zig");
 pub const DebugAllocatorConfig = @import("heap/debug_allocator.zig").Config;
 pub const DebugAllocator = @import("heap/debug_allocator.zig").DebugAllocator;
 
@@ -358,6 +359,11 @@ else if (builtin.target.isWasm()) .{
     .vtable = &PageAllocator.vtable,
 };
 
+pub const smp_allocator: Allocator = .{
+    .ptr = undefined,
+    .vtable = &SmpAllocator.vtable,
+};
+
 /// This allocator is fast, small, and specific to WebAssembly. In the future,
 /// this will be the implementation automatically selected by
 /// `GeneralPurposeAllocator` when compiling in `ReleaseSmall` mode for wasm32
@@ -978,4 +984,5 @@ test {
     if (builtin.target.isWasm()) {
         _ = WasmAllocator;
     }
+    if (!builtin.single_threaded) _ = smp_allocator;
 }
diff --git a/lib/std/heap/PageAllocator.zig b/lib/std/heap/PageAllocator.zig
index 433e0f1218..a1eae13efc 100644
--- a/lib/std/heap/PageAllocator.zig
+++ b/lib/std/heap/PageAllocator.zig
@@ -16,11 +16,7 @@ pub const vtable: Allocator.VTable = .{
     .free = free,
 };
 
-fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
-    _ = context;
-    _ = ra;
-    assert(n > 0);
-
+pub fn map(n: usize, alignment: mem.Alignment) ?[*]u8 {
     const page_size = std.heap.pageSize();
     if (n >= maxInt(usize) - page_size) return null;
     const alignment_bytes = alignment.toByteUnits();
@@ -101,6 +97,13 @@ fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*
     return result_ptr;
 }
 
+fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    assert(n > 0);
+    return map(n, alignment);
+}
+
 fn resize(
     context: *anyopaque,
     memory: []u8,
@@ -114,7 +117,7 @@ fn resize(
     return realloc(memory, new_len, false) != null;
 }
 
-pub fn remap(
+fn remap(
     context: *anyopaque,
     memory: []u8,
     alignment: mem.Alignment,
@@ -127,21 +130,24 @@ pub fn remap(
     return realloc(memory, new_len, true);
 }
 
-fn free(context: *anyopaque, slice: []u8, alignment: mem.Alignment, return_address: usize) void {
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, return_address: usize) void {
     _ = context;
     _ = alignment;
     _ = return_address;
+    return unmap(@alignCast(memory));
+}
 
+pub fn unmap(memory: []align(page_size_min) u8) void {
     if (native_os == .windows) {
-        windows.VirtualFree(slice.ptr, 0, windows.MEM_RELEASE);
+        windows.VirtualFree(memory.ptr, 0, windows.MEM_RELEASE);
     } else {
-        const buf_aligned_len = mem.alignForward(usize, slice.len, std.heap.pageSize());
-        posix.munmap(@alignCast(slice.ptr[0..buf_aligned_len]));
+        const page_aligned_len = mem.alignForward(usize, memory.len, std.heap.pageSize());
+        posix.munmap(memory.ptr[0..page_aligned_len]);
     }
 }
 
-fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
-    const memory: []align(std.heap.page_size_min) u8 = @alignCast(uncasted_memory);
+pub fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
+    const memory: []align(page_size_min) u8 = @alignCast(uncasted_memory);
     const page_size = std.heap.pageSize();
     const new_size_aligned = mem.alignForward(usize, new_len, page_size);
 
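Note on the `PageAllocator` change: `map`, `unmap`, and `realloc` are now `pub`, so the page-level primitives can be called directly without going through the `Allocator` vtable; the new `SmpAllocator` below relies on them for slabs and for large allocations. A minimal round-trip sketch, not part of the patch, assuming only the signatures introduced above:

```zig
const std = @import("std");

test "PageAllocator.map/unmap round trip (sketch)" {
    const len = 4 * 1024 * 1024; // arbitrary example size
    const page_align = std.mem.Alignment.fromByteUnits(std.heap.pageSize());
    // `map` returns page-aligned memory, or null when the mapping fails.
    const ptr = std.heap.PageAllocator.map(len, page_align) orelse return error.OutOfMemory;
    const bytes: []u8 = ptr[0..len];
    // `unmap` wants a page_size_min-aligned slice covering the whole mapping.
    const memory: []align(std.heap.page_size_min) u8 = @alignCast(bytes);
    defer std.heap.PageAllocator.unmap(memory);
    memory[0] = 0xAA;
    try std.testing.expectEqual(@as(u8, 0xAA), memory[0]);
}
```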
diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig
new file mode 100644
index 0000000000..6fd68c19b2
--- /dev/null
+++ b/lib/std/heap/SmpAllocator.zig
@@ -0,0 +1,288 @@
+//! An allocator that is designed for ReleaseFast optimization mode, with
+//! multi-threading enabled.
+//!
+//! This allocator is a singleton; it uses global state, and only one should
+//! be instantiated for the entire process.
+//!
+//! ## Basic Design
+//!
+//! Avoid locking the global mutex as much as possible.
+//!
+//! Each thread gets a separate freelist; however, the data must be recoverable
+//! when the thread exits. We do not directly learn when a thread exits, so
+//! occasionally, one thread must attempt to reclaim another thread's
+//! resources.
+//!
+//! Above a certain size, allocations are memory-mapped directly, with no
+//! storage of allocation metadata. This works because the implementation
+//! refuses resizes that would move an allocation from the small category to
+//! the large category or vice versa.
+//!
+//! Each allocator operation checks the thread identifier from a threadlocal
+//! variable to find out which metadata in the global state to access, and
+//! attempts to grab its lock. This will usually succeed without contention,
+//! unless another thread has been assigned the same id. In the case of such
+//! contention, the thread moves on to the next thread metadata slot and
+//! repeats the process of attempting to obtain the lock.
+//!
+//! Limiting the thread-local metadata array to the CPU count ensures that, as
+//! threads are created and destroyed, they cycle through the full set of
+//! freelists.
+
+const builtin = @import("builtin");
+const native_os = builtin.os.tag;
+
+const std = @import("../std.zig");
+const assert = std.debug.assert;
+const mem = std.mem;
+const math = std.math;
+const Allocator = std.mem.Allocator;
+const SmpAllocator = @This();
+const PageAllocator = std.heap.PageAllocator;
+
+/// Protects the state in this struct (global state), except for `threads`,
+/// which each have their own mutex.
+mutex: std.Thread.Mutex,
+next_thread_index: u32,
+cpu_count: u32,
+threads: [max_thread_count]Thread,
+
+var global: SmpAllocator = .{
+    .mutex = .{},
+    .next_thread_index = 0,
+    .threads = @splat(.{}),
+    .cpu_count = 0,
+};
+threadlocal var thread_id: Thread.Id = .none;
+
+const max_thread_count = 128;
+const slab_len: usize = @max(std.heap.page_size_max, switch (builtin.os.tag) {
+    .windows => 64 * 1024, // Makes `std.heap.PageAllocator` take the happy path.
+    .wasi => 64 * 1024, // Max alignment supported by `std.heap.WasmAllocator`.
+    else => 256 * 1024, // Avoids too many active mappings when `page_size_max` is low.
+});
+/// Because of storing free list pointers, the minimum size class is 3.
+const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize)));
+const size_class_count = math.log2(slab_len) - min_class;
+
+const Thread = struct {
+    /// Avoid false sharing.
+    _: void align(std.atomic.cache_line) = {},
+
+    /// Protects the state in this struct (per-thread state).
+    ///
+    /// Threads lock this before accessing their own state in order
+    /// to support freelist reclamation.
+    mutex: std.Thread.Mutex = .{},
+
+    next_addrs: [size_class_count]usize = @splat(0),
+    /// For each size class, the most recently freed slot (head of the freelist).
+    frees: [size_class_count]usize = @splat(0),
+
+    /// Index into `SmpAllocator.threads`.
+    const Id = enum(usize) {
+        none = 0,
+        first = 1,
+        _,
+
+        fn fromIndex(index: usize) Id {
+            return @enumFromInt(index + 1);
+        }
+
+        fn toIndex(id: Id) usize {
+            return @intFromEnum(id) - 1;
+        }
+    };
+
+    fn lock() *Thread {
+        const id = thread_id;
+        if (id != .none) {
+            var index = id.toIndex();
+            {
+                const t = &global.threads[index];
+                if (t.mutex.tryLock()) return t;
+            }
+            const cpu_count = global.cpu_count;
+            assert(cpu_count != 0);
+            while (true) {
+                index = (index + 1) % cpu_count;
+                const t = &global.threads[index];
+                if (t.mutex.tryLock()) {
+                    thread_id = .fromIndex(index);
+                    return t;
+                }
+            }
+        }
+        while (true) {
+            const thread_index = i: {
+                global.mutex.lock();
+                defer global.mutex.unlock();
+                const cpu_count = c: {
+                    const cpu_count = global.cpu_count;
+                    if (cpu_count == 0) {
+                        const n: u32 = @intCast(@min(std.Thread.getCpuCount() catch max_thread_count, max_thread_count));
+                        global.cpu_count = n;
+                        break :c n;
+                    }
+                    break :c cpu_count;
+                };
+                const thread_index = global.next_thread_index;
+                global.next_thread_index = @intCast((thread_index + 1) % cpu_count);
+                break :i thread_index;
+            };
+            const t = &global.threads[thread_index];
+            if (t.mutex.tryLock()) {
+                thread_id = .fromIndex(thread_index);
+                return t;
+            }
+        }
+    }
+
+    fn unlock(t: *Thread) void {
+        t.mutex.unlock();
+    }
+};
+
+pub const vtable: Allocator.VTable = .{
+    .alloc = alloc,
+    .resize = resize,
+    .remap = remap,
+    .free = free,
+};
+
+comptime {
+    assert(!builtin.single_threaded); // you're holding it wrong
+}
+
+fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(len, alignment);
+    if (class >= size_class_count) {
+        @branchHint(.unlikely);
+        return PageAllocator.map(len, alignment);
+    }
+
+    const t = Thread.lock();
+    defer t.unlock();
+
+    const slot_size = slotSize(class);
+
+    const top_free_ptr = t.frees[class];
+    if (top_free_ptr != 0) {
+        const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize)));
+        t.frees[class] = node.*;
+        return @ptrFromInt(top_free_ptr);
+    }
+
+    const next_addr = t.next_addrs[class];
+    if (next_addr % slab_len == 0) {
+        const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null;
+        t.next_addrs[class] = @intFromPtr(slab) + slot_size;
+        return slab;
+    }
+
+    t.next_addrs[class] = next_addr + slot_size;
+    return @ptrFromInt(next_addr);
+}
+
+fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    const new_class = sizeClassIndex(new_len, alignment);
+    if (class >= size_class_count) {
+        if (new_class < size_class_count) return false;
+        return PageAllocator.realloc(memory, new_len, false) != null;
+    }
+    return new_class == class;
+}
+
+fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    const new_class = sizeClassIndex(new_len, alignment);
+    if (class >= size_class_count) {
+        if (new_class < size_class_count) return null;
+        return PageAllocator.realloc(memory, new_len, true);
+    }
+    return if (new_class == class) memory.ptr else null;
+}
+
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    if (class >= size_class_count) {
+        @branchHint(.unlikely);
+        return PageAllocator.unmap(@alignCast(memory));
+    }
+
+    const t = Thread.lock();
+    defer t.unlock();
+
+    const slot_size = slotSize(class);
+    const addr = @intFromPtr(memory.ptr);
+    const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize)));
+    node.* = t.frees[class];
+    t.frees[class] = addr;
+}
+
+fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize {
+    return @max(
+        @bitSizeOf(usize) - @clz(len - 1),
+        @intFromEnum(alignment),
+        min_class,
+    );
+}
+
+fn slotSize(class: usize) usize {
+    const Log2USize = std.math.Log2Int(usize);
+    return @as(usize, 1) << @as(Log2USize, @intCast(class));
+}
+
+test "large allocations - alloc and free" {
+    const gpa = std.heap.smp_allocator;
+
+    const ptr1 = try gpa.alloc(u64, 42768);
+    const ptr2 = try gpa.alloc(u64, 52768);
+    gpa.free(ptr1);
+    const ptr3 = try gpa.alloc(u64, 62768);
+    gpa.free(ptr3);
+    gpa.free(ptr2);
+}
+
+test "small allocations - free in same order" {
+    const gpa = std.heap.smp_allocator;
+
+    var list = std.ArrayList(*u64).init(std.testing.allocator);
+    defer list.deinit();
+
+    var i: usize = 0;
+    while (i < 513) : (i += 1) {
+        const ptr = try gpa.create(u64);
+        try list.append(ptr);
+    }
+
+    for (list.items) |ptr| {
+        gpa.destroy(ptr);
+    }
+}
+
+test "small allocations - free in reverse order" {
+    const gpa = std.heap.smp_allocator;
+
+    var list = std.ArrayList(*u64).init(std.testing.allocator);
+    defer list.deinit();
+
+    var i: usize = 0;
+    while (i < 513) : (i += 1) {
+        const ptr = try gpa.create(u64);
+        try list.append(ptr);
+    }
+
+    while (list.popOrNull()) |ptr| {
+        gpa.destroy(ptr);
+    }
+}
diff --git a/lib/std/heap/WasmAllocator.zig b/lib/std/heap/WasmAllocator.zig
index e30ac5ab01..0a9003f245 100644
--- a/lib/std/heap/WasmAllocator.zig
+++ b/lib/std/heap/WasmAllocator.zig
@@ -1,5 +1,3 @@
-//! This is intended to be merged into GeneralPurposeAllocator at some point.
-
 const std = @import("../std.zig");
 const builtin = @import("builtin");
 const Allocator = std.mem.Allocator;
diff --git a/lib/std/heap/debug_allocator.zig b/lib/std/heap/debug_allocator.zig
index 8abf6133bf..296014aa3f 100644
--- a/lib/std/heap/debug_allocator.zig
+++ b/lib/std/heap/debug_allocator.zig
@@ -851,8 +851,6 @@ pub fn DebugAllocator(comptime config: Config) type {
             self.mutex.lock();
             defer self.mutex.unlock();
 
-            assert(old_memory.len != 0);
-
             const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment));
             if (size_class_index >= self.buckets.len) {
                 @branchHint(.unlikely);
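For readers following the size-class math: `sizeClassIndex` and `slotSize` are private to `SmpAllocator.zig`, so the sketch below re-derives the same formulas purely for illustration. It is not part of the patch, and the concrete numbers assume a 64-bit target where `@sizeOf(usize) == 8`.

```zig
const std = @import("std");

test "size class arithmetic (illustrative mirror)" {
    // Same formula as SmpAllocator's `min_class`: smallest power of two that can
    // hold one freelist pointer plus one byte. On 64-bit: log2(16) == 4.
    const min_class = std.math.log2(std.math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize)));
    try std.testing.expectEqual(@as(usize, 4), min_class);

    // A 48-byte, 1-byte-aligned request: 64 - @clz(47) == 6, i.e. the 64-byte slot.
    const class_48: usize = @max(@bitSizeOf(usize) - @clz(@as(usize, 48) - 1), 0, min_class);
    try std.testing.expectEqual(@as(usize, 6), class_48);

    // An 8-byte request with 64-byte alignment (log2 == 6): the alignment wins the @max.
    const class_aligned: usize = @max(@bitSizeOf(usize) - @clz(@as(usize, 8) - 1), 6, min_class);
    try std.testing.expectEqual(@as(usize, 6), class_aligned);

    // `slotSize` is simply 2^class.
    const slot_size = @as(usize, 1) << @as(std.math.Log2Int(usize), @intCast(class_48));
    try std.testing.expectEqual(@as(usize, 64), slot_size);
}
```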
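The tests above only exercise the allocator from the test runner's thread. A sketch of the intended multi-threaded usage follows; the thread and iteration counts are arbitrary illustration values, and it assumes a build where `builtin.single_threaded` is false (otherwise the `comptime` assert rejects any reference to `SmpAllocator`).

```zig
const std = @import("std");

test "smp_allocator: concurrent alloc/free (sketch)" {
    const gpa = std.heap.smp_allocator;

    const Worker = struct {
        fn run(allocator: std.mem.Allocator) void {
            var i: usize = 0;
            while (i < 1000) : (i += 1) {
                // Mix small requests (served from per-thread freelists) with an
                // occasional large one (memory-mapped directly).
                const len: usize = if (i % 100 == 0) 2 * 1024 * 1024 else 1 + i % 512;
                const buf = allocator.alloc(u8, len) catch return;
                defer allocator.free(buf);
                buf[0] = @truncate(i);
            }
        }
    };

    var threads: [4]std.Thread = undefined;
    for (&threads) |*t| t.* = try std.Thread.spawn(.{}, Worker.run, .{gpa});
    for (threads) |t| t.join();
}
```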