From 51c4ffa410d2cf51b7b45dab4dfd033db2190b7e Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 03:31:32 -0800 Subject: [PATCH 01/13] add std.heap.SmpAllocator An allocator intended to be used in -OReleaseFast mode when multi-threading is enabled. --- lib/std/heap.zig | 15 +- lib/std/heap/PageAllocator.zig | 30 ++-- lib/std/heap/SmpAllocator.zig | 288 +++++++++++++++++++++++++++++++ lib/std/heap/WasmAllocator.zig | 2 - lib/std/heap/debug_allocator.zig | 2 - 5 files changed, 317 insertions(+), 20 deletions(-) create mode 100644 lib/std/heap/SmpAllocator.zig diff --git a/lib/std/heap.zig b/lib/std/heap.zig index d1874c5b00..10e4cec608 100644 --- a/lib/std/heap.zig +++ b/lib/std/heap.zig @@ -9,11 +9,12 @@ const Allocator = std.mem.Allocator; const windows = std.os.windows; pub const ArenaAllocator = @import("heap/arena_allocator.zig").ArenaAllocator; -pub const WasmAllocator = @import("heap/WasmAllocator.zig"); -pub const PageAllocator = @import("heap/PageAllocator.zig"); -pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig"); -pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator; +pub const SmpAllocator = @import("heap/SmpAllocator.zig"); pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig"); +pub const PageAllocator = @import("heap/PageAllocator.zig"); +pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator; +pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig"); +pub const WasmAllocator = @import("heap/WasmAllocator.zig"); pub const DebugAllocatorConfig = @import("heap/debug_allocator.zig").Config; pub const DebugAllocator = @import("heap/debug_allocator.zig").DebugAllocator; @@ -358,6 +359,11 @@ else if (builtin.target.isWasm()) .{ .vtable = &PageAllocator.vtable, }; +pub const smp_allocator: Allocator = .{ + .ptr = undefined, + .vtable = &SmpAllocator.vtable, +}; + /// This allocator is fast, small, and specific to WebAssembly. 
In the future, /// this will be the implementation automatically selected by /// `GeneralPurposeAllocator` when compiling in `ReleaseSmall` mode for wasm32 @@ -978,4 +984,5 @@ test { if (builtin.target.isWasm()) { _ = WasmAllocator; } + if (!builtin.single_threaded) _ = smp_allocator; } diff --git a/lib/std/heap/PageAllocator.zig b/lib/std/heap/PageAllocator.zig index 433e0f1218..a1eae13efc 100644 --- a/lib/std/heap/PageAllocator.zig +++ b/lib/std/heap/PageAllocator.zig @@ -16,11 +16,7 @@ pub const vtable: Allocator.VTable = .{ .free = free, }; -fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 { - _ = context; - _ = ra; - assert(n > 0); - +pub fn map(n: usize, alignment: mem.Alignment) ?[*]u8 { const page_size = std.heap.pageSize(); if (n >= maxInt(usize) - page_size) return null; const alignment_bytes = alignment.toByteUnits(); @@ -101,6 +97,13 @@ fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[* return result_ptr; } +fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 { + _ = context; + _ = ra; + assert(n > 0); + return map(n, alignment); +} + fn resize( context: *anyopaque, memory: []u8, @@ -114,7 +117,7 @@ fn resize( return realloc(memory, new_len, false) != null; } -pub fn remap( +fn remap( context: *anyopaque, memory: []u8, alignment: mem.Alignment, @@ -127,21 +130,24 @@ pub fn remap( return realloc(memory, new_len, true); } -fn free(context: *anyopaque, slice: []u8, alignment: mem.Alignment, return_address: usize) void { +fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, return_address: usize) void { _ = context; _ = alignment; _ = return_address; + return unmap(@alignCast(memory)); +} +pub fn unmap(memory: []align(page_size_min) u8) void { if (native_os == .windows) { - windows.VirtualFree(slice.ptr, 0, windows.MEM_RELEASE); + windows.VirtualFree(memory.ptr, 0, windows.MEM_RELEASE); } else { - const buf_aligned_len = mem.alignForward(usize, slice.len, std.heap.pageSize()); - posix.munmap(@alignCast(slice.ptr[0..buf_aligned_len])); + const page_aligned_len = mem.alignForward(usize, memory.len, std.heap.pageSize()); + posix.munmap(memory.ptr[0..page_aligned_len]); } } -fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 { - const memory: []align(std.heap.page_size_min) u8 = @alignCast(uncasted_memory); +pub fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 { + const memory: []align(page_size_min) u8 = @alignCast(uncasted_memory); const page_size = std.heap.pageSize(); const new_size_aligned = mem.alignForward(usize, new_len, page_size); diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig new file mode 100644 index 0000000000..6fd68c19b2 --- /dev/null +++ b/lib/std/heap/SmpAllocator.zig @@ -0,0 +1,288 @@ +//! An allocator that is designed for ReleaseFast optimization mode, with +//! multi-threading enabled. +//! +//! This allocator is a singleton; it uses global state and only one should be +//! instantiated for the entire process. +//! +//! ## Basic Design +//! +//! Avoid locking the global mutex as much as possible. +//! +//! Each thread gets a separate freelist, however, the data must be recoverable +//! when the thread exits. We do not directly learn when a thread exits, so +//! occasionally, one thread must attempt to reclaim another thread's +//! resources. +//! +//! Above a certain size, those allocations are memory mapped directly, with no +//! storage of allocation metadata. 
This works because the implementation +//! refuses resizes that would move an allocation from small category to large +//! category or vice versa. +//! +//! Each allocator operation checks the thread identifier from a threadlocal +//! variable to find out which metadata in the global state to access, and +//! attempts to grab its lock. This will usually succeed without contention, +//! unless another thread has been assigned the same id. In the case of such +//! contention, the thread moves on to the next thread metadata slot and +//! repeats the process of attempting to obtain the lock. +//! +//! By limiting the thread-local metadata array to the same number as the CPU +//! count, ensures that as threads are created and destroyed, they cycle +//! through the full set of freelists. + +const builtin = @import("builtin"); +const native_os = builtin.os.tag; + +const std = @import("../std.zig"); +const assert = std.debug.assert; +const mem = std.mem; +const math = std.math; +const Allocator = std.mem.Allocator; +const SmpAllocator = @This(); +const PageAllocator = std.heap.PageAllocator; + +/// Protects the state in this struct (global state), except for `threads` +/// which each have their own mutex. +mutex: std.Thread.Mutex, +next_thread_index: u32, +cpu_count: u32, +threads: [max_thread_count]Thread, + +var global: SmpAllocator = .{ + .mutex = .{}, + .next_thread_index = 0, + .threads = @splat(.{}), + .cpu_count = 0, +}; +threadlocal var thread_id: Thread.Id = .none; + +const max_thread_count = 128; +const slab_len: usize = @max(std.heap.page_size_max, switch (builtin.os.tag) { + .windows => 64 * 1024, // Makes `std.heap.PageAllocator` take the happy path. + .wasi => 64 * 1024, // Max alignment supported by `std.heap.WasmAllocator`. + else => 256 * 1024, // Avoids too many active mappings when `page_size_max` is low. +}); +/// Because of storing free list pointers, the minimum size class is 3. +const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize))); +const size_class_count = math.log2(slab_len) - min_class; + +const Thread = struct { + /// Avoid false sharing. + _: void align(std.atomic.cache_line) = {}, + + /// Protects the state in this struct (per-thread state). + /// + /// Threads lock this before accessing their own state in order + /// to support freelist reclamation. + mutex: std.Thread.Mutex = .{}, + + next_addrs: [size_class_count]usize = @splat(0), + /// For each size class, points to the freed pointer. + frees: [size_class_count]usize = @splat(0), + + /// Index into `SmpAllocator.threads`. 
+ const Id = enum(usize) { + none = 0, + first = 1, + _, + + fn fromIndex(index: usize) Id { + return @enumFromInt(index + 1); + } + + fn toIndex(id: Id) usize { + return @intFromEnum(id) - 1; + } + }; + + fn lock() *Thread { + const id = thread_id; + if (id != .none) { + var index = id.toIndex(); + { + const t = &global.threads[index]; + if (t.mutex.tryLock()) return t; + } + const cpu_count = global.cpu_count; + assert(cpu_count != 0); + while (true) { + index = (index + 1) % cpu_count; + const t = &global.threads[index]; + if (t.mutex.tryLock()) { + thread_id = .fromIndex(index); + return t; + } + } + } + while (true) { + const thread_index = i: { + global.mutex.lock(); + defer global.mutex.unlock(); + const cpu_count = c: { + const cpu_count = global.cpu_count; + if (cpu_count == 0) { + const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); + global.cpu_count = n; + break :c n; + } + break :c cpu_count; + }; + const thread_index = global.next_thread_index; + global.next_thread_index = @intCast((thread_index + 1) % cpu_count); + break :i thread_index; + }; + const t = &global.threads[thread_index]; + if (t.mutex.tryLock()) { + thread_id = .fromIndex(thread_index); + return t; + } + } + } + + fn unlock(t: *Thread) void { + t.mutex.unlock(); + } +}; + +pub const vtable: Allocator.VTable = .{ + .alloc = alloc, + .resize = resize, + .remap = remap, + .free = free, +}; + +comptime { + assert(!builtin.single_threaded); // you're holding it wrong +} + +fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 { + _ = context; + _ = ra; + const class = sizeClassIndex(len, alignment); + if (class >= size_class_count) { + @branchHint(.unlikely); + return PageAllocator.map(len, alignment); + } + + const t = Thread.lock(); + defer t.unlock(); + + const slot_size = slotSize(class); + + const top_free_ptr = t.frees[class]; + if (top_free_ptr != 0) { + const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); + t.frees[class] = node.*; + return @ptrFromInt(top_free_ptr); + } + + const next_addr = t.next_addrs[class]; + if (next_addr % slab_len == 0) { + const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; + t.next_addrs[class] = @intFromPtr(slab) + slot_size; + return slab; + } + + t.next_addrs[class] = next_addr + slot_size; + return @ptrFromInt(next_addr); +} + +fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool { + _ = context; + _ = ra; + const class = sizeClassIndex(memory.len, alignment); + const new_class = sizeClassIndex(new_len, alignment); + if (class >= size_class_count) { + if (new_class < size_class_count) return false; + return PageAllocator.realloc(memory, new_len, false) != null; + } + return new_class == class; +} + +fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 { + _ = context; + _ = ra; + const class = sizeClassIndex(memory.len, alignment); + const new_class = sizeClassIndex(new_len, alignment); + if (class >= size_class_count) { + if (new_class < size_class_count) return null; + return PageAllocator.realloc(memory, new_len, true); + } + return if (new_class == class) memory.ptr else null; +} + +fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void { + _ = context; + _ = ra; + const class = sizeClassIndex(memory.len, alignment); + if (class >= size_class_count) { + @branchHint(.unlikely); + return 
PageAllocator.unmap(@alignCast(memory)); + } + + const t = Thread.lock(); + defer t.unlock(); + + const slot_size = slotSize(class); + const addr = @intFromPtr(memory.ptr); + const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize))); + node.* = t.frees[class]; + t.frees[class] = addr; +} + +fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { + return @max( + @bitSizeOf(usize) - @clz(len - 1), + @intFromEnum(alignment), + min_class, + ); +} + +fn slotSize(class: usize) usize { + const Log2USize = std.math.Log2Int(usize); + return @as(usize, 1) << @as(Log2USize, @intCast(class)); +} + +test "large alloc, resize, remap, free" { + const gpa = std.heap.smp_allocator; + + const ptr1 = try gpa.alloc(u64, 42768); + const ptr2 = try gpa.alloc(u64, 52768); + gpa.free(ptr1); + const ptr3 = try gpa.alloc(u64, 62768); + gpa.free(ptr3); + gpa.free(ptr2); +} + +test "small allocations - free in same order" { + const gpa = std.heap.smp_allocator; + + var list = std.ArrayList(*u64).init(std.testing.allocator); + defer list.deinit(); + + var i: usize = 0; + while (i < 513) : (i += 1) { + const ptr = try gpa.create(u64); + try list.append(ptr); + } + + for (list.items) |ptr| { + gpa.destroy(ptr); + } +} + +test "small allocations - free in reverse order" { + const gpa = std.heap.smp_allocator; + + var list = std.ArrayList(*u64).init(std.testing.allocator); + defer list.deinit(); + + var i: usize = 0; + while (i < 513) : (i += 1) { + const ptr = try gpa.create(u64); + try list.append(ptr); + } + + while (list.popOrNull()) |ptr| { + gpa.destroy(ptr); + } +} diff --git a/lib/std/heap/WasmAllocator.zig b/lib/std/heap/WasmAllocator.zig index e30ac5ab01..0a9003f245 100644 --- a/lib/std/heap/WasmAllocator.zig +++ b/lib/std/heap/WasmAllocator.zig @@ -1,5 +1,3 @@ -//! This is intended to be merged into GeneralPurposeAllocator at some point. 
- const std = @import("../std.zig"); const builtin = @import("builtin"); const Allocator = std.mem.Allocator; diff --git a/lib/std/heap/debug_allocator.zig b/lib/std/heap/debug_allocator.zig index 8abf6133bf..296014aa3f 100644 --- a/lib/std/heap/debug_allocator.zig +++ b/lib/std/heap/debug_allocator.zig @@ -851,8 +851,6 @@ pub fn DebugAllocator(comptime config: Config) type { self.mutex.lock(); defer self.mutex.unlock(); - assert(old_memory.len != 0); - const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment)); if (size_class_index >= self.buckets.len) { @branchHint(.unlikely); From 3d7c5cf64a3bbe55dfa943133d1a5a7a34fe388c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 14:14:12 -0800 Subject: [PATCH 02/13] std.heap: test smp_allocator --- lib/std/heap.zig | 11 +++++++-- lib/std/heap/SmpAllocator.zig | 45 ----------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/lib/std/heap.zig b/lib/std/heap.zig index 10e4cec608..290b39b624 100644 --- a/lib/std/heap.zig +++ b/lib/std/heap.zig @@ -481,7 +481,7 @@ pub fn StackFallbackAllocator(comptime size: usize) type { }; } -test "c_allocator" { +test c_allocator { if (builtin.link_libc) { try testAllocator(c_allocator); try testAllocatorAligned(c_allocator); @@ -490,12 +490,19 @@ test "c_allocator" { } } -test "raw_c_allocator" { +test raw_c_allocator { if (builtin.link_libc) { try testAllocator(raw_c_allocator); } } +test smp_allocator { + try testAllocator(smp_allocator); + try testAllocatorAligned(smp_allocator); + try testAllocatorLargeAlignment(smp_allocator); + try testAllocatorAlignedShrink(smp_allocator); +} + test PageAllocator { const allocator = page_allocator; try testAllocator(allocator); diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 6fd68c19b2..b1f2b14d0a 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -241,48 +241,3 @@ fn slotSize(class: usize) usize { const Log2USize = std.math.Log2Int(usize); return @as(usize, 1) << @as(Log2USize, @intCast(class)); } - -test "large alloc, resize, remap, free" { - const gpa = std.heap.smp_allocator; - - const ptr1 = try gpa.alloc(u64, 42768); - const ptr2 = try gpa.alloc(u64, 52768); - gpa.free(ptr1); - const ptr3 = try gpa.alloc(u64, 62768); - gpa.free(ptr3); - gpa.free(ptr2); -} - -test "small allocations - free in same order" { - const gpa = std.heap.smp_allocator; - - var list = std.ArrayList(*u64).init(std.testing.allocator); - defer list.deinit(); - - var i: usize = 0; - while (i < 513) : (i += 1) { - const ptr = try gpa.create(u64); - try list.append(ptr); - } - - for (list.items) |ptr| { - gpa.destroy(ptr); - } -} - -test "small allocations - free in reverse order" { - const gpa = std.heap.smp_allocator; - - var list = std.ArrayList(*u64).init(std.testing.allocator); - defer list.deinit(); - - var i: usize = 0; - while (i < 513) : (i += 1) { - const ptr = try gpa.create(u64); - try list.append(ptr); - } - - while (list.popOrNull()) |ptr| { - gpa.destroy(ptr); - } -} From 84bf7a6701d5f51a8307736d310d4049c9158921 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 17:35:27 -0800 Subject: [PATCH 03/13] std.heap.SmpAllocator: 256K slab_len and no need for special handling of wasi and windows since we don't ask for anything more than page-aligned. 
--- lib/std/heap/SmpAllocator.zig | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index b1f2b14d0a..1a7eab2153 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -30,7 +30,6 @@ //! through the full set of freelists. const builtin = @import("builtin"); -const native_os = builtin.os.tag; const std = @import("../std.zig"); const assert = std.debug.assert; @@ -56,11 +55,7 @@ var global: SmpAllocator = .{ threadlocal var thread_id: Thread.Id = .none; const max_thread_count = 128; -const slab_len: usize = @max(std.heap.page_size_max, switch (builtin.os.tag) { - .windows => 64 * 1024, // Makes `std.heap.PageAllocator` take the happy path. - .wasi => 64 * 1024, // Max alignment supported by `std.heap.WasmAllocator`. - else => 256 * 1024, // Avoids too many active mappings when `page_size_max` is low. -}); +const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); /// Because of storing free list pointers, the minimum size class is 3. const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize))); const size_class_count = math.log2(slab_len) - min_class; From 7360be19a461175d1a50dfb1d7c086bcc650b3c1 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 17:41:50 -0800 Subject: [PATCH 04/13] compiler: use std.heap.smp_allocator In main, now this allocator is chosen by default when compiling without libc in ReleaseFast or ReleaseSmall, and not targeting WebAssembly. --- bootstrap.c | 2 +- build.zig | 6 +++--- src/main.zig | 35 ++++++++++++++++++----------------- stage1/config.zig.in | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/bootstrap.c b/bootstrap.c index a37834f463..b6345987a9 100644 --- a/bootstrap.c +++ b/bootstrap.c @@ -139,7 +139,7 @@ int main(int argc, char **argv) { "pub const enable_tracy = false;\n" "pub const value_tracing = false;\n" "pub const skip_non_native = false;\n" - "pub const force_gpa = false;\n" + "pub const debug_gpa = false;\n" "pub const dev = .core;\n" "pub const value_interpret_mode = .direct;\n" , zig_version); diff --git a/build.zig b/build.zig index 14685ffd29..1de11b2db8 100644 --- a/build.zig +++ b/build.zig @@ -171,7 +171,7 @@ pub fn build(b: *std.Build) !void { const tracy_callstack = b.option(bool, "tracy-callstack", "Include callstack information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null); const tracy_allocation = b.option(bool, "tracy-allocation", "Include allocation information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null); const tracy_callstack_depth: u32 = b.option(u32, "tracy-callstack-depth", "Declare callstack depth for Tracy data. 
Does nothing if -Dtracy_callstack is not provided") orelse 10; - const force_gpa = b.option(bool, "force-gpa", "Force the compiler to use GeneralPurposeAllocator") orelse false; + const debug_gpa = b.option(bool, "debug-allocator", "Force the compiler to use DebugAllocator") orelse false; const link_libc = b.option(bool, "force-link-libc", "Force self-hosted compiler to link libc") orelse (enable_llvm or only_c); const sanitize_thread = b.option(bool, "sanitize-thread", "Enable thread-sanitization") orelse false; const strip = b.option(bool, "strip", "Omit debug information"); @@ -233,7 +233,7 @@ pub fn build(b: *std.Build) !void { exe_options.addOption(bool, "llvm_has_csky", llvm_has_csky); exe_options.addOption(bool, "llvm_has_arc", llvm_has_arc); exe_options.addOption(bool, "llvm_has_xtensa", llvm_has_xtensa); - exe_options.addOption(bool, "force_gpa", force_gpa); + exe_options.addOption(bool, "debug_gpa", debug_gpa); exe_options.addOption(DevEnv, "dev", b.option(DevEnv, "dev", "Build a compiler with a reduced feature set for development of specific features") orelse if (only_c) .bootstrap else .full); exe_options.addOption(ValueInterpretMode, "value_interpret_mode", value_interpret_mode); @@ -608,7 +608,7 @@ fn addWasiUpdateStep(b: *std.Build, version: [:0]const u8) !void { exe_options.addOption(u32, "mem_leak_frames", 0); exe_options.addOption(bool, "have_llvm", false); - exe_options.addOption(bool, "force_gpa", false); + exe_options.addOption(bool, "debug_gpa", false); exe_options.addOption([:0]const u8, "version", version); exe_options.addOption(std.SemanticVersion, "semver", semver); exe_options.addOption(bool, "enable_debug_extensions", false); diff --git a/src/main.zig b/src/main.zig index a00261cc30..401f6b2296 100644 --- a/src/main.zig +++ b/src/main.zig @@ -171,30 +171,31 @@ pub fn log( std.debug.print(prefix1 ++ prefix2 ++ format ++ "\n", args); } -var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{ +var debug_allocator: std.heap.DebugAllocator(.{ .stack_trace_frames = build_options.mem_leak_frames, -}){}; +}) = .init; pub fn main() anyerror!void { crash_report.initialize(); - const use_gpa = (build_options.force_gpa or !builtin.link_libc) and native_os != .wasi; - const gpa = gpa: { - if (native_os == .wasi) { - break :gpa std.heap.wasm_allocator; + const gpa, const is_debug = gpa: { + if (build_options.debug_gpa) break :gpa .{ debug_allocator.allocator(), true }; + if (native_os == .wasi) break :gpa .{ std.heap.wasm_allocator, false }; + if (builtin.link_libc) { + // We would prefer to use raw libc allocator here, but cannot use + // it if it won't support the alignment we need. + if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) { + break :gpa .{ std.heap.c_allocator, false }; + } + break :gpa .{ std.heap.raw_c_allocator, false }; } - if (use_gpa) { - break :gpa general_purpose_allocator.allocator(); - } - // We would prefer to use raw libc allocator here, but cannot - // use it if it won't support the alignment we need. 
- if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) { - break :gpa std.heap.c_allocator; - } - break :gpa std.heap.raw_c_allocator; + break :gpa switch (builtin.mode) { + .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true }, + .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false }, + }; }; - defer if (use_gpa) { - _ = general_purpose_allocator.deinit(); + defer if (is_debug) { + _ = debug_allocator.deinit(); }; var arena_instance = std.heap.ArenaAllocator.init(gpa); defer arena_instance.deinit(); diff --git a/stage1/config.zig.in b/stage1/config.zig.in index 47d4b4e85f..d5c9a7ebbf 100644 --- a/stage1/config.zig.in +++ b/stage1/config.zig.in @@ -11,6 +11,6 @@ pub const enable_link_snapshots = false; pub const enable_tracy = false; pub const value_tracing = false; pub const skip_non_native = false; -pub const force_gpa = false; +pub const debug_gpa = false; pub const dev = .core; pub const value_interpret_mode = .direct; From 60765a9ee2d66d4c2b870e726aa7e7bc2590e2b8 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 19:48:02 -0800 Subject: [PATCH 05/13] std.heap.SmpAllocator: implement searching on alloc rotate a couple times before resorting to mapping more memory. --- lib/std/heap/SmpAllocator.zig | 69 ++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 1a7eab2153..4d10624310 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -158,27 +158,53 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? return PageAllocator.map(len, alignment); } - const t = Thread.lock(); - defer t.unlock(); - const slot_size = slotSize(class); + const max_search = 2; + var search_count: u32 = 0; - const top_free_ptr = t.frees[class]; - if (top_free_ptr != 0) { - const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); - t.frees[class] = node.*; - return @ptrFromInt(top_free_ptr); + var t = Thread.lock(); + + outer: while (true) { + const top_free_ptr = t.frees[class]; + if (top_free_ptr != 0) { + @branchHint(.likely); + defer t.unlock(); + const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); + t.frees[class] = node.*; + return @ptrFromInt(top_free_ptr); + } + + const next_addr = t.next_addrs[class]; + if ((next_addr % slab_len) != 0) { + @branchHint(.likely); + defer t.unlock(); + t.next_addrs[class] = next_addr + slot_size; + return @ptrFromInt(next_addr); + } + + if (search_count >= max_search) { + @branchHint(.likely); + defer t.unlock(); + const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; + t.next_addrs[class] = @intFromPtr(slab) + slot_size; + return slab; + } + + t.unlock(); + t = undefined; + const cpu_count = global.cpu_count; + assert(cpu_count != 0); + var index = thread_id.toIndex(); + while (true) { + index = (index + 1) % cpu_count; + t = &global.threads[index]; + if (t.mutex.tryLock()) { + thread_id = .fromIndex(index); + search_count += 1; + continue :outer; + } + } } - - const next_addr = t.next_addrs[class]; - if (next_addr % slab_len == 0) { - const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; - t.next_addrs[class] = @intFromPtr(slab) + slot_size; - return slab; - } - - t.next_addrs[class] = next_addr + slot_size; - return @ptrFromInt(next_addr); } fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, 
new_len: usize, ra: usize) bool { @@ -214,12 +240,13 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) return PageAllocator.unmap(@alignCast(memory)); } - const t = Thread.lock(); - defer t.unlock(); - const slot_size = slotSize(class); const addr = @intFromPtr(memory.ptr); const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize))); + + const t = Thread.lock(); + defer t.unlock(); + node.* = t.frees[class]; t.frees[class] = addr; } From 839c453d880e22f2f120d62332b273380a215cc5 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 21:47:46 -0800 Subject: [PATCH 06/13] std.heap.SmpAllocator: eliminate the global mutex --- lib/std/heap/SmpAllocator.zig | 86 ++++++++++------------------------- 1 file changed, 24 insertions(+), 62 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 4d10624310..b4e7138291 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -39,20 +39,14 @@ const Allocator = std.mem.Allocator; const SmpAllocator = @This(); const PageAllocator = std.heap.PageAllocator; -/// Protects the state in this struct (global state), except for `threads` -/// which each have their own mutex. -mutex: std.Thread.Mutex, -next_thread_index: u32, cpu_count: u32, threads: [max_thread_count]Thread, var global: SmpAllocator = .{ - .mutex = .{}, - .next_thread_index = 0, .threads = @splat(.{}), .cpu_count = 0, }; -threadlocal var thread_id: Thread.Id = .none; +threadlocal var thread_index: u32 = 0; const max_thread_count = 128; const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); @@ -74,60 +68,22 @@ const Thread = struct { /// For each size class, points to the freed pointer. frees: [size_class_count]usize = @splat(0), - /// Index into `SmpAllocator.threads`. 
- const Id = enum(usize) { - none = 0, - first = 1, - _, - - fn fromIndex(index: usize) Id { - return @enumFromInt(index + 1); - } - - fn toIndex(id: Id) usize { - return @intFromEnum(id) - 1; - } - }; - fn lock() *Thread { - const id = thread_id; - if (id != .none) { - var index = id.toIndex(); - { - const t = &global.threads[index]; - if (t.mutex.tryLock()) return t; - } - const cpu_count = global.cpu_count; - assert(cpu_count != 0); - while (true) { - index = (index + 1) % cpu_count; - const t = &global.threads[index]; - if (t.mutex.tryLock()) { - thread_id = .fromIndex(index); - return t; - } + var index = thread_index; + { + const t = &global.threads[index]; + if (t.mutex.tryLock()) { + @branchHint(.likely); + return t; } } + const cpu_count = getCpuCount(); + assert(cpu_count != 0); while (true) { - const thread_index = i: { - global.mutex.lock(); - defer global.mutex.unlock(); - const cpu_count = c: { - const cpu_count = global.cpu_count; - if (cpu_count == 0) { - const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); - global.cpu_count = n; - break :c n; - } - break :c cpu_count; - }; - const thread_index = global.next_thread_index; - global.next_thread_index = @intCast((thread_index + 1) % cpu_count); - break :i thread_index; - }; - const t = &global.threads[thread_index]; + index = (index + 1) % cpu_count; + const t = &global.threads[index]; if (t.mutex.tryLock()) { - thread_id = .fromIndex(thread_index); + thread_index = index; return t; } } @@ -138,6 +94,13 @@ const Thread = struct { } }; +fn getCpuCount() u32 { + const cpu_count = @atomicLoad(u32, &global.cpu_count, .unordered); + if (cpu_count != 0) return cpu_count; + const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); + return if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| other else n; +} + pub const vtable: Allocator.VTable = .{ .alloc = alloc, .resize = resize, @@ -159,8 +122,8 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? } const slot_size = slotSize(class); - const max_search = 2; - var search_count: u32 = 0; + const max_search = 1; + var search_count: u8 = 0; var t = Thread.lock(); @@ -191,15 +154,14 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? 
} t.unlock(); - t = undefined; - const cpu_count = global.cpu_count; + const cpu_count = getCpuCount(); assert(cpu_count != 0); - var index = thread_id.toIndex(); + var index = thread_index; while (true) { index = (index + 1) % cpu_count; t = &global.threads[index]; if (t.mutex.tryLock()) { - thread_id = .fromIndex(index); + thread_index = index; search_count += 1; continue :outer; } From 1ffae59fec60816ff364b4ade93c6fc2fc1571cf Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 00:47:43 -0800 Subject: [PATCH 07/13] std.heap.SmpAllocator: fix using wrong size class indices --- lib/std/heap/SmpAllocator.zig | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index b4e7138291..c1c9ca5517 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -215,13 +215,11 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { return @max( - @bitSizeOf(usize) - @clz(len - 1), - @intFromEnum(alignment), - min_class, - ); + @bitSizeOf(usize) - @clz(len + (@sizeOf(usize) - 1)), + @intFromEnum(alignment) + 1, + ) - min_class; } fn slotSize(class: usize) usize { - const Log2USize = std.math.Log2Int(usize); - return @as(usize, 1) << @as(Log2USize, @intCast(class)); + return @as(usize, 1) << @intCast(class + min_class); } From 88e2e60e88ebaeb46b2642830cd8cd369dc737c0 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 00:58:01 -0800 Subject: [PATCH 08/13] std.heap.SmpAllocator: simplify by putting freelist node at start --- lib/std/heap/SmpAllocator.zig | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index c1c9ca5517..0c2dc4348d 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -51,7 +51,7 @@ threadlocal var thread_index: u32 = 0; const max_thread_count = 128; const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); /// Because of storing free list pointers, the minimum size class is 3. -const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize))); +const min_class = math.log2(@sizeOf(usize)); const size_class_count = math.log2(slab_len) - min_class; const Thread = struct { @@ -132,7 +132,7 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? 
if (top_free_ptr != 0) { @branchHint(.likely); defer t.unlock(); - const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); + const node: *usize = @ptrFromInt(top_free_ptr); t.frees[class] = node.*; return @ptrFromInt(top_free_ptr); } @@ -202,22 +202,17 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) return PageAllocator.unmap(@alignCast(memory)); } - const slot_size = slotSize(class); - const addr = @intFromPtr(memory.ptr); - const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize))); + const node: *usize = @alignCast(@ptrCast(memory.ptr)); const t = Thread.lock(); defer t.unlock(); node.* = t.frees[class]; - t.frees[class] = addr; + t.frees[class] = @intFromPtr(node); } fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { - return @max( - @bitSizeOf(usize) - @clz(len + (@sizeOf(usize) - 1)), - @intFromEnum(alignment) + 1, - ) - min_class; + return @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment), min_class) - min_class; } fn slotSize(class: usize) usize { From 3246150d4589a25cd1ad762a78d2806ef1da2a95 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 01:21:09 -0800 Subject: [PATCH 09/13] std.heap.SmpAllocator: fix getCpuCount logic it was always returning max_cpu_count --- lib/std/heap/SmpAllocator.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 0c2dc4348d..9dce341260 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -97,7 +97,7 @@ const Thread = struct { fn getCpuCount() u32 { const cpu_count = @atomicLoad(u32, &global.cpu_count, .unordered); if (cpu_count != 0) return cpu_count; - const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); + const n: u32 = @min(std.Thread.getCpuCount() catch max_thread_count, max_thread_count); return if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| other else n; } From a9d30056167cbe54c0c144199f01a56f6cdcdafc Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 02:04:56 -0800 Subject: [PATCH 10/13] std.heap.SmpAllocator: fix detection of slab end --- lib/std/heap/SmpAllocator.zig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 9dce341260..b3183b26a8 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -6,8 +6,6 @@ //! //! ## Basic Design //! -//! Avoid locking the global mutex as much as possible. -//! //! Each thread gets a separate freelist, however, the data must be recoverable //! when the thread exits. We do not directly learn when a thread exits, so //! occasionally, one thread must attempt to reclaim another thread's @@ -122,6 +120,7 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? } const slot_size = slotSize(class); + assert(slab_len % slot_size == 0); const max_search = 1; var search_count: u8 = 0; @@ -148,7 +147,8 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? if (search_count >= max_search) { @branchHint(.likely); defer t.unlock(); - const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; + // slab alignment here ensures the % slab len earlier catches the end of slots. 
+ const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null; t.next_addrs[class] = @intFromPtr(slab) + slot_size; return slab; } From 1754e014f5da09bf83a7ee1e53132325fd78d1c1 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 14:05:28 -0800 Subject: [PATCH 11/13] std.heap.SmpAllocator: rotate on free sometimes * slab length reduced to 64K * track freelist length with u8s * on free(), rotate if freelist length exceeds max_freelist_len Prevents memory leakage in the scenario where one thread only allocates and another thread only frees. --- lib/std/heap/SmpAllocator.zig | 55 ++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index b3183b26a8..50593687ec 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -47,10 +47,16 @@ var global: SmpAllocator = .{ threadlocal var thread_index: u32 = 0; const max_thread_count = 128; -const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); +const slab_len: usize = @max(std.heap.page_size_max, 64 * 1024); /// Because of storing free list pointers, the minimum size class is 3. const min_class = math.log2(@sizeOf(usize)); const size_class_count = math.log2(slab_len) - min_class; +/// When a freelist length exceeds this number, a `free` will rotate up to +/// `max_free_search` times before pushing. +const max_freelist_len: u8 = 16; +const max_free_search = 1; +/// Before mapping a fresh page, `alloc` will rotate this many times. +const max_alloc_search = 1; const Thread = struct { /// Avoid false sharing. @@ -62,9 +68,13 @@ const Thread = struct { /// to support freelist reclamation. mutex: std.Thread.Mutex = .{}, + /// For each size class, tracks the next address to be returned from + /// `alloc` when the freelist is empty. next_addrs: [size_class_count]usize = @splat(0), /// For each size class, points to the freed pointer. frees: [size_class_count]usize = @splat(0), + /// For each size class, tracks the number of items in the freelist. + freelist_lens: [size_class_count]u8 = @splat(0), fn lock() *Thread { var index = thread_index; @@ -121,7 +131,6 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? const slot_size = slotSize(class); assert(slab_len % slot_size == 0); - const max_search = 1; var search_count: u8 = 0; var t = Thread.lock(); @@ -133,6 +142,7 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? defer t.unlock(); const node: *usize = @ptrFromInt(top_free_ptr); t.frees[class] = node.*; + t.freelist_lens[class] -|= 1; return @ptrFromInt(top_free_ptr); } @@ -144,12 +154,13 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? return @ptrFromInt(next_addr); } - if (search_count >= max_search) { + if (search_count >= max_alloc_search) { @branchHint(.likely); defer t.unlock(); // slab alignment here ensures the % slab len earlier catches the end of slots. 
const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null; t.next_addrs[class] = @intFromPtr(slab) + slot_size; + t.freelist_lens[class] = 0; return slab; } @@ -203,12 +214,42 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) } const node: *usize = @alignCast(@ptrCast(memory.ptr)); + var search_count: u8 = 0; - const t = Thread.lock(); - defer t.unlock(); + var t = Thread.lock(); - node.* = t.frees[class]; - t.frees[class] = @intFromPtr(node); + outer: while (true) { + const freelist_len = t.freelist_lens[class]; + if (freelist_len < max_freelist_len) { + @branchHint(.likely); + defer t.unlock(); + node.* = t.frees[class]; + t.frees[class] = @intFromPtr(node); + return; + } + + if (search_count >= max_free_search) { + defer t.unlock(); + t.freelist_lens[class] = freelist_len +| 1; + node.* = t.frees[class]; + t.frees[class] = @intFromPtr(node); + return; + } + + t.unlock(); + const cpu_count = getCpuCount(); + assert(cpu_count != 0); + var index = thread_index; + while (true) { + index = (index + 1) % cpu_count; + t = &global.threads[index]; + if (t.mutex.tryLock()) { + thread_index = index; + search_count += 1; + continue :outer; + } + } + } } fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { From bfabb703e32c8bbca3724ee1fa79d565adc1a200 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 15:36:00 -0800 Subject: [PATCH 12/13] don't try to test SmpAllocator in single threaded mode --- lib/std/heap.zig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/std/heap.zig b/lib/std/heap.zig index 290b39b624..b728d0da7e 100644 --- a/lib/std/heap.zig +++ b/lib/std/heap.zig @@ -497,6 +497,7 @@ test raw_c_allocator { } test smp_allocator { + if (builtin.single_threaded) return; try testAllocator(smp_allocator); try testAllocatorAligned(smp_allocator); try testAllocatorLargeAlignment(smp_allocator); From 975cd9fc4ff8c12ae1f54e470b72be04d26e0837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Sat, 8 Feb 2025 05:31:27 +0100 Subject: [PATCH 13/13] musl: Align the stack pointer given to clone() on riscv. --- lib/libc/musl/src/thread/riscv32/clone.s | 2 ++ lib/libc/musl/src/thread/riscv64/clone.s | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lib/libc/musl/src/thread/riscv32/clone.s b/lib/libc/musl/src/thread/riscv32/clone.s index 3102239d0d..484f83a199 100644 --- a/lib/libc/musl/src/thread/riscv32/clone.s +++ b/lib/libc/musl/src/thread/riscv32/clone.s @@ -7,6 +7,8 @@ .global __clone .type __clone, %function __clone: + andi a1, a1, -16 + # Save func and arg to stack addi a1, a1, -16 sw a0, 0(a1) diff --git a/lib/libc/musl/src/thread/riscv64/clone.s b/lib/libc/musl/src/thread/riscv64/clone.s index db908248cd..187a28d2e7 100644 --- a/lib/libc/musl/src/thread/riscv64/clone.s +++ b/lib/libc/musl/src/thread/riscv64/clone.s @@ -7,6 +7,8 @@ .global __clone .type __clone, %function __clone: + andi a1, a1, -16 + # Save func and arg to stack addi a1, a1, -16 sd a0, 0(a1)
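
As a standalone illustration of how the finalized pieces fit together (not taken from the patches themselves), the sketch below restates the size-class arithmetic the series converges on in patch 08 and shows basic use of the new allocator. It assumes a toolchain that already ships std.heap.smp_allocator and std.mem.Alignment.fromByteUnits; the test names and sample sizes are illustrative only.

// Editor's sketch, not part of the patches. sizeClassIndex/slotSize mirror the
// shape they have after patch 08; everything else here is illustrative.
const std = @import("std");

/// The smallest slot must be able to hold one usize freelist link (patch 08).
const min_class = std.math.log2(@sizeOf(usize));

/// Class = log2 of the slot size, shifted so that class 0 is the usize-sized slot.
fn sizeClassIndex(len: usize, alignment: std.mem.Alignment) usize {
    return @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment), min_class) - min_class;
}

fn slotSize(class: usize) usize {
    return @as(usize, 1) << @intCast(class + min_class);
}

test "lengths round up to power-of-two slots" {
    try std.testing.expectEqual(@as(usize, @sizeOf(usize)), slotSize(sizeClassIndex(1, .fromByteUnits(1))));
    try std.testing.expectEqual(@as(usize, 16), slotSize(sizeClassIndex(9, .fromByteUnits(1))));
    // A stricter alignment can demand a larger slot than the length alone would.
    try std.testing.expectEqual(@as(usize, 64), slotSize(sizeClassIndex(24, .fromByteUnits(64))));
}

test "basic smp_allocator usage" {
    // Mirrors the guard from patch 12: SmpAllocator refuses single-threaded builds.
    if (@import("builtin").single_threaded) return;
    const gpa = std.heap.smp_allocator;

    const small = try gpa.alloc(u64, 100); // small class: carved from a thread-local slab
    defer gpa.free(small);

    const large = try gpa.alloc(u8, 1 << 20); // too big for the size classes: memory-mapped directly
    defer gpa.free(large);
}

Because slabs are mapped with slab_len alignment (patch 10) and slots are powers of two that divide slab_len, each slot is naturally aligned to its own size, which is why feeding the requested alignment into the max() above is sufficient and no separate alignment bookkeeping is needed.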
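
A standalone sketch (not from the patches) of the intrusive freelist layout that patch 08 settles on: the next-free link lives in the first word of the freed slot itself, so small allocations carry no side metadata. The push/pop helpers and the test are hypothetical names; only the pointer handling mirrors SmpAllocator.free and the freelist branch of alloc.

// Editor's sketch, not part of the patches; hypothetical helpers illustrating
// the intrusive freelist from patch 08.
const std = @import("std");

fn push(head: *usize, slot: [*]u8) void {
    const node: *usize = @alignCast(@ptrCast(slot));
    node.* = head.*; // link the freed slot to the previous head
    head.* = @intFromPtr(slot); // the freed slot becomes the new head
}

fn pop(head: *usize) ?[*]u8 {
    const top = head.*;
    if (top == 0) return null; // empty freelist
    const node: *usize = @ptrFromInt(top);
    head.* = node.*; // unlink the head
    return @as([*]u8, @ptrFromInt(top));
}

test "intrusive freelist pops in LIFO order" {
    var slots: [4][16]u8 align(@alignOf(usize)) = undefined;
    var head: usize = 0;
    for (&slots) |*s| push(&head, s);
    try std.testing.expect(pop(&head).? == @as([*]u8, @ptrCast(&slots[3])));
    try std.testing.expect(pop(&head).? == @as([*]u8, @ptrCast(&slots[2])));
}

This is also why the series can drop the end-of-slot node placement from patch 01: once the link sits at offset 0, free() no longer needs the slot size to locate it, and min_class only has to guarantee room for one usize.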