diff --git a/bootstrap.c b/bootstrap.c
index a37834f463..b6345987a9 100644
--- a/bootstrap.c
+++ b/bootstrap.c
@@ -139,7 +139,7 @@ int main(int argc, char **argv) {
         "pub const enable_tracy = false;\n"
         "pub const value_tracing = false;\n"
         "pub const skip_non_native = false;\n"
-        "pub const force_gpa = false;\n"
+        "pub const debug_gpa = false;\n"
         "pub const dev = .core;\n"
         "pub const value_interpret_mode = .direct;\n"
     , zig_version);
diff --git a/build.zig b/build.zig
index 12c76939a4..5ef2a76908 100644
--- a/build.zig
+++ b/build.zig
@@ -171,7 +171,7 @@ pub fn build(b: *std.Build) !void {
     const tracy_callstack = b.option(bool, "tracy-callstack", "Include callstack information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null);
     const tracy_allocation = b.option(bool, "tracy-allocation", "Include allocation information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null);
     const tracy_callstack_depth: u32 = b.option(u32, "tracy-callstack-depth", "Declare callstack depth for Tracy data. Does nothing if -Dtracy_callstack is not provided") orelse 10;
-    const force_gpa = b.option(bool, "force-gpa", "Force the compiler to use GeneralPurposeAllocator") orelse false;
+    const debug_gpa = b.option(bool, "debug-allocator", "Force the compiler to use DebugAllocator") orelse false;
     const link_libc = b.option(bool, "force-link-libc", "Force self-hosted compiler to link libc") orelse (enable_llvm or only_c);
     const sanitize_thread = b.option(bool, "sanitize-thread", "Enable thread-sanitization") orelse false;
     const strip = b.option(bool, "strip", "Omit debug information");
@@ -233,7 +233,7 @@ pub fn build(b: *std.Build) !void {
     exe_options.addOption(bool, "llvm_has_csky", llvm_has_csky);
     exe_options.addOption(bool, "llvm_has_arc", llvm_has_arc);
     exe_options.addOption(bool, "llvm_has_xtensa", llvm_has_xtensa);
-    exe_options.addOption(bool, "force_gpa", force_gpa);
+    exe_options.addOption(bool, "debug_gpa", debug_gpa);
     exe_options.addOption(DevEnv, "dev", b.option(DevEnv, "dev", "Build a compiler with a reduced feature set for development of specific features") orelse if (only_c) .bootstrap else .full);
     exe_options.addOption(ValueInterpretMode, "value_interpret_mode", value_interpret_mode);
 
@@ -608,7 +608,7 @@ fn addWasiUpdateStep(b: *std.Build, version: [:0]const u8) !void {
 
     exe_options.addOption(u32, "mem_leak_frames", 0);
     exe_options.addOption(bool, "have_llvm", false);
-    exe_options.addOption(bool, "force_gpa", false);
+    exe_options.addOption(bool, "debug_gpa", false);
     exe_options.addOption([:0]const u8, "version", version);
     exe_options.addOption(std.SemanticVersion, "semver", semver);
     exe_options.addOption(bool, "enable_debug_extensions", false);
diff --git a/lib/libc/musl/src/thread/riscv32/clone.s b/lib/libc/musl/src/thread/riscv32/clone.s
index 3102239d0d..484f83a199 100644
--- a/lib/libc/musl/src/thread/riscv32/clone.s
+++ b/lib/libc/musl/src/thread/riscv32/clone.s
@@ -7,6 +7,8 @@
 .global __clone
 .type __clone, %function
 __clone:
+	andi a1, a1, -16
+
 	# Save func and arg to stack
 	addi a1, a1, -16
 	sw a0, 0(a1)
diff --git a/lib/libc/musl/src/thread/riscv64/clone.s b/lib/libc/musl/src/thread/riscv64/clone.s
index db908248cd..187a28d2e7 100644
--- a/lib/libc/musl/src/thread/riscv64/clone.s
+++ b/lib/libc/musl/src/thread/riscv64/clone.s
@@ -7,6 +7,8 @@
 .global __clone
 .type __clone, %function
 __clone:
+	andi a1, a1, -16
+
 	# Save func and arg to stack
 	addi a1, a1, -16
 	sd a0, 0(a1)
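The build changes above are a rename only: the old `-Dforce-gpa` option is now passed to `zig build` as `-Ddebug-allocator` and reaches the compiler as `build_options.debug_gpa`. The musl `clone.s` hunks, by contrast, change behavior: the new `andi a1, a1, -16` rounds the child's stack pointer down to a 16-byte boundary, as the standard RISC-V calling convention requires, before the existing `addi a1, a1, -16` reserves space for `func` and `arg`. A small Zig test, purely illustrative and not part of the patch, showing the same align-down trick (`-16` in two's complement is all ones except the low four bits):

    const std = @import("std");

    test "align a stack pointer down to 16 bytes" {
        const sp: usize = 0x7fff_1234_5679;
        // Same operation as `andi a1, a1, -16`: clear the low four bits.
        const aligned = sp & ~@as(usize, 15);
        try std.testing.expectEqual(@as(usize, 0x7fff_1234_5670), aligned);
        try std.testing.expect(aligned % 16 == 0 and aligned <= sp);
    }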
diff --git a/lib/std/heap.zig b/lib/std/heap.zig
index d1874c5b00..b728d0da7e 100644
--- a/lib/std/heap.zig
+++ b/lib/std/heap.zig
@@ -9,11 +9,12 @@ const Allocator = std.mem.Allocator;
 const windows = std.os.windows;
 
 pub const ArenaAllocator = @import("heap/arena_allocator.zig").ArenaAllocator;
-pub const WasmAllocator = @import("heap/WasmAllocator.zig");
-pub const PageAllocator = @import("heap/PageAllocator.zig");
-pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
-pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
+pub const SmpAllocator = @import("heap/SmpAllocator.zig");
 pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig");
+pub const PageAllocator = @import("heap/PageAllocator.zig");
+pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
+pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
+pub const WasmAllocator = @import("heap/WasmAllocator.zig");
 pub const DebugAllocatorConfig = @import("heap/debug_allocator.zig").Config;
 pub const DebugAllocator = @import("heap/debug_allocator.zig").DebugAllocator;
 
@@ -358,6 +359,11 @@ else if (builtin.target.isWasm()) .{
     .vtable = &PageAllocator.vtable,
 };
 
+pub const smp_allocator: Allocator = .{
+    .ptr = undefined,
+    .vtable = &SmpAllocator.vtable,
+};
+
 /// This allocator is fast, small, and specific to WebAssembly. In the future,
 /// this will be the implementation automatically selected by
 /// `GeneralPurposeAllocator` when compiling in `ReleaseSmall` mode for wasm32
@@ -475,7 +481,7 @@ pub fn StackFallbackAllocator(comptime size: usize) type {
     };
 }
 
-test "c_allocator" {
+test c_allocator {
     if (builtin.link_libc) {
         try testAllocator(c_allocator);
         try testAllocatorAligned(c_allocator);
@@ -484,12 +490,20 @@ pub fn StackFallbackAllocator(comptime size: usize) type {
     }
 }
 
-test "raw_c_allocator" {
+test raw_c_allocator {
     if (builtin.link_libc) {
         try testAllocator(raw_c_allocator);
     }
 }
 
+test smp_allocator {
+    if (builtin.single_threaded) return;
+    try testAllocator(smp_allocator);
+    try testAllocatorAligned(smp_allocator);
+    try testAllocatorLargeAlignment(smp_allocator);
+    try testAllocatorAlignedShrink(smp_allocator);
+}
+
 test PageAllocator {
     const allocator = page_allocator;
     try testAllocator(allocator);
@@ -978,4 +992,5 @@ test {
     if (builtin.target.isWasm()) {
         _ = WasmAllocator;
     }
+    if (!builtin.single_threaded) _ = smp_allocator;
 }
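The `smp_allocator` added above is a plain `std.mem.Allocator` value backed by the process-wide `SmpAllocator` state, so there is nothing to instantiate or deinitialize. A minimal usage sketch, illustrative rather than part of the patch (it assumes a multi-threaded build, which `SmpAllocator` enforces at comptime):

    const std = @import("std");

    pub fn main() !void {
        const gpa = std.heap.smp_allocator; // no init/deinit; shared by all threads
        const buf = try gpa.alloc(u8, 4096);
        defer gpa.free(buf);
        @memset(buf, 0);
    }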
diff --git a/lib/std/heap/PageAllocator.zig b/lib/std/heap/PageAllocator.zig
index 433e0f1218..a1eae13efc 100644
--- a/lib/std/heap/PageAllocator.zig
+++ b/lib/std/heap/PageAllocator.zig
@@ -16,11 +16,7 @@ pub const vtable: Allocator.VTable = .{
     .free = free,
 };
 
-fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
-    _ = context;
-    _ = ra;
-    assert(n > 0);
-
+pub fn map(n: usize, alignment: mem.Alignment) ?[*]u8 {
     const page_size = std.heap.pageSize();
     if (n >= maxInt(usize) - page_size) return null;
     const alignment_bytes = alignment.toByteUnits();
@@ -101,6 +97,13 @@ fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*
     return result_ptr;
 }
 
+fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    assert(n > 0);
+    return map(n, alignment);
+}
+
 fn resize(
     context: *anyopaque,
     memory: []u8,
@@ -114,7 +117,7 @@ fn resize(
     return realloc(memory, new_len, false) != null;
 }
 
-pub fn remap(
+fn remap(
     context: *anyopaque,
     memory: []u8,
     alignment: mem.Alignment,
@@ -127,21 +130,24 @@ pub fn remap(
     return realloc(memory, new_len, true);
 }
 
-fn free(context: *anyopaque, slice: []u8, alignment: mem.Alignment, return_address: usize) void {
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, return_address: usize) void {
     _ = context;
     _ = alignment;
    _ = return_address;
+    return unmap(@alignCast(memory));
+}
 
+pub fn unmap(memory: []align(page_size_min) u8) void {
     if (native_os == .windows) {
-        windows.VirtualFree(slice.ptr, 0, windows.MEM_RELEASE);
+        windows.VirtualFree(memory.ptr, 0, windows.MEM_RELEASE);
     } else {
-        const buf_aligned_len = mem.alignForward(usize, slice.len, std.heap.pageSize());
-        posix.munmap(@alignCast(slice.ptr[0..buf_aligned_len]));
+        const page_aligned_len = mem.alignForward(usize, memory.len, std.heap.pageSize());
+        posix.munmap(memory.ptr[0..page_aligned_len]);
     }
 }
 
-fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
-    const memory: []align(std.heap.page_size_min) u8 = @alignCast(uncasted_memory);
+pub fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
+    const memory: []align(page_size_min) u8 = @alignCast(uncasted_memory);
     const page_size = std.heap.pageSize();
     const new_size_aligned = mem.alignForward(usize, new_len, page_size);
 
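The hunks above split the page-mapping logic out of the `Allocator` vtable functions so that `map`, `unmap`, and `realloc` can be called directly; `SmpAllocator` below uses them for its slabs and for large allocations. A hedged sketch of direct use, based only on the signatures shown in this diff (the length and alignment values are arbitrary examples):

    const std = @import("std");
    const PageAllocator = std.heap.PageAllocator;

    test "map and unmap one page directly" {
        const len: usize = 4096;
        // `map` returns null on failure, like the vtable `alloc`.
        const ptr = PageAllocator.map(len, .fromByteUnits(std.heap.page_size_min)) orelse
            return error.SkipZigTest;
        defer PageAllocator.unmap(@alignCast(ptr[0..len]));
        ptr[0] = 0xaa;
    }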
diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig
new file mode 100644
index 0000000000..50593687ec
--- /dev/null
+++ b/lib/std/heap/SmpAllocator.zig
@@ -0,0 +1,261 @@
+//! An allocator that is designed for ReleaseFast optimization mode, with
+//! multi-threading enabled.
+//!
+//! This allocator is a singleton; it uses global state and only one should be
+//! instantiated for the entire process.
+//!
+//! ## Basic Design
+//!
+//! Each thread gets a separate freelist; however, the data must be recoverable
+//! when the thread exits. We do not directly learn when a thread exits, so
+//! occasionally, one thread must attempt to reclaim another thread's
+//! resources.
+//!
+//! Above a certain size, those allocations are memory mapped directly, with no
+//! storage of allocation metadata. This works because the implementation
+//! refuses resizes that would move an allocation from small category to large
+//! category or vice versa.
+//!
+//! Each allocator operation checks the thread identifier from a threadlocal
+//! variable to find out which metadata in the global state to access, and
+//! attempts to grab its lock. This will usually succeed without contention,
+//! unless another thread has been assigned the same id. In the case of such
+//! contention, the thread moves on to the next thread metadata slot and
+//! repeats the process of attempting to obtain the lock.
+//!
+//! Limiting the thread-local metadata array to the same number as the CPU
+//! count ensures that as threads are created and destroyed, they cycle
+//! through the full set of freelists.
+
+const builtin = @import("builtin");
+
+const std = @import("../std.zig");
+const assert = std.debug.assert;
+const mem = std.mem;
+const math = std.math;
+const Allocator = std.mem.Allocator;
+const SmpAllocator = @This();
+const PageAllocator = std.heap.PageAllocator;
+
+cpu_count: u32,
+threads: [max_thread_count]Thread,
+
+var global: SmpAllocator = .{
+    .threads = @splat(.{}),
+    .cpu_count = 0,
+};
+threadlocal var thread_index: u32 = 0;
+
+const max_thread_count = 128;
+const slab_len: usize = @max(std.heap.page_size_max, 64 * 1024);
+/// Because of storing free list pointers, the minimum size class is 3.
+const min_class = math.log2(@sizeOf(usize));
+const size_class_count = math.log2(slab_len) - min_class;
+/// When a freelist length exceeds this number, a `free` will rotate up to
+/// `max_free_search` times before pushing.
+const max_freelist_len: u8 = 16;
+const max_free_search = 1;
+/// Before mapping a fresh page, `alloc` will rotate this many times.
+const max_alloc_search = 1;
+
+const Thread = struct {
+    /// Avoid false sharing.
+    _: void align(std.atomic.cache_line) = {},
+
+    /// Protects the state in this struct (per-thread state).
+    ///
+    /// Threads lock this before accessing their own state in order
+    /// to support freelist reclamation.
+    mutex: std.Thread.Mutex = .{},
+
+    /// For each size class, tracks the next address to be returned from
+    /// `alloc` when the freelist is empty.
+    next_addrs: [size_class_count]usize = @splat(0),
+    /// For each size class, points to the freed pointer.
+    frees: [size_class_count]usize = @splat(0),
+    /// For each size class, tracks the number of items in the freelist.
+    freelist_lens: [size_class_count]u8 = @splat(0),
+
+    fn lock() *Thread {
+        var index = thread_index;
+        {
+            const t = &global.threads[index];
+            if (t.mutex.tryLock()) {
+                @branchHint(.likely);
+                return t;
+            }
+        }
+        const cpu_count = getCpuCount();
+        assert(cpu_count != 0);
+        while (true) {
+            index = (index + 1) % cpu_count;
+            const t = &global.threads[index];
+            if (t.mutex.tryLock()) {
+                thread_index = index;
+                return t;
+            }
+        }
+    }
+
+    fn unlock(t: *Thread) void {
+        t.mutex.unlock();
+    }
+};
+
+fn getCpuCount() u32 {
+    const cpu_count = @atomicLoad(u32, &global.cpu_count, .unordered);
+    if (cpu_count != 0) return cpu_count;
+    const n: u32 = @min(std.Thread.getCpuCount() catch max_thread_count, max_thread_count);
+    return if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| other else n;
+}
+
+pub const vtable: Allocator.VTable = .{
+    .alloc = alloc,
+    .resize = resize,
+    .remap = remap,
+    .free = free,
+};
+
+comptime {
+    assert(!builtin.single_threaded); // you're holding it wrong
+}
+
+fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(len, alignment);
+    if (class >= size_class_count) {
+        @branchHint(.unlikely);
+        return PageAllocator.map(len, alignment);
+    }
+
+    const slot_size = slotSize(class);
+    assert(slab_len % slot_size == 0);
+    var search_count: u8 = 0;
+
+    var t = Thread.lock();
+
+    outer: while (true) {
+        const top_free_ptr = t.frees[class];
+        if (top_free_ptr != 0) {
+            @branchHint(.likely);
+            defer t.unlock();
+            const node: *usize = @ptrFromInt(top_free_ptr);
+            t.frees[class] = node.*;
+            t.freelist_lens[class] -|= 1;
+            return @ptrFromInt(top_free_ptr);
+        }
+
+        const next_addr = t.next_addrs[class];
+        if ((next_addr % slab_len) != 0) {
+            @branchHint(.likely);
+            defer t.unlock();
+            t.next_addrs[class] = next_addr + slot_size;
+            return @ptrFromInt(next_addr);
+        }
+
+        if (search_count >= max_alloc_search) {
+            @branchHint(.likely);
+            defer t.unlock();
+            // slab alignment here ensures the % slab len earlier catches the end of slots.
+            const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null;
+            t.next_addrs[class] = @intFromPtr(slab) + slot_size;
+            t.freelist_lens[class] = 0;
+            return slab;
+        }
+
+        t.unlock();
+        const cpu_count = getCpuCount();
+        assert(cpu_count != 0);
+        var index = thread_index;
+        while (true) {
+            index = (index + 1) % cpu_count;
+            t = &global.threads[index];
+            if (t.mutex.tryLock()) {
+                thread_index = index;
+                search_count += 1;
+                continue :outer;
+            }
+        }
+    }
+}
+
+fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    const new_class = sizeClassIndex(new_len, alignment);
+    if (class >= size_class_count) {
+        if (new_class < size_class_count) return false;
+        return PageAllocator.realloc(memory, new_len, false) != null;
+    }
+    return new_class == class;
+}
+
+fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    const new_class = sizeClassIndex(new_len, alignment);
+    if (class >= size_class_count) {
+        if (new_class < size_class_count) return null;
+        return PageAllocator.realloc(memory, new_len, true);
+    }
+    return if (new_class == class) memory.ptr else null;
+}
+
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void {
+    _ = context;
+    _ = ra;
+    const class = sizeClassIndex(memory.len, alignment);
+    if (class >= size_class_count) {
+        @branchHint(.unlikely);
+        return PageAllocator.unmap(@alignCast(memory));
+    }
+
+    const node: *usize = @alignCast(@ptrCast(memory.ptr));
+    var search_count: u8 = 0;
+
+    var t = Thread.lock();
+
+    outer: while (true) {
+        const freelist_len = t.freelist_lens[class];
+        if (freelist_len < max_freelist_len) {
+            @branchHint(.likely);
+            defer t.unlock();
+            node.* = t.frees[class];
+            t.frees[class] = @intFromPtr(node);
+            return;
+        }
+
+        if (search_count >= max_free_search) {
+            defer t.unlock();
+            t.freelist_lens[class] = freelist_len +| 1;
+            node.* = t.frees[class];
+            t.frees[class] = @intFromPtr(node);
+            return;
+        }
+
+        t.unlock();
+        const cpu_count = getCpuCount();
+        assert(cpu_count != 0);
+        var index = thread_index;
+        while (true) {
+            index = (index + 1) % cpu_count;
+            t = &global.threads[index];
+            if (t.mutex.tryLock()) {
+                thread_index = index;
+                search_count += 1;
+                continue :outer;
+            }
+        }
+    }
+}
+
+fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize {
+    return @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment), min_class) - min_class;
+}
+
+fn slotSize(class: usize) usize {
+    return @as(usize, 1) << @intCast(class + min_class);
+}
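To make the size-class arithmetic above concrete: on a 64-bit target `min_class` is 3 (8-byte slots are the smallest, so a freelist pointer always fits in a freed slot), and a 100-byte request with 8-byte alignment lands in class ceil(log2(100)) - 3 = 4, which is served from 128-byte slots. The test below is illustrative only; it mirrors the private `sizeClassIndex`/`slotSize` formulas rather than calling them:

    const std = @import("std");

    test "size-class arithmetic mirrored from SmpAllocator" {
        const len: usize = 100;
        const min_class = std.math.log2(@sizeOf(usize)); // 3 on 64-bit targets
        const alignment_class: usize = 3; // log2(8); what @intFromEnum(alignment) yields for 8-byte alignment
        const class: usize = @max(@bitSizeOf(usize) - @clz(len - 1), alignment_class, min_class) - min_class;
        const slot_size: usize = @as(usize, 1) << @intCast(class + min_class);
        try std.testing.expectEqual(@as(usize, 4), class);
        try std.testing.expectEqual(@as(usize, 128), slot_size);
    }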
diff --git a/lib/std/heap/WasmAllocator.zig b/lib/std/heap/WasmAllocator.zig
index e30ac5ab01..0a9003f245 100644
--- a/lib/std/heap/WasmAllocator.zig
+++ b/lib/std/heap/WasmAllocator.zig
@@ -1,5 +1,3 @@
-//! This is intended to be merged into GeneralPurposeAllocator at some point.
-
 const std = @import("../std.zig");
 const builtin = @import("builtin");
 const Allocator = std.mem.Allocator;
diff --git a/lib/std/heap/debug_allocator.zig b/lib/std/heap/debug_allocator.zig
index 8abf6133bf..296014aa3f 100644
--- a/lib/std/heap/debug_allocator.zig
+++ b/lib/std/heap/debug_allocator.zig
@@ -851,8 +851,6 @@ pub fn DebugAllocator(comptime config: Config) type {
             self.mutex.lock();
             defer self.mutex.unlock();
 
-            assert(old_memory.len != 0);
-
             const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment));
             if (size_class_index >= self.buckets.len) {
                 @branchHint(.unlikely);
diff --git a/src/main.zig b/src/main.zig
index a00261cc30..401f6b2296 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -171,30 +171,31 @@ pub fn log(
     std.debug.print(prefix1 ++ prefix2 ++ format ++ "\n", args);
 }
 
-var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{
+var debug_allocator: std.heap.DebugAllocator(.{
     .stack_trace_frames = build_options.mem_leak_frames,
-}){};
+}) = .init;
 
 pub fn main() anyerror!void {
     crash_report.initialize();
 
-    const use_gpa = (build_options.force_gpa or !builtin.link_libc) and native_os != .wasi;
-    const gpa = gpa: {
-        if (native_os == .wasi) {
-            break :gpa std.heap.wasm_allocator;
+    const gpa, const is_debug = gpa: {
+        if (build_options.debug_gpa) break :gpa .{ debug_allocator.allocator(), true };
+        if (native_os == .wasi) break :gpa .{ std.heap.wasm_allocator, false };
+        if (builtin.link_libc) {
+            // We would prefer to use raw libc allocator here, but cannot use
+            // it if it won't support the alignment we need.
+            if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) {
+                break :gpa .{ std.heap.c_allocator, false };
+            }
+            break :gpa .{ std.heap.raw_c_allocator, false };
         }
-        if (use_gpa) {
-            break :gpa general_purpose_allocator.allocator();
-        }
-        // We would prefer to use raw libc allocator here, but cannot
-        // use it if it won't support the alignment we need.
-        if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) {
-            break :gpa std.heap.c_allocator;
-        }
-        break :gpa std.heap.raw_c_allocator;
+        break :gpa switch (builtin.mode) {
+            .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
+            .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false },
+        };
     };
-    defer if (use_gpa) {
-        _ = general_purpose_allocator.deinit();
+    defer if (is_debug) {
+        _ = debug_allocator.deinit();
     };
     var arena_instance = std.heap.ArenaAllocator.init(gpa);
     defer arena_instance.deinit();
diff --git a/stage1/config.zig.in b/stage1/config.zig.in
index 47d4b4e85f..d5c9a7ebbf 100644
--- a/stage1/config.zig.in
+++ b/stage1/config.zig.in
@@ -11,6 +11,6 @@ pub const enable_link_snapshots = false;
 pub const enable_tracy = false;
 pub const value_tracing = false;
 pub const skip_non_native = false;
-pub const force_gpa = false;
+pub const debug_gpa = false;
 pub const dev = .core;
 pub const value_interpret_mode = .direct;
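The `src/main.zig` change above picks the allocator from the build configuration: `DebugAllocator` (with leak checking in `deinit`) when `-Ddebug-allocator` is set, or in Debug/ReleaseSafe builds without libc on non-WASI targets; the libc allocators when linking libc; and the new `smp_allocator` in ReleaseFast/ReleaseSmall. Applications can follow the same mode-based pattern; a condensed, illustrative sketch, not the compiler's code and without the wasi/libc special cases shown above:

    const std = @import("std");
    const builtin = @import("builtin");

    var debug_allocator: std.heap.DebugAllocator(.{}) = .init;

    pub fn main() !void {
        const gpa, const is_debug = switch (builtin.mode) {
            .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true },
            .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false },
        };
        defer if (is_debug) {
            _ = debug_allocator.deinit(); // reports any leaks when the program exits
        };

        const data = try gpa.alloc(u8, 256);
        defer gpa.free(data);
    }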