Mirror of https://codeberg.org/ziglang/zig.git
add std.heap.SmpAllocator
An allocator intended to be used in -OReleaseFast mode when multi-threading is enabled.
parent 6a6e72fff8
commit 51c4ffa410

5 changed files with 317 additions and 20 deletions
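Usage sketch (illustration only, not part of the commit): the new allocator is exposed as a global singleton, so a multi-threaded ReleaseFast program can use it directly as a std.mem.Allocator:

    const std = @import("std");

    pub fn main() !void {
        // Singleton added by this commit; it has no per-instance state,
        // so there is no init/deinit step.
        const gpa = std.heap.smp_allocator;

        const buf = try gpa.alloc(u8, 4096);
        defer gpa.free(buf);
        @memset(buf, 0);
    }
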
lib/std/heap.zig

@@ -9,11 +9,12 @@ const Allocator = std.mem.Allocator;
 const windows = std.os.windows;
 
 pub const ArenaAllocator = @import("heap/arena_allocator.zig").ArenaAllocator;
-pub const WasmAllocator = @import("heap/WasmAllocator.zig");
-pub const PageAllocator = @import("heap/PageAllocator.zig");
-pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
-pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
+pub const SmpAllocator = @import("heap/SmpAllocator.zig");
 pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig");
+pub const PageAllocator = @import("heap/PageAllocator.zig");
+pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator;
+pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig");
+pub const WasmAllocator = @import("heap/WasmAllocator.zig");
 
 pub const DebugAllocatorConfig = @import("heap/debug_allocator.zig").Config;
 pub const DebugAllocator = @import("heap/debug_allocator.zig").DebugAllocator;
@@ -358,6 +359,11 @@ else if (builtin.target.isWasm()) .{
     .vtable = &PageAllocator.vtable,
 };
 
+pub const smp_allocator: Allocator = .{
+    .ptr = undefined,
+    .vtable = &SmpAllocator.vtable,
+};
+
 /// This allocator is fast, small, and specific to WebAssembly. In the future,
 /// this will be the implementation automatically selected by
 /// `GeneralPurposeAllocator` when compiling in `ReleaseSmall` mode for wasm32
@@ -978,4 +984,5 @@ test {
     if (builtin.target.isWasm()) {
         _ = WasmAllocator;
     }
+    if (!builtin.single_threaded) _ = smp_allocator;
 }
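Aside (illustration only): the per-thread freelist design described in the new SmpAllocator.zig below can be exercised by several threads allocating and freeing small objects concurrently; each thread settles into its own metadata slot, so the fast path normally avoids lock contention:

    const std = @import("std");

    fn worker(gpa: std.mem.Allocator) void {
        for (0..10_000) |_| {
            const p = gpa.create(u64) catch return;
            p.* = 42;
            gpa.destroy(p);
        }
    }

    test "concurrent small allocations" {
        // Requires a non-single-threaded build; SmpAllocator comptime-asserts this.
        const gpa = std.heap.smp_allocator;
        var threads: [4]std.Thread = undefined;
        for (&threads) |*t| t.* = try std.Thread.spawn(.{}, worker, .{gpa});
        for (threads) |t| t.join();
    }
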
lib/std/heap/PageAllocator.zig

@@ -16,11 +16,7 @@ pub const vtable: Allocator.VTable = .{
     .free = free,
 };
 
-fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
-    _ = context;
-    _ = ra;
-    assert(n > 0);
-
+pub fn map(n: usize, alignment: mem.Alignment) ?[*]u8 {
     const page_size = std.heap.pageSize();
     if (n >= maxInt(usize) - page_size) return null;
     const alignment_bytes = alignment.toByteUnits();
@@ -101,6 +97,13 @@ fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
     return result_ptr;
 }
 
+fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
+    _ = context;
+    _ = ra;
+    assert(n > 0);
+    return map(n, alignment);
+}
+
 fn resize(
     context: *anyopaque,
     memory: []u8,
@@ -114,7 +117,7 @@ fn resize(
     return realloc(memory, new_len, false) != null;
 }
 
-pub fn remap(
+fn remap(
     context: *anyopaque,
     memory: []u8,
     alignment: mem.Alignment,
@@ -127,21 +130,24 @@ pub fn remap(
     return realloc(memory, new_len, true);
 }
 
-fn free(context: *anyopaque, slice: []u8, alignment: mem.Alignment, return_address: usize) void {
+fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, return_address: usize) void {
     _ = context;
     _ = alignment;
     _ = return_address;
+    return unmap(@alignCast(memory));
+}
+
+pub fn unmap(memory: []align(page_size_min) u8) void {
     if (native_os == .windows) {
-        windows.VirtualFree(slice.ptr, 0, windows.MEM_RELEASE);
+        windows.VirtualFree(memory.ptr, 0, windows.MEM_RELEASE);
     } else {
-        const buf_aligned_len = mem.alignForward(usize, slice.len, std.heap.pageSize());
-        posix.munmap(@alignCast(slice.ptr[0..buf_aligned_len]));
+        const page_aligned_len = mem.alignForward(usize, memory.len, std.heap.pageSize());
+        posix.munmap(memory.ptr[0..page_aligned_len]);
     }
 }
 
-fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
-    const memory: []align(std.heap.page_size_min) u8 = @alignCast(uncasted_memory);
+pub fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 {
+    const memory: []align(page_size_min) u8 = @alignCast(uncasted_memory);
     const page_size = std.heap.pageSize();
     const new_size_aligned = mem.alignForward(usize, new_len, page_size);
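Aside (illustration only): with map, unmap, and realloc now public, callers can memory-map whole regions through PageAllocator without going through the Allocator interface, which is how the new SmpAllocator below obtains its slabs and serves large allocations. A minimal sketch:

    const std = @import("std");
    const PageAllocator = std.heap.PageAllocator;

    test "direct page mapping" {
        // 1024 pages; pageSize() is runtime-known, so the slice below is a
        // regular runtime slice.
        const len = 1024 * std.heap.pageSize();
        const ptr = PageAllocator.map(len, .fromByteUnits(std.heap.pageSize())) orelse
            return error.SkipZigTest;
        defer PageAllocator.unmap(@alignCast(ptr[0..len]));
        ptr[0] = 0xaa; // anonymous mappings start zero-filled and are writable
    }
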
lib/std/heap/SmpAllocator.zig (new file, 288 lines)

@@ -0,0 +1,288 @@
//! An allocator that is designed for ReleaseFast optimization mode, with
//! multi-threading enabled.
//!
//! This allocator is a singleton; it uses global state and only one should be
//! instantiated for the entire process.
//!
//! ## Basic Design
//!
//! Avoid locking the global mutex as much as possible.
//!
//! Each thread gets a separate freelist; however, the data must be recoverable
//! when the thread exits. We do not directly learn when a thread exits, so
//! occasionally, one thread must attempt to reclaim another thread's
//! resources.
//!
//! Above a certain size, those allocations are memory mapped directly, with no
//! storage of allocation metadata. This works because the implementation
//! refuses resizes that would move an allocation from small category to large
//! category or vice versa.
//!
//! Each allocator operation checks the thread identifier from a threadlocal
//! variable to find out which metadata in the global state to access, and
//! attempts to grab its lock. This will usually succeed without contention,
//! unless another thread has been assigned the same id. In the case of such
//! contention, the thread moves on to the next thread metadata slot and
//! repeats the process of attempting to obtain the lock.
//!
//! Limiting the thread-local metadata array to the same number as the CPU
//! count ensures that as threads are created and destroyed, they cycle
//! through the full set of freelists.
const builtin = @import("builtin");
const native_os = builtin.os.tag;

const std = @import("../std.zig");
const assert = std.debug.assert;
const mem = std.mem;
const math = std.math;
const Allocator = std.mem.Allocator;
const SmpAllocator = @This();
const PageAllocator = std.heap.PageAllocator;

/// Protects the state in this struct (global state), except for `threads`
/// which each have their own mutex.
mutex: std.Thread.Mutex,
next_thread_index: u32,
cpu_count: u32,
threads: [max_thread_count]Thread,

var global: SmpAllocator = .{
    .mutex = .{},
    .next_thread_index = 0,
    .threads = @splat(.{}),
    .cpu_count = 0,
};
threadlocal var thread_id: Thread.Id = .none;

const max_thread_count = 128;
const slab_len: usize = @max(std.heap.page_size_max, switch (builtin.os.tag) {
    .windows => 64 * 1024, // Makes `std.heap.PageAllocator` take the happy path.
    .wasi => 64 * 1024, // Max alignment supported by `std.heap.WasmAllocator`.
    else => 256 * 1024, // Avoids too many active mappings when `page_size_max` is low.
});
/// Because of storing free list pointers, the minimum size class is 3.
const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize)));
const size_class_count = math.log2(slab_len) - min_class;
const Thread = struct {
    /// Avoid false sharing.
    _: void align(std.atomic.cache_line) = {},

    /// Protects the state in this struct (per-thread state).
    ///
    /// Threads lock this before accessing their own state in order
    /// to support freelist reclamation.
    mutex: std.Thread.Mutex = .{},

    next_addrs: [size_class_count]usize = @splat(0),
    /// For each size class, points to the freed pointer.
    frees: [size_class_count]usize = @splat(0),

    /// Index into `SmpAllocator.threads`.
    const Id = enum(usize) {
        none = 0,
        first = 1,
        _,

        fn fromIndex(index: usize) Id {
            return @enumFromInt(index + 1);
        }

        fn toIndex(id: Id) usize {
            return @intFromEnum(id) - 1;
        }
    };

    fn lock() *Thread {
        const id = thread_id;
        if (id != .none) {
            var index = id.toIndex();
            {
                const t = &global.threads[index];
                if (t.mutex.tryLock()) return t;
            }
            const cpu_count = global.cpu_count;
            assert(cpu_count != 0);
            while (true) {
                index = (index + 1) % cpu_count;
                const t = &global.threads[index];
                if (t.mutex.tryLock()) {
                    thread_id = .fromIndex(index);
                    return t;
                }
            }
        }
        while (true) {
            const thread_index = i: {
                global.mutex.lock();
                defer global.mutex.unlock();
                const cpu_count = c: {
                    const cpu_count = global.cpu_count;
                    if (cpu_count == 0) {
                        const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count));
                        global.cpu_count = n;
                        break :c n;
                    }
                    break :c cpu_count;
                };
                const thread_index = global.next_thread_index;
                global.next_thread_index = @intCast((thread_index + 1) % cpu_count);
                break :i thread_index;
            };
            const t = &global.threads[thread_index];
            if (t.mutex.tryLock()) {
                thread_id = .fromIndex(thread_index);
                return t;
            }
        }
    }

    fn unlock(t: *Thread) void {
        t.mutex.unlock();
    }
};
pub const vtable: Allocator.VTable = .{
    .alloc = alloc,
    .resize = resize,
    .remap = remap,
    .free = free,
};

comptime {
    assert(!builtin.single_threaded); // you're holding it wrong
}

fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(len, alignment);
    if (class >= size_class_count) {
        @branchHint(.unlikely);
        return PageAllocator.map(len, alignment);
    }

    const t = Thread.lock();
    defer t.unlock();

    const slot_size = slotSize(class);

    const top_free_ptr = t.frees[class];
    if (top_free_ptr != 0) {
        const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize)));
        t.frees[class] = node.*;
        return @ptrFromInt(top_free_ptr);
    }

    const next_addr = t.next_addrs[class];
    if (next_addr % slab_len == 0) {
        const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null;
        t.next_addrs[class] = @intFromPtr(slab) + slot_size;
        return slab;
    }

    t.next_addrs[class] = next_addr + slot_size;
    return @ptrFromInt(next_addr);
}

fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(memory.len, alignment);
    const new_class = sizeClassIndex(new_len, alignment);
    if (class >= size_class_count) {
        if (new_class < size_class_count) return false;
        return PageAllocator.realloc(memory, new_len, false) != null;
    }
    return new_class == class;
}

fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(memory.len, alignment);
    const new_class = sizeClassIndex(new_len, alignment);
    if (class >= size_class_count) {
        if (new_class < size_class_count) return null;
        return PageAllocator.realloc(memory, new_len, true);
    }
    return if (new_class == class) memory.ptr else null;
}

fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void {
    _ = context;
    _ = ra;
    const class = sizeClassIndex(memory.len, alignment);
    if (class >= size_class_count) {
        @branchHint(.unlikely);
        return PageAllocator.unmap(@alignCast(memory));
    }

    const t = Thread.lock();
    defer t.unlock();

    const slot_size = slotSize(class);
    const addr = @intFromPtr(memory.ptr);
    const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize)));
    node.* = t.frees[class];
    t.frees[class] = addr;
}
fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize {
    return @max(
        @bitSizeOf(usize) - @clz(len - 1),
        @intFromEnum(alignment),
        min_class,
    );
}

fn slotSize(class: usize) usize {
    const Log2USize = std.math.Log2Int(usize);
    return @as(usize, 1) << @as(Log2USize, @intCast(class));
}

test "large alloc, resize, remap, free" {
    const gpa = std.heap.smp_allocator;

    const ptr1 = try gpa.alloc(u64, 42768);
    const ptr2 = try gpa.alloc(u64, 52768);
    gpa.free(ptr1);
    const ptr3 = try gpa.alloc(u64, 62768);
    gpa.free(ptr3);
    gpa.free(ptr2);
}

test "small allocations - free in same order" {
    const gpa = std.heap.smp_allocator;

    var list = std.ArrayList(*u64).init(std.testing.allocator);
    defer list.deinit();

    var i: usize = 0;
    while (i < 513) : (i += 1) {
        const ptr = try gpa.create(u64);
        try list.append(ptr);
    }

    for (list.items) |ptr| {
        gpa.destroy(ptr);
    }
}

test "small allocations - free in reverse order" {
    const gpa = std.heap.smp_allocator;

    var list = std.ArrayList(*u64).init(std.testing.allocator);
    defer list.deinit();

    var i: usize = 0;
    while (i < 513) : (i += 1) {
        const ptr = try gpa.create(u64);
        try list.append(ptr);
    }

    while (list.popOrNull()) |ptr| {
        gpa.destroy(ptr);
    }
}
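Aside (illustration only, assuming a 64-bit target): the size-class arithmetic above maps a request to the next power of two, no smaller than min_class and the requested alignment class; the freelist link for a freed slot lives in the slot's last @sizeOf(usize) bytes. A worked example for a 100-byte request:

    const std = @import("std");

    test "size class arithmetic" {
        const len: usize = 100;
        // ceil(log2(100)) = 7, so the request occupies a 128-byte slot.
        const class: usize = @bitSizeOf(usize) - @clz(len - 1);
        const slot_size = @as(usize, 1) << @intCast(class);
        try std.testing.expectEqual(@as(usize, 7), class);
        try std.testing.expectEqual(@as(usize, 128), slot_size);
    }
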
lib/std/heap/debug_allocator.zig

@@ -1,5 +1,3 @@
-//! This is intended to be merged into GeneralPurposeAllocator at some point.
-
 const std = @import("../std.zig");
 const builtin = @import("builtin");
 const Allocator = std.mem.Allocator;
@@ -851,8 +851,6 @@ pub fn DebugAllocator(comptime config: Config) type {
             self.mutex.lock();
             defer self.mutex.unlock();
 
-            assert(old_memory.len != 0);
-
             const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment));
             if (size_class_index >= self.buckets.len) {
                 @branchHint(.unlikely);