From 51c4ffa410d2cf51b7b45dab4dfd033db2190b7e Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 03:31:32 -0800 Subject: [PATCH 01/13] add std.heap.SmpAllocator An allocator intended to be used in -OReleaseFast mode when multi-threading is enabled. --- lib/std/heap.zig | 15 +- lib/std/heap/PageAllocator.zig | 30 ++-- lib/std/heap/SmpAllocator.zig | 288 +++++++++++++++++++++++++++++++ lib/std/heap/WasmAllocator.zig | 2 - lib/std/heap/debug_allocator.zig | 2 - 5 files changed, 317 insertions(+), 20 deletions(-) create mode 100644 lib/std/heap/SmpAllocator.zig diff --git a/lib/std/heap.zig b/lib/std/heap.zig index d1874c5b00..10e4cec608 100644 --- a/lib/std/heap.zig +++ b/lib/std/heap.zig @@ -9,11 +9,12 @@ const Allocator = std.mem.Allocator; const windows = std.os.windows; pub const ArenaAllocator = @import("heap/arena_allocator.zig").ArenaAllocator; -pub const WasmAllocator = @import("heap/WasmAllocator.zig"); -pub const PageAllocator = @import("heap/PageAllocator.zig"); -pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig"); -pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator; +pub const SmpAllocator = @import("heap/SmpAllocator.zig"); pub const FixedBufferAllocator = @import("heap/FixedBufferAllocator.zig"); +pub const PageAllocator = @import("heap/PageAllocator.zig"); +pub const SbrkAllocator = @import("heap/sbrk_allocator.zig").SbrkAllocator; +pub const ThreadSafeAllocator = @import("heap/ThreadSafeAllocator.zig"); +pub const WasmAllocator = @import("heap/WasmAllocator.zig"); pub const DebugAllocatorConfig = @import("heap/debug_allocator.zig").Config; pub const DebugAllocator = @import("heap/debug_allocator.zig").DebugAllocator; @@ -358,6 +359,11 @@ else if (builtin.target.isWasm()) .{ .vtable = &PageAllocator.vtable, }; +pub const smp_allocator: Allocator = .{ + .ptr = undefined, + .vtable = &SmpAllocator.vtable, +}; + /// This allocator is fast, small, and specific to WebAssembly. 
In the future, /// this will be the implementation automatically selected by /// `GeneralPurposeAllocator` when compiling in `ReleaseSmall` mode for wasm32 @@ -978,4 +984,5 @@ test { if (builtin.target.isWasm()) { _ = WasmAllocator; } + if (!builtin.single_threaded) _ = smp_allocator; } diff --git a/lib/std/heap/PageAllocator.zig b/lib/std/heap/PageAllocator.zig index 433e0f1218..a1eae13efc 100644 --- a/lib/std/heap/PageAllocator.zig +++ b/lib/std/heap/PageAllocator.zig @@ -16,11 +16,7 @@ pub const vtable: Allocator.VTable = .{ .free = free, }; -fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 { - _ = context; - _ = ra; - assert(n > 0); - +pub fn map(n: usize, alignment: mem.Alignment) ?[*]u8 { const page_size = std.heap.pageSize(); if (n >= maxInt(usize) - page_size) return null; const alignment_bytes = alignment.toByteUnits(); @@ -101,6 +97,13 @@ fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[* return result_ptr; } +fn alloc(context: *anyopaque, n: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 { + _ = context; + _ = ra; + assert(n > 0); + return map(n, alignment); +} + fn resize( context: *anyopaque, memory: []u8, @@ -114,7 +117,7 @@ fn resize( return realloc(memory, new_len, false) != null; } -pub fn remap( +fn remap( context: *anyopaque, memory: []u8, alignment: mem.Alignment, @@ -127,21 +130,24 @@ pub fn remap( return realloc(memory, new_len, true); } -fn free(context: *anyopaque, slice: []u8, alignment: mem.Alignment, return_address: usize) void { +fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, return_address: usize) void { _ = context; _ = alignment; _ = return_address; + return unmap(@alignCast(memory)); +} +pub fn unmap(memory: []align(page_size_min) u8) void { if (native_os == .windows) { - windows.VirtualFree(slice.ptr, 0, windows.MEM_RELEASE); + windows.VirtualFree(memory.ptr, 0, windows.MEM_RELEASE); } else { - const buf_aligned_len = mem.alignForward(usize, slice.len, std.heap.pageSize()); - posix.munmap(@alignCast(slice.ptr[0..buf_aligned_len])); + const page_aligned_len = mem.alignForward(usize, memory.len, std.heap.pageSize()); + posix.munmap(memory.ptr[0..page_aligned_len]); } } -fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 { - const memory: []align(std.heap.page_size_min) u8 = @alignCast(uncasted_memory); +pub fn realloc(uncasted_memory: []u8, new_len: usize, may_move: bool) ?[*]u8 { + const memory: []align(page_size_min) u8 = @alignCast(uncasted_memory); const page_size = std.heap.pageSize(); const new_size_aligned = mem.alignForward(usize, new_len, page_size); diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig new file mode 100644 index 0000000000..6fd68c19b2 --- /dev/null +++ b/lib/std/heap/SmpAllocator.zig @@ -0,0 +1,288 @@ +//! An allocator that is designed for ReleaseFast optimization mode, with +//! multi-threading enabled. +//! +//! This allocator is a singleton; it uses global state and only one should be +//! instantiated for the entire process. +//! +//! ## Basic Design +//! +//! Avoid locking the global mutex as much as possible. +//! +//! Each thread gets a separate freelist, however, the data must be recoverable +//! when the thread exits. We do not directly learn when a thread exits, so +//! occasionally, one thread must attempt to reclaim another thread's +//! resources. +//! +//! Above a certain size, those allocations are memory mapped directly, with no +//! storage of allocation metadata. 
This works because the implementation +//! refuses resizes that would move an allocation from small category to large +//! category or vice versa. +//! +//! Each allocator operation checks the thread identifier from a threadlocal +//! variable to find out which metadata in the global state to access, and +//! attempts to grab its lock. This will usually succeed without contention, +//! unless another thread has been assigned the same id. In the case of such +//! contention, the thread moves on to the next thread metadata slot and +//! repeats the process of attempting to obtain the lock. +//! +//! By limiting the thread-local metadata array to the same number as the CPU +//! count, ensures that as threads are created and destroyed, they cycle +//! through the full set of freelists. + +const builtin = @import("builtin"); +const native_os = builtin.os.tag; + +const std = @import("../std.zig"); +const assert = std.debug.assert; +const mem = std.mem; +const math = std.math; +const Allocator = std.mem.Allocator; +const SmpAllocator = @This(); +const PageAllocator = std.heap.PageAllocator; + +/// Protects the state in this struct (global state), except for `threads` +/// which each have their own mutex. +mutex: std.Thread.Mutex, +next_thread_index: u32, +cpu_count: u32, +threads: [max_thread_count]Thread, + +var global: SmpAllocator = .{ + .mutex = .{}, + .next_thread_index = 0, + .threads = @splat(.{}), + .cpu_count = 0, +}; +threadlocal var thread_id: Thread.Id = .none; + +const max_thread_count = 128; +const slab_len: usize = @max(std.heap.page_size_max, switch (builtin.os.tag) { + .windows => 64 * 1024, // Makes `std.heap.PageAllocator` take the happy path. + .wasi => 64 * 1024, // Max alignment supported by `std.heap.WasmAllocator`. + else => 256 * 1024, // Avoids too many active mappings when `page_size_max` is low. +}); +/// Because of storing free list pointers, the minimum size class is 3. +const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize))); +const size_class_count = math.log2(slab_len) - min_class; + +const Thread = struct { + /// Avoid false sharing. + _: void align(std.atomic.cache_line) = {}, + + /// Protects the state in this struct (per-thread state). + /// + /// Threads lock this before accessing their own state in order + /// to support freelist reclamation. + mutex: std.Thread.Mutex = .{}, + + next_addrs: [size_class_count]usize = @splat(0), + /// For each size class, points to the freed pointer. + frees: [size_class_count]usize = @splat(0), + + /// Index into `SmpAllocator.threads`. 
+ const Id = enum(usize) { + none = 0, + first = 1, + _, + + fn fromIndex(index: usize) Id { + return @enumFromInt(index + 1); + } + + fn toIndex(id: Id) usize { + return @intFromEnum(id) - 1; + } + }; + + fn lock() *Thread { + const id = thread_id; + if (id != .none) { + var index = id.toIndex(); + { + const t = &global.threads[index]; + if (t.mutex.tryLock()) return t; + } + const cpu_count = global.cpu_count; + assert(cpu_count != 0); + while (true) { + index = (index + 1) % cpu_count; + const t = &global.threads[index]; + if (t.mutex.tryLock()) { + thread_id = .fromIndex(index); + return t; + } + } + } + while (true) { + const thread_index = i: { + global.mutex.lock(); + defer global.mutex.unlock(); + const cpu_count = c: { + const cpu_count = global.cpu_count; + if (cpu_count == 0) { + const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); + global.cpu_count = n; + break :c n; + } + break :c cpu_count; + }; + const thread_index = global.next_thread_index; + global.next_thread_index = @intCast((thread_index + 1) % cpu_count); + break :i thread_index; + }; + const t = &global.threads[thread_index]; + if (t.mutex.tryLock()) { + thread_id = .fromIndex(thread_index); + return t; + } + } + } + + fn unlock(t: *Thread) void { + t.mutex.unlock(); + } +}; + +pub const vtable: Allocator.VTable = .{ + .alloc = alloc, + .resize = resize, + .remap = remap, + .free = free, +}; + +comptime { + assert(!builtin.single_threaded); // you're holding it wrong +} + +fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ?[*]u8 { + _ = context; + _ = ra; + const class = sizeClassIndex(len, alignment); + if (class >= size_class_count) { + @branchHint(.unlikely); + return PageAllocator.map(len, alignment); + } + + const t = Thread.lock(); + defer t.unlock(); + + const slot_size = slotSize(class); + + const top_free_ptr = t.frees[class]; + if (top_free_ptr != 0) { + const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); + t.frees[class] = node.*; + return @ptrFromInt(top_free_ptr); + } + + const next_addr = t.next_addrs[class]; + if (next_addr % slab_len == 0) { + const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; + t.next_addrs[class] = @intFromPtr(slab) + slot_size; + return slab; + } + + t.next_addrs[class] = next_addr + slot_size; + return @ptrFromInt(next_addr); +} + +fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) bool { + _ = context; + _ = ra; + const class = sizeClassIndex(memory.len, alignment); + const new_class = sizeClassIndex(new_len, alignment); + if (class >= size_class_count) { + if (new_class < size_class_count) return false; + return PageAllocator.realloc(memory, new_len, false) != null; + } + return new_class == class; +} + +fn remap(context: *anyopaque, memory: []u8, alignment: mem.Alignment, new_len: usize, ra: usize) ?[*]u8 { + _ = context; + _ = ra; + const class = sizeClassIndex(memory.len, alignment); + const new_class = sizeClassIndex(new_len, alignment); + if (class >= size_class_count) { + if (new_class < size_class_count) return null; + return PageAllocator.realloc(memory, new_len, true); + } + return if (new_class == class) memory.ptr else null; +} + +fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) void { + _ = context; + _ = ra; + const class = sizeClassIndex(memory.len, alignment); + if (class >= size_class_count) { + @branchHint(.unlikely); + return 
PageAllocator.unmap(@alignCast(memory)); + } + + const t = Thread.lock(); + defer t.unlock(); + + const slot_size = slotSize(class); + const addr = @intFromPtr(memory.ptr); + const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize))); + node.* = t.frees[class]; + t.frees[class] = addr; +} + +fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { + return @max( + @bitSizeOf(usize) - @clz(len - 1), + @intFromEnum(alignment), + min_class, + ); +} + +fn slotSize(class: usize) usize { + const Log2USize = std.math.Log2Int(usize); + return @as(usize, 1) << @as(Log2USize, @intCast(class)); +} + +test "large alloc, resize, remap, free" { + const gpa = std.heap.smp_allocator; + + const ptr1 = try gpa.alloc(u64, 42768); + const ptr2 = try gpa.alloc(u64, 52768); + gpa.free(ptr1); + const ptr3 = try gpa.alloc(u64, 62768); + gpa.free(ptr3); + gpa.free(ptr2); +} + +test "small allocations - free in same order" { + const gpa = std.heap.smp_allocator; + + var list = std.ArrayList(*u64).init(std.testing.allocator); + defer list.deinit(); + + var i: usize = 0; + while (i < 513) : (i += 1) { + const ptr = try gpa.create(u64); + try list.append(ptr); + } + + for (list.items) |ptr| { + gpa.destroy(ptr); + } +} + +test "small allocations - free in reverse order" { + const gpa = std.heap.smp_allocator; + + var list = std.ArrayList(*u64).init(std.testing.allocator); + defer list.deinit(); + + var i: usize = 0; + while (i < 513) : (i += 1) { + const ptr = try gpa.create(u64); + try list.append(ptr); + } + + while (list.popOrNull()) |ptr| { + gpa.destroy(ptr); + } +} diff --git a/lib/std/heap/WasmAllocator.zig b/lib/std/heap/WasmAllocator.zig index e30ac5ab01..0a9003f245 100644 --- a/lib/std/heap/WasmAllocator.zig +++ b/lib/std/heap/WasmAllocator.zig @@ -1,5 +1,3 @@ -//! This is intended to be merged into GeneralPurposeAllocator at some point. 
- const std = @import("../std.zig"); const builtin = @import("builtin"); const Allocator = std.mem.Allocator; diff --git a/lib/std/heap/debug_allocator.zig b/lib/std/heap/debug_allocator.zig index 8abf6133bf..296014aa3f 100644 --- a/lib/std/heap/debug_allocator.zig +++ b/lib/std/heap/debug_allocator.zig @@ -851,8 +851,6 @@ pub fn DebugAllocator(comptime config: Config) type { self.mutex.lock(); defer self.mutex.unlock(); - assert(old_memory.len != 0); - const size_class_index: usize = @max(@bitSizeOf(usize) - @clz(old_memory.len - 1), @intFromEnum(alignment)); if (size_class_index >= self.buckets.len) { @branchHint(.unlikely); From 3d7c5cf64a3bbe55dfa943133d1a5a7a34fe388c Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 14:14:12 -0800 Subject: [PATCH 02/13] std.heap: test smp_allocator --- lib/std/heap.zig | 11 +++++++-- lib/std/heap/SmpAllocator.zig | 45 ----------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/lib/std/heap.zig b/lib/std/heap.zig index 10e4cec608..290b39b624 100644 --- a/lib/std/heap.zig +++ b/lib/std/heap.zig @@ -481,7 +481,7 @@ pub fn StackFallbackAllocator(comptime size: usize) type { }; } -test "c_allocator" { +test c_allocator { if (builtin.link_libc) { try testAllocator(c_allocator); try testAllocatorAligned(c_allocator); @@ -490,12 +490,19 @@ test "c_allocator" { } } -test "raw_c_allocator" { +test raw_c_allocator { if (builtin.link_libc) { try testAllocator(raw_c_allocator); } } +test smp_allocator { + try testAllocator(smp_allocator); + try testAllocatorAligned(smp_allocator); + try testAllocatorLargeAlignment(smp_allocator); + try testAllocatorAlignedShrink(smp_allocator); +} + test PageAllocator { const allocator = page_allocator; try testAllocator(allocator); diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 6fd68c19b2..b1f2b14d0a 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -241,48 +241,3 @@ fn slotSize(class: usize) usize { const Log2USize = std.math.Log2Int(usize); return @as(usize, 1) << @as(Log2USize, @intCast(class)); } - -test "large alloc, resize, remap, free" { - const gpa = std.heap.smp_allocator; - - const ptr1 = try gpa.alloc(u64, 42768); - const ptr2 = try gpa.alloc(u64, 52768); - gpa.free(ptr1); - const ptr3 = try gpa.alloc(u64, 62768); - gpa.free(ptr3); - gpa.free(ptr2); -} - -test "small allocations - free in same order" { - const gpa = std.heap.smp_allocator; - - var list = std.ArrayList(*u64).init(std.testing.allocator); - defer list.deinit(); - - var i: usize = 0; - while (i < 513) : (i += 1) { - const ptr = try gpa.create(u64); - try list.append(ptr); - } - - for (list.items) |ptr| { - gpa.destroy(ptr); - } -} - -test "small allocations - free in reverse order" { - const gpa = std.heap.smp_allocator; - - var list = std.ArrayList(*u64).init(std.testing.allocator); - defer list.deinit(); - - var i: usize = 0; - while (i < 513) : (i += 1) { - const ptr = try gpa.create(u64); - try list.append(ptr); - } - - while (list.popOrNull()) |ptr| { - gpa.destroy(ptr); - } -} From 84bf7a6701d5f51a8307736d310d4049c9158921 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 17:35:27 -0800 Subject: [PATCH 03/13] std.heap.SmpAllocator: 256K slab_len and no need for special handling of wasi and windows since we don't ask for anything more than page-aligned. 
--- lib/std/heap/SmpAllocator.zig | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index b1f2b14d0a..1a7eab2153 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -30,7 +30,6 @@ //! through the full set of freelists. const builtin = @import("builtin"); -const native_os = builtin.os.tag; const std = @import("../std.zig"); const assert = std.debug.assert; @@ -56,11 +55,7 @@ var global: SmpAllocator = .{ threadlocal var thread_id: Thread.Id = .none; const max_thread_count = 128; -const slab_len: usize = @max(std.heap.page_size_max, switch (builtin.os.tag) { - .windows => 64 * 1024, // Makes `std.heap.PageAllocator` take the happy path. - .wasi => 64 * 1024, // Max alignment supported by `std.heap.WasmAllocator`. - else => 256 * 1024, // Avoids too many active mappings when `page_size_max` is low. -}); +const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); /// Because of storing free list pointers, the minimum size class is 3. const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize))); const size_class_count = math.log2(slab_len) - min_class; From 7360be19a461175d1a50dfb1d7c086bcc650b3c1 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 17:41:50 -0800 Subject: [PATCH 04/13] compiler: use std.heap.smp_allocator In main, now this allocator is chosen by default when compiling without libc in ReleaseFast or ReleaseSmall, and not targeting WebAssembly. --- bootstrap.c | 2 +- build.zig | 6 +++--- src/main.zig | 35 ++++++++++++++++++----------------- stage1/config.zig.in | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/bootstrap.c b/bootstrap.c index a37834f463..b6345987a9 100644 --- a/bootstrap.c +++ b/bootstrap.c @@ -139,7 +139,7 @@ int main(int argc, char **argv) { "pub const enable_tracy = false;\n" "pub const value_tracing = false;\n" "pub const skip_non_native = false;\n" - "pub const force_gpa = false;\n" + "pub const debug_gpa = false;\n" "pub const dev = .core;\n" "pub const value_interpret_mode = .direct;\n" , zig_version); diff --git a/build.zig b/build.zig index 14685ffd29..1de11b2db8 100644 --- a/build.zig +++ b/build.zig @@ -171,7 +171,7 @@ pub fn build(b: *std.Build) !void { const tracy_callstack = b.option(bool, "tracy-callstack", "Include callstack information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null); const tracy_allocation = b.option(bool, "tracy-allocation", "Include allocation information with Tracy data. Does nothing if -Dtracy is not provided") orelse (tracy != null); const tracy_callstack_depth: u32 = b.option(u32, "tracy-callstack-depth", "Declare callstack depth for Tracy data. 
Does nothing if -Dtracy_callstack is not provided") orelse 10; - const force_gpa = b.option(bool, "force-gpa", "Force the compiler to use GeneralPurposeAllocator") orelse false; + const debug_gpa = b.option(bool, "debug-allocator", "Force the compiler to use DebugAllocator") orelse false; const link_libc = b.option(bool, "force-link-libc", "Force self-hosted compiler to link libc") orelse (enable_llvm or only_c); const sanitize_thread = b.option(bool, "sanitize-thread", "Enable thread-sanitization") orelse false; const strip = b.option(bool, "strip", "Omit debug information"); @@ -233,7 +233,7 @@ pub fn build(b: *std.Build) !void { exe_options.addOption(bool, "llvm_has_csky", llvm_has_csky); exe_options.addOption(bool, "llvm_has_arc", llvm_has_arc); exe_options.addOption(bool, "llvm_has_xtensa", llvm_has_xtensa); - exe_options.addOption(bool, "force_gpa", force_gpa); + exe_options.addOption(bool, "debug_gpa", debug_gpa); exe_options.addOption(DevEnv, "dev", b.option(DevEnv, "dev", "Build a compiler with a reduced feature set for development of specific features") orelse if (only_c) .bootstrap else .full); exe_options.addOption(ValueInterpretMode, "value_interpret_mode", value_interpret_mode); @@ -608,7 +608,7 @@ fn addWasiUpdateStep(b: *std.Build, version: [:0]const u8) !void { exe_options.addOption(u32, "mem_leak_frames", 0); exe_options.addOption(bool, "have_llvm", false); - exe_options.addOption(bool, "force_gpa", false); + exe_options.addOption(bool, "debug_gpa", false); exe_options.addOption([:0]const u8, "version", version); exe_options.addOption(std.SemanticVersion, "semver", semver); exe_options.addOption(bool, "enable_debug_extensions", false); diff --git a/src/main.zig b/src/main.zig index a00261cc30..401f6b2296 100644 --- a/src/main.zig +++ b/src/main.zig @@ -171,30 +171,31 @@ pub fn log( std.debug.print(prefix1 ++ prefix2 ++ format ++ "\n", args); } -var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{ +var debug_allocator: std.heap.DebugAllocator(.{ .stack_trace_frames = build_options.mem_leak_frames, -}){}; +}) = .init; pub fn main() anyerror!void { crash_report.initialize(); - const use_gpa = (build_options.force_gpa or !builtin.link_libc) and native_os != .wasi; - const gpa = gpa: { - if (native_os == .wasi) { - break :gpa std.heap.wasm_allocator; + const gpa, const is_debug = gpa: { + if (build_options.debug_gpa) break :gpa .{ debug_allocator.allocator(), true }; + if (native_os == .wasi) break :gpa .{ std.heap.wasm_allocator, false }; + if (builtin.link_libc) { + // We would prefer to use raw libc allocator here, but cannot use + // it if it won't support the alignment we need. + if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) { + break :gpa .{ std.heap.c_allocator, false }; + } + break :gpa .{ std.heap.raw_c_allocator, false }; } - if (use_gpa) { - break :gpa general_purpose_allocator.allocator(); - } - // We would prefer to use raw libc allocator here, but cannot - // use it if it won't support the alignment we need. 
- if (@alignOf(std.c.max_align_t) < @max(@alignOf(i128), std.atomic.cache_line)) { - break :gpa std.heap.c_allocator; - } - break :gpa std.heap.raw_c_allocator; + break :gpa switch (builtin.mode) { + .Debug, .ReleaseSafe => .{ debug_allocator.allocator(), true }, + .ReleaseFast, .ReleaseSmall => .{ std.heap.smp_allocator, false }, + }; }; - defer if (use_gpa) { - _ = general_purpose_allocator.deinit(); + defer if (is_debug) { + _ = debug_allocator.deinit(); }; var arena_instance = std.heap.ArenaAllocator.init(gpa); defer arena_instance.deinit(); diff --git a/stage1/config.zig.in b/stage1/config.zig.in index 47d4b4e85f..d5c9a7ebbf 100644 --- a/stage1/config.zig.in +++ b/stage1/config.zig.in @@ -11,6 +11,6 @@ pub const enable_link_snapshots = false; pub const enable_tracy = false; pub const value_tracing = false; pub const skip_non_native = false; -pub const force_gpa = false; +pub const debug_gpa = false; pub const dev = .core; pub const value_interpret_mode = .direct; From 60765a9ee2d66d4c2b870e726aa7e7bc2590e2b8 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 19:48:02 -0800 Subject: [PATCH 05/13] std.heap.SmpAllocator: implement searching on alloc rotate a couple times before resorting to mapping more memory. --- lib/std/heap/SmpAllocator.zig | 69 ++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 21 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 1a7eab2153..4d10624310 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -158,27 +158,53 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? return PageAllocator.map(len, alignment); } - const t = Thread.lock(); - defer t.unlock(); - const slot_size = slotSize(class); + const max_search = 2; + var search_count: u32 = 0; - const top_free_ptr = t.frees[class]; - if (top_free_ptr != 0) { - const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); - t.frees[class] = node.*; - return @ptrFromInt(top_free_ptr); + var t = Thread.lock(); + + outer: while (true) { + const top_free_ptr = t.frees[class]; + if (top_free_ptr != 0) { + @branchHint(.likely); + defer t.unlock(); + const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); + t.frees[class] = node.*; + return @ptrFromInt(top_free_ptr); + } + + const next_addr = t.next_addrs[class]; + if ((next_addr % slab_len) != 0) { + @branchHint(.likely); + defer t.unlock(); + t.next_addrs[class] = next_addr + slot_size; + return @ptrFromInt(next_addr); + } + + if (search_count >= max_search) { + @branchHint(.likely); + defer t.unlock(); + const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; + t.next_addrs[class] = @intFromPtr(slab) + slot_size; + return slab; + } + + t.unlock(); + t = undefined; + const cpu_count = global.cpu_count; + assert(cpu_count != 0); + var index = thread_id.toIndex(); + while (true) { + index = (index + 1) % cpu_count; + t = &global.threads[index]; + if (t.mutex.tryLock()) { + thread_id = .fromIndex(index); + search_count += 1; + continue :outer; + } + } } - - const next_addr = t.next_addrs[class]; - if (next_addr % slab_len == 0) { - const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; - t.next_addrs[class] = @intFromPtr(slab) + slot_size; - return slab; - } - - t.next_addrs[class] = next_addr + slot_size; - return @ptrFromInt(next_addr); } fn resize(context: *anyopaque, memory: []u8, alignment: mem.Alignment, 
new_len: usize, ra: usize) bool { @@ -214,12 +240,13 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) return PageAllocator.unmap(@alignCast(memory)); } - const t = Thread.lock(); - defer t.unlock(); - const slot_size = slotSize(class); const addr = @intFromPtr(memory.ptr); const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize))); + + const t = Thread.lock(); + defer t.unlock(); + node.* = t.frees[class]; t.frees[class] = addr; } From 839c453d880e22f2f120d62332b273380a215cc5 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Thu, 6 Feb 2025 21:47:46 -0800 Subject: [PATCH 06/13] std.heap.SmpAllocator: eliminate the global mutex --- lib/std/heap/SmpAllocator.zig | 86 ++++++++++------------------------- 1 file changed, 24 insertions(+), 62 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 4d10624310..b4e7138291 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -39,20 +39,14 @@ const Allocator = std.mem.Allocator; const SmpAllocator = @This(); const PageAllocator = std.heap.PageAllocator; -/// Protects the state in this struct (global state), except for `threads` -/// which each have their own mutex. -mutex: std.Thread.Mutex, -next_thread_index: u32, cpu_count: u32, threads: [max_thread_count]Thread, var global: SmpAllocator = .{ - .mutex = .{}, - .next_thread_index = 0, .threads = @splat(.{}), .cpu_count = 0, }; -threadlocal var thread_id: Thread.Id = .none; +threadlocal var thread_index: u32 = 0; const max_thread_count = 128; const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); @@ -74,60 +68,22 @@ const Thread = struct { /// For each size class, points to the freed pointer. frees: [size_class_count]usize = @splat(0), - /// Index into `SmpAllocator.threads`. 
- const Id = enum(usize) { - none = 0, - first = 1, - _, - - fn fromIndex(index: usize) Id { - return @enumFromInt(index + 1); - } - - fn toIndex(id: Id) usize { - return @intFromEnum(id) - 1; - } - }; - fn lock() *Thread { - const id = thread_id; - if (id != .none) { - var index = id.toIndex(); - { - const t = &global.threads[index]; - if (t.mutex.tryLock()) return t; - } - const cpu_count = global.cpu_count; - assert(cpu_count != 0); - while (true) { - index = (index + 1) % cpu_count; - const t = &global.threads[index]; - if (t.mutex.tryLock()) { - thread_id = .fromIndex(index); - return t; - } + var index = thread_index; + { + const t = &global.threads[index]; + if (t.mutex.tryLock()) { + @branchHint(.likely); + return t; } } + const cpu_count = getCpuCount(); + assert(cpu_count != 0); while (true) { - const thread_index = i: { - global.mutex.lock(); - defer global.mutex.unlock(); - const cpu_count = c: { - const cpu_count = global.cpu_count; - if (cpu_count == 0) { - const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); - global.cpu_count = n; - break :c n; - } - break :c cpu_count; - }; - const thread_index = global.next_thread_index; - global.next_thread_index = @intCast((thread_index + 1) % cpu_count); - break :i thread_index; - }; - const t = &global.threads[thread_index]; + index = (index + 1) % cpu_count; + const t = &global.threads[index]; if (t.mutex.tryLock()) { - thread_id = .fromIndex(thread_index); + thread_index = index; return t; } } @@ -138,6 +94,13 @@ const Thread = struct { } }; +fn getCpuCount() u32 { + const cpu_count = @atomicLoad(u32, &global.cpu_count, .unordered); + if (cpu_count != 0) return cpu_count; + const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); + return if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| other else n; +} + pub const vtable: Allocator.VTable = .{ .alloc = alloc, .resize = resize, @@ -159,8 +122,8 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? } const slot_size = slotSize(class); - const max_search = 2; - var search_count: u32 = 0; + const max_search = 1; + var search_count: u8 = 0; var t = Thread.lock(); @@ -191,15 +154,14 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? 
} t.unlock(); - t = undefined; - const cpu_count = global.cpu_count; + const cpu_count = getCpuCount(); assert(cpu_count != 0); - var index = thread_id.toIndex(); + var index = thread_index; while (true) { index = (index + 1) % cpu_count; t = &global.threads[index]; if (t.mutex.tryLock()) { - thread_id = .fromIndex(index); + thread_index = index; search_count += 1; continue :outer; } From 1ffae59fec60816ff364b4ade93c6fc2fc1571cf Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 00:47:43 -0800 Subject: [PATCH 07/13] std.heap.SmpAllocator: fix using wrong size class indices --- lib/std/heap/SmpAllocator.zig | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index b4e7138291..c1c9ca5517 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -215,13 +215,11 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { return @max( - @bitSizeOf(usize) - @clz(len - 1), - @intFromEnum(alignment), - min_class, - ); + @bitSizeOf(usize) - @clz(len + (@sizeOf(usize) - 1)), + @intFromEnum(alignment) + 1, + ) - min_class; } fn slotSize(class: usize) usize { - const Log2USize = std.math.Log2Int(usize); - return @as(usize, 1) << @as(Log2USize, @intCast(class)); + return @as(usize, 1) << @intCast(class + min_class); } From 88e2e60e88ebaeb46b2642830cd8cd369dc737c0 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 00:58:01 -0800 Subject: [PATCH 08/13] std.heap.SmpAllocator: simplify by putting freelist node at start --- lib/std/heap/SmpAllocator.zig | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index c1c9ca5517..0c2dc4348d 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -51,7 +51,7 @@ threadlocal var thread_index: u32 = 0; const max_thread_count = 128; const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); /// Because of storing free list pointers, the minimum size class is 3. -const min_class = math.log2(math.ceilPowerOfTwoAssert(usize, 1 + @sizeOf(usize))); +const min_class = math.log2(@sizeOf(usize)); const size_class_count = math.log2(slab_len) - min_class; const Thread = struct { @@ -132,7 +132,7 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? 
if (top_free_ptr != 0) { @branchHint(.likely); defer t.unlock(); - const node: *usize = @ptrFromInt(top_free_ptr + (slot_size - @sizeOf(usize))); + const node: *usize = @ptrFromInt(top_free_ptr); t.frees[class] = node.*; return @ptrFromInt(top_free_ptr); } @@ -202,22 +202,17 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) return PageAllocator.unmap(@alignCast(memory)); } - const slot_size = slotSize(class); - const addr = @intFromPtr(memory.ptr); - const node: *usize = @ptrFromInt(addr + (slot_size - @sizeOf(usize))); + const node: *usize = @alignCast(@ptrCast(memory.ptr)); const t = Thread.lock(); defer t.unlock(); node.* = t.frees[class]; - t.frees[class] = addr; + t.frees[class] = @intFromPtr(node); } fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { - return @max( - @bitSizeOf(usize) - @clz(len + (@sizeOf(usize) - 1)), - @intFromEnum(alignment) + 1, - ) - min_class; + return @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment), min_class) - min_class; } fn slotSize(class: usize) usize { From 3246150d4589a25cd1ad762a78d2806ef1da2a95 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 01:21:09 -0800 Subject: [PATCH 09/13] std.heap.SmpAllocator: fix getCpuCount logic it was always returning max_cpu_count --- lib/std/heap/SmpAllocator.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 0c2dc4348d..9dce341260 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -97,7 +97,7 @@ const Thread = struct { fn getCpuCount() u32 { const cpu_count = @atomicLoad(u32, &global.cpu_count, .unordered); if (cpu_count != 0) return cpu_count; - const n: u32 = @intCast(@max(std.Thread.getCpuCount() catch max_thread_count, max_thread_count)); + const n: u32 = @min(std.Thread.getCpuCount() catch max_thread_count, max_thread_count); return if (@cmpxchgStrong(u32, &global.cpu_count, 0, n, .monotonic, .monotonic)) |other| other else n; } From a9d30056167cbe54c0c144199f01a56f6cdcdafc Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 02:04:56 -0800 Subject: [PATCH 10/13] std.heap.SmpAllocator: fix detection of slab end --- lib/std/heap/SmpAllocator.zig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index 9dce341260..b3183b26a8 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -6,8 +6,6 @@ //! //! ## Basic Design //! -//! Avoid locking the global mutex as much as possible. -//! //! Each thread gets a separate freelist, however, the data must be recoverable //! when the thread exits. We do not directly learn when a thread exits, so //! occasionally, one thread must attempt to reclaim another thread's @@ -122,6 +120,7 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? } const slot_size = slotSize(class); + assert(slab_len % slot_size == 0); const max_search = 1; var search_count: u8 = 0; @@ -148,7 +147,8 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? if (search_count >= max_search) { @branchHint(.likely); defer t.unlock(); - const slab = PageAllocator.map(slab_len, .fromByteUnits(std.heap.pageSize())) orelse return null; + // slab alignment here ensures the % slab len earlier catches the end of slots. 
+ const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null; t.next_addrs[class] = @intFromPtr(slab) + slot_size; return slab; } From 1754e014f5da09bf83a7ee1e53132325fd78d1c1 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 14:05:28 -0800 Subject: [PATCH 11/13] std.heap.SmpAllocator: rotate on free sometimes * slab length reduced to 64K * track freelist length with u8s * on free(), rotate if freelist length exceeds max_freelist_len Prevents memory leakage in the scenario where one thread only allocates and another thread only frees. --- lib/std/heap/SmpAllocator.zig | 55 ++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/lib/std/heap/SmpAllocator.zig b/lib/std/heap/SmpAllocator.zig index b3183b26a8..50593687ec 100644 --- a/lib/std/heap/SmpAllocator.zig +++ b/lib/std/heap/SmpAllocator.zig @@ -47,10 +47,16 @@ var global: SmpAllocator = .{ threadlocal var thread_index: u32 = 0; const max_thread_count = 128; -const slab_len: usize = @max(std.heap.page_size_max, 256 * 1024); +const slab_len: usize = @max(std.heap.page_size_max, 64 * 1024); /// Because of storing free list pointers, the minimum size class is 3. const min_class = math.log2(@sizeOf(usize)); const size_class_count = math.log2(slab_len) - min_class; +/// When a freelist length exceeds this number, a `free` will rotate up to +/// `max_free_search` times before pushing. +const max_freelist_len: u8 = 16; +const max_free_search = 1; +/// Before mapping a fresh page, `alloc` will rotate this many times. +const max_alloc_search = 1; const Thread = struct { /// Avoid false sharing. @@ -62,9 +68,13 @@ const Thread = struct { /// to support freelist reclamation. mutex: std.Thread.Mutex = .{}, + /// For each size class, tracks the next address to be returned from + /// `alloc` when the freelist is empty. next_addrs: [size_class_count]usize = @splat(0), /// For each size class, points to the freed pointer. frees: [size_class_count]usize = @splat(0), + /// For each size class, tracks the number of items in the freelist. + freelist_lens: [size_class_count]u8 = @splat(0), fn lock() *Thread { var index = thread_index; @@ -121,7 +131,6 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? const slot_size = slotSize(class); assert(slab_len % slot_size == 0); - const max_search = 1; var search_count: u8 = 0; var t = Thread.lock(); @@ -133,6 +142,7 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? defer t.unlock(); const node: *usize = @ptrFromInt(top_free_ptr); t.frees[class] = node.*; + t.freelist_lens[class] -|= 1; return @ptrFromInt(top_free_ptr); } @@ -144,12 +154,13 @@ fn alloc(context: *anyopaque, len: usize, alignment: mem.Alignment, ra: usize) ? return @ptrFromInt(next_addr); } - if (search_count >= max_search) { + if (search_count >= max_alloc_search) { @branchHint(.likely); defer t.unlock(); // slab alignment here ensures the % slab len earlier catches the end of slots. 
const slab = PageAllocator.map(slab_len, .fromByteUnits(slab_len)) orelse return null; t.next_addrs[class] = @intFromPtr(slab) + slot_size; + t.freelist_lens[class] = 0; return slab; } @@ -203,12 +214,42 @@ fn free(context: *anyopaque, memory: []u8, alignment: mem.Alignment, ra: usize) } const node: *usize = @alignCast(@ptrCast(memory.ptr)); + var search_count: u8 = 0; - const t = Thread.lock(); - defer t.unlock(); + var t = Thread.lock(); - node.* = t.frees[class]; - t.frees[class] = @intFromPtr(node); + outer: while (true) { + const freelist_len = t.freelist_lens[class]; + if (freelist_len < max_freelist_len) { + @branchHint(.likely); + defer t.unlock(); + node.* = t.frees[class]; + t.frees[class] = @intFromPtr(node); + return; + } + + if (search_count >= max_free_search) { + defer t.unlock(); + t.freelist_lens[class] = freelist_len +| 1; + node.* = t.frees[class]; + t.frees[class] = @intFromPtr(node); + return; + } + + t.unlock(); + const cpu_count = getCpuCount(); + assert(cpu_count != 0); + var index = thread_index; + while (true) { + index = (index + 1) % cpu_count; + t = &global.threads[index]; + if (t.mutex.tryLock()) { + thread_index = index; + search_count += 1; + continue :outer; + } + } + } } fn sizeClassIndex(len: usize, alignment: mem.Alignment) usize { From bfabb703e32c8bbca3724ee1fa79d565adc1a200 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Fri, 7 Feb 2025 15:36:00 -0800 Subject: [PATCH 12/13] don't try to test SmpAllocator in single threaded mode --- lib/std/heap.zig | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/std/heap.zig b/lib/std/heap.zig index 290b39b624..b728d0da7e 100644 --- a/lib/std/heap.zig +++ b/lib/std/heap.zig @@ -497,6 +497,7 @@ test raw_c_allocator { } test smp_allocator { + if (builtin.single_threaded) return; try testAllocator(smp_allocator); try testAllocatorAligned(smp_allocator); try testAllocatorLargeAlignment(smp_allocator); From 975cd9fc4ff8c12ae1f54e470b72be04d26e0837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Sat, 8 Feb 2025 05:31:27 +0100 Subject: [PATCH 13/13] musl: Align the stack pointer given to clone() on riscv. --- lib/libc/musl/src/thread/riscv32/clone.s | 2 ++ lib/libc/musl/src/thread/riscv64/clone.s | 2 ++ 2 files changed, 4 insertions(+) diff --git a/lib/libc/musl/src/thread/riscv32/clone.s b/lib/libc/musl/src/thread/riscv32/clone.s index 3102239d0d..484f83a199 100644 --- a/lib/libc/musl/src/thread/riscv32/clone.s +++ b/lib/libc/musl/src/thread/riscv32/clone.s @@ -7,6 +7,8 @@ .global __clone .type __clone, %function __clone: + andi a1, a1, -16 + # Save func and arg to stack addi a1, a1, -16 sw a0, 0(a1) diff --git a/lib/libc/musl/src/thread/riscv64/clone.s b/lib/libc/musl/src/thread/riscv64/clone.s index db908248cd..187a28d2e7 100644 --- a/lib/libc/musl/src/thread/riscv64/clone.s +++ b/lib/libc/musl/src/thread/riscv64/clone.s @@ -7,6 +7,8 @@ .global __clone .type __clone, %function __clone: + andi a1, a1, -16 + # Save func and arg to stack addi a1, a1, -16 sd a0, 0(a1)
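
As a standalone illustration of how the finalized pieces fit together (not taken from the patches themselves), the sketch below restates the size-class arithmetic the series converges on in patch 08 and shows basic use of the new allocator. It assumes a toolchain that already ships std.heap.smp_allocator and std.mem.Alignment.fromByteUnits; the test names and sample sizes are illustrative only.

// Editor's sketch, not part of the patches. sizeClassIndex/slotSize mirror the
// shape they have after patch 08; everything else here is illustrative.
const std = @import("std");

/// The smallest slot must be able to hold one usize freelist link (patch 08).
const min_class = std.math.log2(@sizeOf(usize));

/// Class = log2 of the slot size, shifted so that class 0 is the usize-sized slot.
fn sizeClassIndex(len: usize, alignment: std.mem.Alignment) usize {
    return @max(@bitSizeOf(usize) - @clz(len - 1), @intFromEnum(alignment), min_class) - min_class;
}

fn slotSize(class: usize) usize {
    return @as(usize, 1) << @intCast(class + min_class);
}

test "lengths round up to power-of-two slots" {
    try std.testing.expectEqual(@as(usize, @sizeOf(usize)), slotSize(sizeClassIndex(1, .fromByteUnits(1))));
    try std.testing.expectEqual(@as(usize, 16), slotSize(sizeClassIndex(9, .fromByteUnits(1))));
    // A stricter alignment can demand a larger slot than the length alone would.
    try std.testing.expectEqual(@as(usize, 64), slotSize(sizeClassIndex(24, .fromByteUnits(64))));
}

test "basic smp_allocator usage" {
    // Mirrors the guard from patch 12: SmpAllocator refuses single-threaded builds.
    if (@import("builtin").single_threaded) return;
    const gpa = std.heap.smp_allocator;

    const small = try gpa.alloc(u64, 100); // small class: carved from a thread-local slab
    defer gpa.free(small);

    const large = try gpa.alloc(u8, 1 << 20); // too big for the size classes: memory-mapped directly
    defer gpa.free(large);
}

Because slabs are mapped with slab_len alignment (patch 10) and slots are powers of two that divide slab_len, each slot is naturally aligned to its own size, which is why feeding the requested alignment into the max() above is sufficient and no separate alignment bookkeeping is needed.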
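
A standalone sketch (not from the patches) of the intrusive freelist layout that patch 08 settles on: the next-free link lives in the first word of the freed slot itself, so small allocations carry no side metadata. The push/pop helpers and the test are hypothetical names; only the pointer handling mirrors SmpAllocator.free and the freelist branch of alloc.

// Editor's sketch, not part of the patches; hypothetical helpers illustrating
// the intrusive freelist from patch 08.
const std = @import("std");

fn push(head: *usize, slot: [*]u8) void {
    const node: *usize = @alignCast(@ptrCast(slot));
    node.* = head.*; // link the freed slot to the previous head
    head.* = @intFromPtr(slot); // the freed slot becomes the new head
}

fn pop(head: *usize) ?[*]u8 {
    const top = head.*;
    if (top == 0) return null; // empty freelist
    const node: *usize = @ptrFromInt(top);
    head.* = node.*; // unlink the head
    return @as([*]u8, @ptrFromInt(top));
}

test "intrusive freelist pops in LIFO order" {
    var slots: [4][16]u8 align(@alignOf(usize)) = undefined;
    var head: usize = 0;
    for (&slots) |*s| push(&head, s);
    try std.testing.expect(pop(&head).? == @as([*]u8, @ptrCast(&slots[3])));
    try std.testing.expect(pop(&head).? == @as([*]u8, @ptrCast(&slots[2])));
}

This is also why the series can drop the end-of-slot node placement from patch 01: once the link sits at offset 0, free() no longer needs the slot size to locate it, and min_class only has to guarantee room for one usize.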