diff --git a/lib/std/mem.zig b/lib/std/mem.zig
index 5c1c4f4394..5f8c56b0d2 100644
--- a/lib/std/mem.zig
+++ b/lib/std/mem.zig
@@ -629,10 +629,13 @@ pub fn sortUnstable(
     std.sort.pdq(T, items, context, lessThanFn);
 }
 
-/// TODO: currently this just calls `insertionSortContext`. The block sort implementation
-/// in this file needs to be adapted to use the sort context.
+/// Sorts a range [a, b) using a stable algorithm (maintains relative order of equal elements) with custom context.
+/// This is a lower-level interface for sorting that works with indices instead of slices.
+///
+/// The context must provide lessThan(a_idx, b_idx) and swap(a_idx, b_idx) methods and optionally
+/// a rotate(start_idx, end_idx, amount) method (see `mem.rotate`).
 pub fn sortContext(a: usize, b: usize, context: anytype) void {
-    std.sort.insertionContext(a, b, context);
+    std.sort.blockContext(a, b, context);
 }
 
 /// Sorts a range [a, b) using an unstable algorithm with custom context.
diff --git a/lib/std/multi_array_list.zig b/lib/std/multi_array_list.zig
index 958cca30a2..13b45823b7 100644
--- a/lib/std/multi_array_list.zig
+++ b/lib/std/multi_array_list.zig
@@ -191,7 +191,7 @@ pub fn MultiArrayList(comptime T: type) type {
                     return lhs.alignment > rhs.alignment;
                 }
             };
-            @setEvalBranchQuota(3 * fields.len * std.math.log2(fields.len));
+            @setEvalBranchQuota(10 * fields.len * std.math.log2(fields.len));
             mem.sort(Data, &data, {}, Sort.lessThan);
             var sizes_bytes: [fields.len]usize = undefined;
             var field_indexes: [fields.len]usize = undefined;
diff --git a/lib/std/sort.zig b/lib/std/sort.zig
index 8705d24017..ba1cc86714 100644
--- a/lib/std/sort.zig
+++ b/lib/std/sort.zig
@@ -7,6 +7,7 @@ const math = std.math;
 pub const Mode = enum { stable, unstable };
 
 pub const block = @import("sort/block.zig").block;
+pub const blockContext = @import("sort/block.zig").blockContext;
 pub const pdq = @import("sort/pdq.zig").pdq;
 pub const pdqContext = @import("sort/pdq.zig").pdqContext;
 
@@ -159,7 +160,7 @@ const sort_funcs = &[_]fn (comptime type, anytype, anytype, comptime anytype) vo
 };
 
 const context_sort_funcs = &[_]fn (usize, usize, anytype) void{
-    // blockContext,
+    blockContext,
     pdqContext,
     insertionContext,
     heapContext,
diff --git a/lib/std/sort/block.zig b/lib/std/sort/block.zig
index 4c94fb78ad..8ba65a4332 100644
--- a/lib/std/sort/block.zig
+++ b/lib/std/sort/block.zig
@@ -21,6 +21,7 @@ const Range = struct {
 };
 
 const Iterator = struct {
+    start_index: usize,
     size: usize,
     power_of_two: usize,
     numerator: usize,
@@ -29,12 +30,13 @@ const Iterator = struct {
     decimal_step: usize,
     numerator_step: usize,
 
-    fn init(size2: usize, min_level: usize) Iterator {
+    fn init(start_index: usize, size2: usize, min_level: usize) Iterator {
         const power_of_two = math.floorPowerOfTwo(usize, size2);
         const denominator = power_of_two / min_level;
         return Iterator{
             .numerator = 0,
-            .decimal = 0,
+            .decimal = start_index,
+            .start_index = start_index,
             .size = size2,
             .power_of_two = power_of_two,
             .denominator = denominator,
@@ -45,7 +47,7 @@ const Iterator = struct {
 
     fn begin(self: *Iterator) void {
         self.numerator = 0;
-        self.decimal = 0;
+        self.decimal = self.start_index;
     }
 
     fn nextRange(self: *Iterator) Range {
@@ -65,7 +67,7 @@ const Iterator = struct {
     }
 
     fn finished(self: *Iterator) bool {
-        return self.decimal >= self.size;
+        return self.decimal >= self.start_index + self.size;
     }
 
     fn nextLevel(self: *Iterator) bool {
@@ -103,28 +105,99 @@ pub fn block(
     context: anytype,
     comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) void {
-    const lessThan = if (builtin.mode == .Debug) struct {
-        fn lessThan(ctx: @TypeOf(context), lhs: T, rhs: T) bool {
-            const lt = lessThanFn(ctx, lhs, rhs);
-            const gt = lessThanFn(ctx, rhs, lhs);
+    const Context = struct {
+        items: []T,
+        sub_ctx: @TypeOf(context),
+
+        pub fn lessThan(ctx: @This(), a: usize, b: usize) bool {
+            return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]);
+        }
+
+        pub fn swap(ctx: @This(), a: usize, b: usize) void {
+            return mem.swap(T, &ctx.items[a], &ctx.items[b]);
+        }
+
+        pub fn rotate(ctx: @This(), a: usize, b: usize, amount: usize) void {
+            return mem.rotate(T, ctx.items[a..b], amount);
+        }
+    };
+    return blockContext(0, items.len, Context{ .items = items, .sub_ctx = context });
+}
+
+/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case.
+/// O(1) memory (no allocator required).
+/// Sorts in ascending order with respect to the given `lessThan` function.
+/// `context` must have methods `swap` and `lessThan`,
+/// which each take 2 `usize` parameters indicating the index of an item. Optionally
+/// the `context` can define a `rotate` method which takes 2 `usize` parameters
+/// indicating the start and end index and another `usize` indicating how many
+/// steps to rotate.
+///
+/// NOTE: The algorithm only works when the comparison is less-than or greater-than.
+///       (See https://github.com/ziglang/zig/issues/8289)
+pub fn blockContext(
+    a: usize,
+    b: usize,
+    context: anytype,
+) void {
+    // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c
+    const ContextType = @TypeOf(context);
+    const Context = struct {
+        sub_ctx: ContextType,
+
+        pub const lessThan = if (builtin.mode == .Debug) lessThanChecked else lessThanUnchecked;
+
+        fn lessThanChecked(ctx: @This(), i: usize, j: usize) bool {
+            const lt = ctx.sub_ctx.lessThan(i, j);
+            const gt = ctx.sub_ctx.lessThan(j, i);
             std.debug.assert(!(lt and gt));
             return lt;
         }
-    }.lessThan else lessThanFn;
 
-    // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c
-    var cache: [512]T = undefined;
+        fn lessThanUnchecked(ctx: @This(), i: usize, j: usize) bool {
+            return ctx.sub_ctx.lessThan(i, j);
+        }
 
-    if (items.len < 4) {
-        if (items.len == 3) {
-            // hard coded insertion sort
-            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
-            if (lessThan(context, items[2], items[1])) {
-                mem.swap(T, &items[1], &items[2]);
-                if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+        pub fn swap(ctx: @This(), i: usize, j: usize) void {
+            return ctx.sub_ctx.swap(i, j);
+        }
+
+        pub const rotate = if (std.meta.hasFn(ContextType, "rotate")) innerRotate else naiveRotate;
+
+        fn innerRotate(ctx: @This(), A: Range, amount: usize) void {
+            ctx.sub_ctx.rotate(A.start, A.end, amount);
+        }
+
+        fn naiveRotate(ctx: @This(), A: Range, amount: usize) void {
+            ctx.naiveReverse(Range.init(A.start, A.start + amount));
+            ctx.naiveReverse(Range.init(A.start + amount, A.end));
+            ctx.naiveReverse(A);
+        }
+
+        fn naiveReverse(ctx: @This(), A: Range) void {
+            var i = A.start;
+            var j = A.end - 1;
+            while (j > i) {
+                ctx.sub_ctx.swap(i, j);
+                i += 1;
+                j -= 1;
             }
-        } else if (items.len == 2) {
-            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+        }
+    };
+    const wrapped_context = Context{ .sub_ctx = context };
+
+    const range_length = b - a;
+
+    if (range_length < 4) {
+        if (range_length == 3) {
+            // hard coded insertion sort
+            if (wrapped_context.lessThan(a + 1, a + 0)) wrapped_context.swap(a + 0, a + 1);
+            if (wrapped_context.lessThan(a + 2, a + 1)) {
+                wrapped_context.swap(a + 1, a + 2);
+                if (wrapped_context.lessThan(a + 1, a + 0)) wrapped_context.swap(a + 0, a + 1);
+            }
+        } else if (range_length == 2) {
+            if (wrapped_context.lessThan(a + 1, a + 0)) wrapped_context.swap(a + 0, a + 1);
         }
         return;
     }
@@ -132,620 +205,491 @@ pub fn block(
     // sort groups of 4-8 items at a time using an unstable sorting network,
     // but keep track of the original item orders to force it to be stable
     // http://pages.ripco.net/~jgamble/nw.html
-    var iterator = Iterator.init(items.len, 4);
+    var iterator = Iterator.init(a, range_length, 4);
     while (!iterator.finished()) {
         var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 };
         const range = iterator.nextRange();
 
-        const sliced_items = items[range.start..];
         switch (range.length()) {
             8 => {
-                swap(T, sliced_items, &order, 0, 1, context, lessThan);
-                swap(T, sliced_items, &order, 2, 3, context, lessThan);
-                swap(T, sliced_items, &order, 4, 5, context, lessThan);
-                swap(T, sliced_items, &order, 6, 7, context, lessThan);
-                swap(T, sliced_items, &order, 0, 2, context, lessThan);
-                swap(T, sliced_items, &order, 1, 3, context, lessThan);
-                swap(T, sliced_items, &order, 4, 6, context, lessThan);
-                swap(T, sliced_items, &order, 5, 7, context, lessThan);
-                swap(T, sliced_items, &order, 1, 2, context, lessThan);
-                swap(T, sliced_items, &order, 5, 6, context, lessThan);
-                swap(T, sliced_items, &order, 0, 4, context, lessThan);
-                swap(T, sliced_items, &order, 3, 7, context, lessThan);
-                swap(T, sliced_items, &order, 1, 5, context, lessThan);
-                swap(T, sliced_items, &order, 2, 6, context, lessThan);
-                swap(T, sliced_items, &order, 1, 4, context, lessThan);
-                swap(T, sliced_items, &order, 3, 6, context, lessThan);
-                swap(T, sliced_items, &order, 2, 4, context, lessThan);
-                swap(T, sliced_items, &order, 3, 5, context, lessThan);
-                swap(T, sliced_items, &order, 3, 4, context, lessThan);
+                swap(&order, range.start, 0, 1, wrapped_context);
+                swap(&order, range.start, 2, 3, wrapped_context);
+                swap(&order, range.start, 4, 5, wrapped_context);
+                swap(&order, range.start, 6, 7, wrapped_context);
+                swap(&order, range.start, 0, 2, wrapped_context);
+                swap(&order, range.start, 1, 3, wrapped_context);
+                swap(&order, range.start, 4, 6, wrapped_context);
+                swap(&order, range.start, 5, 7, wrapped_context);
+                swap(&order, range.start, 1, 2, wrapped_context);
+                swap(&order, range.start, 5, 6, wrapped_context);
+                swap(&order, range.start, 0, 4, wrapped_context);
+                swap(&order, range.start, 3, 7, wrapped_context);
+                swap(&order, range.start, 1, 5, wrapped_context);
+                swap(&order, range.start, 2, 6, wrapped_context);
+                swap(&order, range.start, 1, 4, wrapped_context);
+                swap(&order, range.start, 3, 6, wrapped_context);
+                swap(&order, range.start, 2, 4, wrapped_context);
+                swap(&order, range.start, 3, 5, wrapped_context);
+                swap(&order, range.start, 3, 4, wrapped_context);
             },
             7 => {
-                swap(T, sliced_items, &order, 1, 2, context, lessThan);
-                swap(T, sliced_items, &order, 3, 4, context, lessThan);
-                swap(T, sliced_items, &order, 5, 6, context, lessThan);
-                swap(T, sliced_items, &order, 0, 2, context, lessThan);
-                swap(T, sliced_items, &order, 3, 5, context, lessThan);
-                swap(T, sliced_items, &order, 4, 6, context, lessThan);
-                swap(T, sliced_items, &order, 0, 1, context, lessThan);
-                swap(T, sliced_items, &order, 4, 5, context, lessThan);
-                swap(T, sliced_items, &order, 2, 6, context, lessThan);
-                swap(T, sliced_items, &order, 0, 4, context, lessThan);
-                swap(T, sliced_items, &order, 1, 5, context, lessThan);
-                swap(T, sliced_items, &order, 0, 3, context, lessThan);
-                swap(T, sliced_items, &order, 2, 5, context, lessThan);
-                swap(T, sliced_items, &order, 1, 3, context, lessThan);
-                swap(T, sliced_items, &order, 2, 4, context, lessThan);
-                swap(T, sliced_items, &order, 2, 3, context, lessThan);
+                swap(&order, range.start, 1, 2, wrapped_context);
+                swap(&order, range.start, 3, 4, wrapped_context);
+                swap(&order, range.start, 5, 6, wrapped_context);
+                swap(&order, range.start, 0, 2, wrapped_context);
+                swap(&order, range.start, 3, 5, wrapped_context);
+                swap(&order, range.start, 4, 6, wrapped_context);
+                swap(&order, range.start, 0, 1, wrapped_context);
+                swap(&order, range.start, 4, 5, wrapped_context);
+                swap(&order, range.start, 2, 6, wrapped_context);
+                swap(&order, range.start, 0, 4, wrapped_context);
+                swap(&order, range.start, 1, 5, wrapped_context);
+                swap(&order, range.start, 0, 3, wrapped_context);
+                swap(&order, range.start, 2, 5, wrapped_context);
+                swap(&order, range.start, 1, 3, wrapped_context);
+                swap(&order, range.start, 2, 4, wrapped_context);
+                swap(&order, range.start, 2, 3, wrapped_context);
             },
             6 => {
-                swap(T, sliced_items, &order, 1, 2, context, lessThan);
-                swap(T, sliced_items, &order, 4, 5, context, lessThan);
-                swap(T, sliced_items, &order, 0, 2, context, lessThan);
-                swap(T, sliced_items, &order, 3, 5, context, lessThan);
-                swap(T, sliced_items, &order, 0, 1, context, lessThan);
-                swap(T, sliced_items, &order, 3, 4, context, lessThan);
-                swap(T, sliced_items, &order, 2, 5, context, lessThan);
-                swap(T, sliced_items, &order, 0, 3, context, lessThan);
-                swap(T, sliced_items, &order, 1, 4, context, lessThan);
-                swap(T, sliced_items, &order, 2, 4, context, lessThan);
-                swap(T, sliced_items, &order, 1, 3, context, lessThan);
-                swap(T, sliced_items, &order, 2, 3, context, lessThan);
+                swap(&order, range.start, 1, 2, wrapped_context);
+                swap(&order, range.start, 4, 5, wrapped_context);
+                swap(&order, range.start, 0, 2, wrapped_context);
+                swap(&order, range.start, 3, 5, wrapped_context);
+                swap(&order, range.start, 0, 1, wrapped_context);
+                swap(&order, range.start, 3, 4, wrapped_context);
+                swap(&order, range.start, 2, 5, wrapped_context);
+                swap(&order, range.start, 0, 3, wrapped_context);
+                swap(&order, range.start, 1, 4, wrapped_context);
+                swap(&order, range.start, 2, 4, wrapped_context);
+                swap(&order, range.start, 1, 3, wrapped_context);
+                swap(&order, range.start, 2, 3, wrapped_context);
             },
             5 => {
-                swap(T, sliced_items, &order, 0, 1, context, lessThan);
-                swap(T, sliced_items, &order, 3, 4, context, lessThan);
-                swap(T, sliced_items, &order, 2, 4, context, lessThan);
-                swap(T, sliced_items, &order, 2, 3, context, lessThan);
-                swap(T, sliced_items, &order, 1, 4, context, lessThan);
-                swap(T, sliced_items, &order, 0, 3, context, lessThan);
-                swap(T, sliced_items, &order, 0, 2, context, lessThan);
-                swap(T, sliced_items, &order, 1, 3, context, lessThan);
-                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(&order, range.start, 0, 1, wrapped_context);
+                swap(&order, range.start, 3, 4, wrapped_context);
+                swap(&order, range.start, 2, 4, wrapped_context);
+                swap(&order, range.start, 2, 3, wrapped_context);
+                swap(&order, range.start, 1, 4, wrapped_context);
+                swap(&order, range.start, 0, 3, wrapped_context);
+                swap(&order, range.start, 0, 2, wrapped_context);
+                swap(&order, range.start, 1, 3, wrapped_context);
+                swap(&order, range.start, 1, 2, wrapped_context);
             },
             4 => {
-                swap(T, sliced_items, &order, 0, 1, context, lessThan);
-                swap(T, sliced_items, &order, 2, 3, context, lessThan);
-                swap(T, sliced_items, &order, 0, 2, context, lessThan);
-                swap(T, sliced_items, &order, 1, 3, context, lessThan);
-                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(&order, range.start, 0, 1, wrapped_context);
+                swap(&order, range.start, 2, 3, wrapped_context);
+                swap(&order, range.start, 0, 2, wrapped_context);
+                swap(&order, range.start, 1, 3, wrapped_context);
+                swap(&order, range.start, 1, 2, wrapped_context);
             },
             else => {},
         }
     }
-    if (items.len < 8) return;
+    if (range_length < 8) return;
 
     // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc.
     while (true) {
-        // if every A and B block will fit into the cache, use a special branch
-        // specifically for merging with the cache
-        // (we use < rather than <= since the block size might be one more than
-        // iterator.length())
-        if (iterator.length() < cache.len) {
-            // if four subarrays fit into the cache, it's faster to merge both
-            // pairs of subarrays into the cache,
-            // then merge the two merged subarrays from the cache back into the original array
-            if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) {
-                iterator.begin();
-                while (!iterator.finished()) {
-                    // merge A1 and B1 into the cache
-                    var A1 = iterator.nextRange();
-                    var B1 = iterator.nextRange();
-                    var A2 = iterator.nextRange();
-                    var B2 = iterator.nextRange();
+        // this is where the in-place merge logic starts!
+        // 1. pull out two internal buffers each containing √A unique values
+        //    1a. adjust block_size and buffer_size if we couldn't find enough unique values
+        // 2. loop over the A and B subarrays within this level of the merge sort
+        // 3. break A and B into blocks of size 'block_size'
+        // 4. "tag" each of the A blocks with values from the first internal buffer
+        // 5. roll the A blocks through the B blocks and drop/rotate them where they belong
+        // 6. merge each A block with any B values that follow, using the second internal buffer
+        // 7. sort the second internal buffer if it exists
+        // 8. redistribute the two internal buffers back into the items
+        var block_size: usize = math.sqrt(iterator.length());
+        var buffer_size = iterator.length() / block_size + 1;
 
-                    if (lessThan(context, items[B1.end - 1], items[A1.start])) {
-                        // the two ranges are in reverse order, so copy them in reverse order into the cache
-                        const a1_items = items[A1.start..A1.end];
-                        @memcpy(cache[B1.length()..][0..a1_items.len], a1_items);
-                        const b1_items = items[B1.start..B1.end];
-                        @memcpy(cache[0..b1_items.len], b1_items);
-                    } else if (lessThan(context, items[B1.start], items[A1.end - 1])) {
-                        // these two ranges weren't already in order, so merge them into the cache
-                        mergeInto(T, items, A1, B1, cache[0..], context, lessThan);
-                    } else {
-                        // if A1, B1, A2, and B2 are all in order, skip doing anything else
-                        if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue;
+        // as an optimization, we really only need to pull out the internal buffers once for each level of merges
+        // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level
+        var A: Range = undefined;
+        var B: Range = undefined;
+        var index: usize = 0;
+        var last: usize = 0;
+        var count: usize = 0;
+        var find: usize = 0;
+        var start: usize = 0;
+        var pull_index: usize = 0;
+        var pull = [_]Pull{
+            Pull{
+                .from = 0,
+                .to = 0,
+                .count = 0,
+                .range = Range.init(0, 0),
+            },
+            Pull{
+                .from = 0,
+                .to = 0,
+                .count = 0,
+                .range = Range.init(0, 0),
+            },
+        };
 
-                        // copy A1 and B1 into the cache in the same order
-                        const a1_items = items[A1.start..A1.end];
-                        @memcpy(cache[0..a1_items.len], a1_items);
-                        const b1_items = items[B1.start..B1.end];
-                        @memcpy(cache[A1.length()..][0..b1_items.len], b1_items);
-                    }
-                    A1 = Range.init(A1.start, B1.end);
+        var buffer1 = Range.init(0, 0);
+        var buffer2 = Range.init(0, 0);
 
-                    // merge A2 and B2 into the cache
-                    if (lessThan(context, items[B2.end - 1], items[A2.start])) {
-                        // the two ranges are in reverse order, so copy them in reverse order into the cache
-                        const a2_items = items[A2.start..A2.end];
-                        @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items);
-                        const b2_items = items[B2.start..B2.end];
-                        @memcpy(cache[A1.length()..][0..b2_items.len], b2_items);
-                    } else if (lessThan(context, items[B2.start], items[A2.end - 1])) {
-                        // these two ranges weren't already in order, so merge them into the cache
-                        mergeInto(T, items, A2, B2, cache[A1.length()..], context, lessThan);
-                    } else {
-                        // copy A2 and B2 into the cache in the same order
-                        const a2_items = items[A2.start..A2.end];
-                        @memcpy(cache[A1.length()..][0..a2_items.len], a2_items);
-                        const b2_items = items[B2.start..B2.end];
-                        @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items);
-                    }
-                    A2 = Range.init(A2.start, B2.end);
+        // find two internal buffers of size 'buffer_size' each
+        find = buffer_size + buffer_size;
+        var find_separately = false;
 
-                    // merge A1 and A2 from the cache into the items
-                    const A3 = Range.init(0, A1.length());
-                    const B3 = Range.init(A1.length(), A1.length() + A2.length());
+        if (find > iterator.length()) {
+            // we can't fit both buffers into the same A or B subarray, so find two buffers separately
+            find = buffer_size;
+            find_separately = true;
+        }
 
-                    if (lessThan(context, cache[B3.end - 1], cache[A3.start])) {
-                        // the two ranges are in reverse order, so copy them in reverse order into the items
-                        const a3_items = cache[A3.start..A3.end];
-                        @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items);
-                        const b3_items = cache[B3.start..B3.end];
-                        @memcpy(items[A1.start..][0..b3_items.len], b3_items);
-                    } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) {
-                        // these two ranges weren't already in order, so merge them back into the items
-                        mergeInto(T, cache[0..], A3, B3, items[A1.start..], context, lessThan);
-                    } else {
-                        // copy A3 and B3 into the items in the same order
-                        const a3_items = cache[A3.start..A3.end];
-                        @memcpy(items[A1.start..][0..a3_items.len], a3_items);
-                        const b3_items = cache[B3.start..B3.end];
-                        @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items);
-                    }
-                }
+        // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each),
+        // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values,
+        // OR if we couldn't find that many unique values, we need the largest possible buffer we can get
 
-                // we merged two levels at the same time, so we're done with this level already
-                // (iterator.nextLevel() is called again at the bottom of this outer merge loop)
-                _ = iterator.nextLevel();
-            } else {
-                iterator.begin();
-                while (!iterator.finished()) {
-                    const A = iterator.nextRange();
-                    const B = iterator.nextRange();
+        // in the case where it couldn't find a single buffer of at least √A unique values,
+        // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace)
+        iterator.begin();
+        while (!iterator.finished()) {
+            A = iterator.nextRange();
+            B = iterator.nextRange();
 
-                    if (lessThan(context, items[B.end - 1], items[A.start])) {
-                        // the two ranges are in reverse order, so a simple rotation should fix it
-                        mem.rotate(T, items[A.start..B.end], A.length());
-                    } else if (lessThan(context, items[B.start], items[A.end - 1])) {
-                        // these two ranges weren't already in order, so we'll need to merge them!
-                        const a_items = items[A.start..A.end];
-                        @memcpy(cache[0..a_items.len], a_items);
-                        mergeExternal(T, items, A, B, cache[0..], context, lessThan);
-                    }
-                }
+            // just store information about where the values will be pulled from and to,
+            // as well as how many values there are, to create the two internal buffers
+
+            // check A for the number of unique values we need to fill an internal buffer
+            // these values will be pulled out to the start of A
+            last = A.start;
+            count = 1;
+            while (count < find) : ({
+                last = index;
+                count += 1;
+            }) {
+                index = findLastForward(last, Range.init(last + 1, A.end), find - count, wrapped_context);
+                if (index == A.end) break;
             }
-        } else {
-            // this is where the in-place merge logic starts!
-            // 1. pull out two internal buffers each containing √A unique values
-            //    1a. adjust block_size and buffer_size if we couldn't find enough unique values
-            // 2. loop over the A and B subarrays within this level of the merge sort
-            // 3. break A and B into blocks of size 'block_size'
-            // 4. "tag" each of the A blocks with values from the first internal buffer
-            // 5. roll the A blocks through the B blocks and drop/rotate them where they belong
-            // 6. merge each A block with any B values that follow, using the cache or the second internal buffer
-            // 7. sort the second internal buffer if it exists
-            // 8. redistribute the two internal buffers back into the items
-            var block_size: usize = math.sqrt(iterator.length());
-            var buffer_size = iterator.length() / block_size + 1;
+            index = last;
 
-            // as an optimization, we really only need to pull out the internal buffers once for each level of merges
-            // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level
-            var A: Range = undefined;
-            var B: Range = undefined;
-            var index: usize = 0;
-            var last: usize = 0;
-            var count: usize = 0;
-            var find: usize = 0;
-            var start: usize = 0;
-            var pull_index: usize = 0;
-            var pull = [_]Pull{
-                Pull{
-                    .from = 0,
-                    .to = 0,
-                    .count = 0,
-                    .range = Range.init(0, 0),
-                },
-                Pull{
-                    .from = 0,
-                    .to = 0,
-                    .count = 0,
-                    .range = Range.init(0, 0),
-                },
-            };
+            if (count >= buffer_size) {
+                // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
+                pull[pull_index] = Pull{
+                    .range = Range.init(A.start, B.end),
+                    .count = count,
+                    .from = index,
+                    .to = A.start,
+                };
+                pull_index = 1;
 
-            var buffer1 = Range.init(0, 0);
-            var buffer2 = Range.init(0, 0);
-
-            // find two internal buffers of size 'buffer_size' each
-            find = buffer_size + buffer_size;
-            var find_separately = false;
-
-            if (block_size <= cache.len) {
-                // if every A block fits into the cache then we won't need the second internal buffer,
-                // so we really only need to find 'buffer_size' unique values
-                find = buffer_size;
-            } else if (find > iterator.length()) {
-                // we can't fit both buffers into the same A or B subarray, so find two buffers separately
-                find = buffer_size;
-                find_separately = true;
-            }
-
-            // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each),
-            // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values,
-            // OR if we couldn't find that many unique values, we need the largest possible buffer we can get
-
-            // in the case where it couldn't find a single buffer of at least √A unique values,
-            // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace)
-            iterator.begin();
-            while (!iterator.finished()) {
-                A = iterator.nextRange();
-                B = iterator.nextRange();
-
-                // just store information about where the values will be pulled from and to,
-                // as well as how many values there are, to create the two internal buffers
-
-                // check A for the number of unique values we need to fill an internal buffer
-                // these values will be pulled out to the start of A
-                last = A.start;
-                count = 1;
-                while (count < find) : ({
-                    last = index;
-                    count += 1;
-                }) {
-                    index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), find - count, context, lessThan);
-                    if (index == A.end) break;
-                }
-                index = last;
-
-                if (count >= buffer_size) {
-                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
-                    pull[pull_index] = Pull{
-                        .range = Range.init(A.start, B.end),
-                        .count = count,
-                        .from = index,
-                        .to = A.start,
-                    };
-                    pull_index = 1;
-
-                    if (count == buffer_size + buffer_size) {
-                        // we were able to find a single contiguous section containing 2√A unique values,
-                        // so this section can be used to contain both of the internal buffers we'll need
-                        buffer1 = Range.init(A.start, A.start + buffer_size);
-                        buffer2 = Range.init(A.start + buffer_size, A.start + count);
-                        break;
-                    } else if (find == buffer_size + buffer_size) {
-                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
-                        // so we still need to find a second separate buffer of at least √A unique values
-                        buffer1 = Range.init(A.start, A.start + count);
-                        find = buffer_size;
-                    } else if (block_size <= cache.len) {
-                        // we found the first and only internal buffer that we need, so we're done!
-                        buffer1 = Range.init(A.start, A.start + count);
-                        break;
-                    } else if (find_separately) {
-                        // found one buffer, but now find the other one
-                        buffer1 = Range.init(A.start, A.start + count);
-                        find_separately = false;
-                    } else {
-                        // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
-                        buffer2 = Range.init(A.start, A.start + count);
-                        break;
-                    }
-                } else if (pull_index == 0 and count > buffer1.length()) {
-                    // keep track of the largest buffer we were able to find
+                if (count == buffer_size + buffer_size) {
+                    // we were able to find a single contiguous section containing 2√A unique values,
+                    // so this section can be used to contain both of the internal buffers we'll need
+                    buffer1 = Range.init(A.start, A.start + buffer_size);
+                    buffer2 = Range.init(A.start + buffer_size, A.start + count);
+                    break;
+                } else if (find == buffer_size + buffer_size) {
+                    // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
+                    // so we still need to find a second separate buffer of at least √A unique values
                     buffer1 = Range.init(A.start, A.start + count);
-                    pull[pull_index] = Pull{
-                        .range = Range.init(A.start, B.end),
-                        .count = count,
-                        .from = index,
-                        .to = A.start,
-                    };
+                    find = buffer_size;
+                } else if (find_separately) {
+                    // found one buffer, but now find the other one
+                    buffer1 = Range.init(A.start, A.start + count);
+                    find_separately = false;
+                } else {
+                    // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
+                    buffer2 = Range.init(A.start, A.start + count);
+                    break;
                 }
+            } else if (pull_index == 0 and count > buffer1.length()) {
+                // keep track of the largest buffer we were able to find
+                buffer1 = Range.init(A.start, A.start + count);
+                pull[pull_index] = Pull{
+                    .range = Range.init(A.start, B.end),
+                    .count = count,
+                    .from = index,
+                    .to = A.start,
+                };
+            }
 
-                // check B for the number of unique values we need to fill an internal buffer
-                // these values will be pulled out to the end of B
-                last = B.end - 1;
-                count = 1;
-                while (count < find) : ({
-                    last = index - 1;
-                    count += 1;
-                }) {
-                    index = findFirstBackward(T, items, items[last], Range.init(B.start, last), find - count, context, lessThan);
-                    if (index == B.start) break;
-                }
-                index = last;
+            // check B for the number of unique values we need to fill an internal buffer
+            // these values will be pulled out to the end of B
+            last = B.end - 1;
+            count = 1;
+            while (count < find) : ({
+                last = index - 1;
+                count += 1;
+            }) {
+                index = findFirstBackward(last, Range.init(B.start, last), find - count, wrapped_context);
+                if (index == B.start) break;
+            }
+            index = last;
 
-                if (count >= buffer_size) {
-                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffe
-                    pull[pull_index] = Pull{
-                        .range = Range.init(A.start, B.end),
-                        .count = count,
-                        .from = index,
-                        .to = B.end,
-                    };
-                    pull_index = 1;
+            if (count >= buffer_size) {
+                // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffe
+                pull[pull_index] = Pull{
+                    .range = Range.init(A.start, B.end),
+                    .count = count,
+                    .from = index,
+                    .to = B.end,
+                };
+                pull_index = 1;
 
-                    if (count == buffer_size + buffer_size) {
-                        // we were able to find a single contiguous section containing 2√A unique values,
-                        // so this section can be used to contain both of the internal buffers we'll need
-                        buffer1 = Range.init(B.end - count, B.end - buffer_size);
-                        buffer2 = Range.init(B.end - buffer_size, B.end);
-                        break;
-                    } else if (find == buffer_size + buffer_size) {
-                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
-                        // so we still need to find a second separate buffer of at least √A unique values
-                        buffer1 = Range.init(B.end - count, B.end);
-                        find = buffer_size;
-                    } else if (block_size <= cache.len) {
-                        // we found the first and only internal buffer that we need, so we're done!
-                        buffer1 = Range.init(B.end - count, B.end);
-                        break;
-                    } else if (find_separately) {
-                        // found one buffer, but now find the other one
-                        buffer1 = Range.init(B.end - count, B.end);
-                        find_separately = false;
-                    } else {
-                        // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
-                        // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
-                        if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;
-
-                        // we found a second buffer in an 'B' subarray containing √A unique values, so we're done!
-                        buffer2 = Range.init(B.end - count, B.end);
-                        break;
-                    }
-                } else if (pull_index == 0 and count > buffer1.length()) {
-                    // keep track of the largest buffer we were able to find
+                if (count == buffer_size + buffer_size) {
+                    // we were able to find a single contiguous section containing 2√A unique values,
+                    // so this section can be used to contain both of the internal buffers we'll need
+                    buffer1 = Range.init(B.end - count, B.end - buffer_size);
+                    buffer2 = Range.init(B.end - buffer_size, B.end);
+                    break;
+                } else if (find == buffer_size + buffer_size) {
+                    // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
+                    // so we still need to find a second separate buffer of at least √A unique values
                     buffer1 = Range.init(B.end - count, B.end);
-                    pull[pull_index] = Pull{
-                        .range = Range.init(A.start, B.end),
-                        .count = count,
-                        .from = index,
-                        .to = B.end,
-                    };
+                    find = buffer_size;
+                } else if (find_separately) {
+                    // found one buffer, but now find the other one
+                    buffer1 = Range.init(B.end - count, B.end);
+                    find_separately = false;
+                } else {
+                    // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
+                    // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
+                    if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;
+
+                    // we found a second buffer in an 'B' subarray containing √A unique values, so we're done!
+                    buffer2 = Range.init(B.end - count, B.end);
+                    break;
+                }
+            } else if (pull_index == 0 and count > buffer1.length()) {
+                // keep track of the largest buffer we were able to find
+                buffer1 = Range.init(B.end - count, B.end);
+                pull[pull_index] = Pull{
+                    .range = Range.init(A.start, B.end),
+                    .count = count,
+                    .from = index,
+                    .to = B.end,
+                };
+            }
+        }
+
+        // pull out the two ranges so we can use them as internal buffers
+        pull_index = 0;
+        while (pull_index < 2) : (pull_index += 1) {
+            const length = pull[pull_index].count;
+
+            if (pull[pull_index].to < pull[pull_index].from) {
+                // we're pulling the values out to the left, which means the start of an A subarray
+                index = pull[pull_index].from;
+                count = 1;
+                while (count < length) : (count += 1) {
+                    index = findFirstBackward(index - 1, Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), length - count, wrapped_context);
+                    const range = Range.init(index + 1, pull[pull_index].from + 1);
+                    wrapped_context.rotate(range, range.length() - count);
+                    pull[pull_index].from = index + count;
+                }
+            } else if (pull[pull_index].to > pull[pull_index].from) {
+                // we're pulling values out to the right, which means the end of a B subarray
+                index = pull[pull_index].from + 1;
+                count = 1;
+                while (count < length) : (count += 1) {
+                    index = findLastForward(index, Range.init(index, pull[pull_index].to), length - count, wrapped_context);
+                    const range = Range.init(pull[pull_index].from, index - 1);
+                    wrapped_context.rotate(range, count);
+                    pull[pull_index].from = index - 1 - count;
+                }
+            }
+        }
+
+        // adjust block_size and buffer_size based on the values we were able to pull out
+        buffer_size = buffer1.length();
+        block_size = iterator.length() / buffer_size + 1;
+
+        // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks,
+        // so this was originally here to test the math for adjusting block_size above
+        // assert((iterator.length() + 1)/block_size <= buffer_size);
+
+        // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort!
+        iterator.begin();
+        while (!iterator.finished()) {
+            A = iterator.nextRange();
+            B = iterator.nextRange();
+
+            // remove any parts of A or B that are being used by the internal buffers
+            start = A.start;
+            if (start == pull[0].range.start) {
+                if (pull[0].from > pull[0].to) {
+                    A.start += pull[0].count;
+
+                    // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge
+                    // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4
+                    if (A.length() == 0) continue;
+                } else if (pull[0].from < pull[0].to) {
+                    B.end -= pull[0].count;
+                    if (B.length() == 0) continue;
+                }
+            }
+            if (start == pull[1].range.start) {
+                if (pull[1].from > pull[1].to) {
+                    A.start += pull[1].count;
+                    if (A.length() == 0) continue;
+                } else if (pull[1].from < pull[1].to) {
+                    B.end -= pull[1].count;
+                    if (B.length() == 0) continue;
                 }
             }
 
-            // pull out the two ranges so we can use them as internal buffers
-            pull_index = 0;
-            while (pull_index < 2) : (pull_index += 1) {
-                const length = pull[pull_index].count;
+            if (wrapped_context.lessThan(B.end - 1, A.start)) {
+                // the two ranges are in reverse order, so a simple rotation should fix it
+                wrapped_context.rotate(Range.init(A.start, B.end), A.length());
+            } else if (wrapped_context.lessThan(A.end, A.end - 1)) {
+                // these two ranges weren't already in order, so we'll need to merge them!
+                var findA: usize = undefined;
 
-                if (pull[pull_index].to < pull[pull_index].from) {
-                    // we're pulling the values out to the left, which means the start of an A subarray
-                    index = pull[pull_index].from;
-                    count = 1;
-                    while (count < length) : (count += 1) {
-                        index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), length - count, context, lessThan);
-                        const range = Range.init(index + 1, pull[pull_index].from + 1);
-                        mem.rotate(T, items[range.start..range.end], range.length() - count);
-                        pull[pull_index].from = index + count;
-                    }
-                } else if (pull[pull_index].to > pull[pull_index].from) {
-                    // we're pulling values out to the right, which means the end of a B subarray
-                    index = pull[pull_index].from + 1;
-                    count = 1;
-                    while (count < length) : (count += 1) {
-                        index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), length - count, context, lessThan);
-                        const range = Range.init(pull[pull_index].from, index - 1);
-                        mem.rotate(T, items[range.start..range.end], count);
-                        pull[pull_index].from = index - 1 - count;
-                    }
-                }
-            }
+                // break the remainder of A into blocks. firstA is the uneven-sized first A block
+                var blockA = Range.init(A.start, A.end);
+                var firstA = Range.init(A.start, A.start + blockA.length() % block_size);
 
-            // adjust block_size and buffer_size based on the values we were able to pull out
-            buffer_size = buffer1.length();
-            block_size = iterator.length() / buffer_size + 1;
-
-            // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks,
-            // so this was originally here to test the math for adjusting block_size above
-            // assert((iterator.length() + 1)/block_size <= buffer_size);
-
-            // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort!
-            iterator.begin();
-            while (!iterator.finished()) {
-                A = iterator.nextRange();
-                B = iterator.nextRange();
-
-                // remove any parts of A or B that are being used by the internal buffers
-                start = A.start;
-                if (start == pull[0].range.start) {
-                    if (pull[0].from > pull[0].to) {
-                        A.start += pull[0].count;
-
-                        // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge
-                        // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4,
-                        // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal
-                        if (A.length() == 0) continue;
-                    } else if (pull[0].from < pull[0].to) {
-                        B.end -= pull[0].count;
-                        if (B.length() == 0) continue;
-                    }
-                }
-                if (start == pull[1].range.start) {
-                    if (pull[1].from > pull[1].to) {
-                        A.start += pull[1].count;
-                        if (A.length() == 0) continue;
-                    } else if (pull[1].from < pull[1].to) {
-                        B.end -= pull[1].count;
-                        if (B.length() == 0) continue;
-                    }
+                // swap the first value of each A block with the value in buffer1
+                var indexA = buffer1.start;
+                index = firstA.end;
+                while (index < blockA.end) : ({
+                    indexA += 1;
+                    index += block_size;
+                }) {
+                    context.swap(indexA, index);
                 }
 
-                if (lessThan(context, items[B.end - 1], items[A.start])) {
-                    // the two ranges are in reverse order, so a simple rotation should fix it
-                    mem.rotate(T, items[A.start..B.end], A.length());
-                } else if (lessThan(context, items[A.end], items[A.end - 1])) {
-                    // these two ranges weren't already in order, so we'll need to merge them!
-                    var findA: usize = undefined;
+                // start rolling the A blocks through the B blocks!
+                // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well
+                var lastA = firstA;
+                var lastB = Range.init(0, 0);
+                var blockB = Range.init(B.start, B.start + @min(block_size, B.length()));
+                blockA.start += firstA.length();
+                indexA = buffer1.start;
 
-                    // break the remainder of A into blocks. firstA is the uneven-sized first A block
-                    var blockA = Range.init(A.start, A.end);
-                    var firstA = Range.init(A.start, A.start + blockA.length() % block_size);
+                // if the second buffer is available, block swap the contents into that
+                if (buffer2.length() > 0) {
+                    blockSwap(lastA.start, buffer2.start, lastA.length(), wrapped_context);
+                }
 
-                    // swap the first value of each A block with the value in buffer1
-                    var indexA = buffer1.start;
-                    index = firstA.end;
-                    while (index < blockA.end) : ({
-                        indexA += 1;
-                        index += block_size;
-                    }) {
-                        mem.swap(T, &items[indexA], &items[index]);
-                    }
+                if (blockA.length() > 0) {
+                    while (true) {
+                        // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block,
+                        // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks.
+                        if ((lastB.length() > 0 and !wrapped_context.lessThan(a + lastB.end - 1, indexA)) or blockB.length() == 0) {
+                            // figure out where to split the previous B block, and rotate it at the split
+                            const B_split = binaryFirst(indexA, lastB, wrapped_context);
+                            const B_remaining = lastB.end - B_split;
 
-                    // start rolling the A blocks through the B blocks!
-                    // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well
-                    var lastA = firstA;
-                    var lastB = Range.init(0, 0);
-                    var blockB = Range.init(B.start, B.start + @min(block_size, B.length()));
-                    blockA.start += firstA.length();
-                    indexA = buffer1.start;
-
-                    // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it
-                    // otherwise, if the second buffer is available, block swap the contents into that
-                    if (lastA.length() <= cache.len) {
-                        const last_a_items = items[lastA.start..lastA.end];
-                        @memcpy(cache[0..last_a_items.len], last_a_items);
-                    } else if (buffer2.length() > 0) {
-                        blockSwap(T, items, lastA.start, buffer2.start, lastA.length());
-                    }
-
-                    if (blockA.length() > 0) {
-                        while (true) {
-                            // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block,
-                            // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks.
-                            if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) {
-                                // figure out where to split the previous B block, and rotate it at the split
-                                const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan);
-                                const B_remaining = lastB.end - B_split;
-
-                                // swap the minimum A block to the beginning of the rolling A blocks
-                                var minA = blockA.start;
-                                findA = minA + block_size;
-                                while (findA < blockA.end) : (findA += block_size) {
-                                    if (lessThan(context, items[findA], items[minA])) {
-                                        minA = findA;
-                                    }
+                            // swap the minimum A block to the beginning of the rolling A blocks
+                            var minA = blockA.start;
+                            findA = minA + block_size;
+                            while (findA < blockA.end) : (findA += block_size) {
+                                if (wrapped_context.lessThan(findA, minA)) {
+                                    minA = findA;
                                 }
-                                blockSwap(T, items, blockA.start, minA, block_size);
+                            }
+                            blockSwap(blockA.start, minA, block_size, wrapped_context);
 
-                                // swap the first item of the previous A block back with its original value, which is stored in buffer1
-                                mem.swap(T, &items[blockA.start], &items[indexA]);
-                                indexA += 1;
+                            // swap the first item of the previous A block back with its original value, which is stored in buffer1
+                            context.swap(blockA.start, indexA);
+                            indexA += 1;
 
-                                // locally merge the previous A block with the B values that follow it
-                                // if lastA fits into the external cache we'll use that (with MergeExternal),
-                                // or if the second internal buffer exists we'll use that (with MergeInternal),
-                                // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)
+                            // locally merge the previous A block with the B values that follow it
+                            // if lastA fits into the second internal buffer exists we'll use that (with MergeInternal),
+                            // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)
 
-                                if (lastA.length() <= cache.len) {
-                                    mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), cache[0..], context, lessThan);
-                                } else if (buffer2.length() > 0) {
-                                    mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), buffer2, context, lessThan);
-                                } else {
-                                    mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan);
-                                }
-
-                                if (buffer2.length() > 0 or block_size <= cache.len) {
-                                    // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway
-                                    if (block_size <= cache.len) {
-                                        @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]);
-                                    } else {
-                                        blockSwap(T, items, blockA.start, buffer2.start, block_size);
-                                    }
-
-                                    // this is equivalent to rotating, but faster
-                                    // the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it
-                                    // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs
-                                    blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining);
-                                } else {
-                                    // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation
-                                    mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split);
-                                }
-
-                                // update the range for the remaining A blocks, and the range remaining from the B block after it was split
-                                lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
-                                lastB = Range.init(lastA.end, lastA.end + B_remaining);
-
-                                // if there are no more A blocks remaining, this step is finished!
-                                blockA.start += block_size;
-                                if (blockA.length() == 0) break;
-                            } else if (blockB.length() < block_size) {
-                                // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation
-                                // the cache is disabled here since it might contain the contents of the previous A block
-                                mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start);
-
-                                lastB = Range.init(blockA.start, blockA.start + blockB.length());
-                                blockA.start += blockB.length();
-                                blockA.end += blockB.length();
-                                blockB.end = blockB.start;
+                            if (buffer2.length() > 0) {
+                                mergeInternal(lastA, Range.init(lastA.end, B_split), buffer2, wrapped_context);
                             } else {
-                                // roll the leftmost A block to the end by swapping it with the next B block
-                                blockSwap(T, items, blockA.start, blockB.start, block_size);
-                                lastB = Range.init(blockA.start, blockA.start + block_size);
+                                mergeInPlace(lastA, Range.init(lastA.end, B_split), wrapped_context);
+                            }
 
-                                blockA.start += block_size;
-                                blockA.end += block_size;
-                                blockB.start += block_size;
+                            if (buffer2.length() > 0) {
+                                // copy the previous A block into the buffer2, since that's where we need it to be when we go to merge it anyway
+                                blockSwap(blockA.start, buffer2.start, block_size, wrapped_context);
 
-                                if (blockB.end > B.end - block_size) {
-                                    blockB.end = B.end;
-                                } else {
-                                    blockB.end += block_size;
-                                }
+                                // this is equivalent to rotating, but faster
+                                // the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it
+                                // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs
+                                blockSwap(B_split, blockA.start + block_size - B_remaining, B_remaining, wrapped_context);
+                            } else {
+                                // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation
+                                wrapped_context.rotate(Range.init(B_split, blockA.start + block_size), blockA.start - B_split);
+                            }
+
+                            // update the range for the remaining A blocks, and the range remaining from the B block after it was split
+                            lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
+                            lastB = Range.init(lastA.end, lastA.end + B_remaining);
+
+                            // if there are no more A blocks remaining, this step is finished!
+                            blockA.start += block_size;
+                            if (blockA.length() == 0) break;
+                        } else if (blockB.length() < block_size) {
+                            // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation
+                            wrapped_context.rotate(Range.init(blockA.start, blockB.end), blockB.start - blockA.start);
+
+                            lastB = Range.init(blockA.start, blockA.start + blockB.length());
+                            blockA.start += blockB.length();
+                            blockA.end += blockB.length();
+                            blockB.end = blockB.start;
+                        } else {
+                            // roll the leftmost A block to the end by swapping it with the next B block
+                            blockSwap(blockA.start, blockB.start, block_size, wrapped_context);
+                            lastB = Range.init(blockA.start, blockA.start + block_size);
+
+                            blockA.start += block_size;
+                            blockA.end += block_size;
+                            blockB.start += block_size;
+
+                            if (blockB.end > B.end - block_size) {
+                                blockB.end = B.end;
+                            } else {
+                                blockB.end += block_size;
                             }
                         }
                     }
+                }
 
-                    // merge the last A block with the remaining B values
-                    if (lastA.length() <= cache.len) {
-                        mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), cache[0..], context, lessThan);
-                    } else if (buffer2.length() > 0) {
-                        mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), buffer2, context, lessThan);
-                    } else {
-                        mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan);
-                    }
+                // merge the last A block with the remaining B values
+                if (buffer2.length() > 0) {
+                    mergeInternal(lastA, Range.init(lastA.end, B.end), buffer2, wrapped_context);
+                } else {
+                    mergeInPlace(lastA, Range.init(lastA.end, B.end), wrapped_context);
                 }
             }
+        }
 
-            // when we're finished with this merge step we should have the one
-            // or two internal buffers left over, where the second buffer is all jumbled up
-            // insertion sort the second buffer, then redistribute the buffers
-            // back into the items using the opposite process used for creating the buffer
+        // when we're finished with this merge step we should have the one
+        // or two internal buffers left over, where the second buffer is all jumbled up
+        // insertion sort the second buffer, then redistribute the buffers
+        // back into the items using the opposite process used for creating the buffer
 
-            // while an unstable sort like quicksort could be applied here, in benchmarks
-            // it was consistently slightly slower than a simple insertion sort,
-            // even for tens of millions of items. this may be because insertion
-            // sort is quite fast when the data is already somewhat sorted, like it is here
-            sort.insertion(T, items[buffer2.start..buffer2.end], context, lessThan);
+        // while an unstable sort like quicksort could be applied here, in benchmarks
+        // it was consistently slightly slower than a simple insertion sort,
+        // even for tens of millions of items. this may be because insertion
+        // sort is quite fast when the data is already somewhat sorted, like it is here
+        sort.insertionContext(buffer2.start, buffer2.end, wrapped_context);
 
-            pull_index = 0;
-            while (pull_index < 2) : (pull_index += 1) {
-                var unique = pull[pull_index].count * 2;
-                if (pull[pull_index].from > pull[pull_index].to) {
-                    // the values were pulled out to the left, so redistribute them back to the right
-                    var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
-                    while (buffer.length() > 0) {
-                        index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), unique, context, lessThan);
-                        const amount = index - buffer.end;
-                        mem.rotate(T, items[buffer.start..index], buffer.length());
-                        buffer.start += (amount + 1);
-                        buffer.end += amount;
-                        unique -= 2;
-                    }
-                } else if (pull[pull_index].from < pull[pull_index].to) {
-                    // the values were pulled out to the right, so redistribute them back to the left
-                    var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
-                    while (buffer.length() > 0) {
-                        index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), unique, context, lessThan);
-                        const amount = buffer.start - index;
-                        mem.rotate(T, items[index..buffer.end], amount);
-                        buffer.start -= amount;
-                        buffer.end -= (amount + 1);
-                        unique -= 2;
-                    }
+        pull_index = 0;
+        while (pull_index < 2) : (pull_index += 1) {
+            var unique = pull[pull_index].count * 2;
+            if (pull[pull_index].from > pull[pull_index].to) {
+                // the values were pulled out to the left, so redistribute them back to the right
+                var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
+                while (buffer.length() > 0) {
+                    index = findFirstForward(buffer.start, Range.init(buffer.end, pull[pull_index].range.end), unique, wrapped_context);
+                    const amount = index - buffer.end;
+                    wrapped_context.rotate(Range.init(buffer.start, index), buffer.length());
+                    buffer.start += (amount + 1);
+                    buffer.end += amount;
+                    unique -= 2;
+                }
+            } else if (pull[pull_index].from < pull[pull_index].to) {
+                // the values were pulled out to the right, so redistribute them back to the left
+                var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
+                while (buffer.length() > 0) {
+                    index = findLastBackward(buffer.end - 1, Range.init(pull[pull_index].range.start, buffer.start), unique, wrapped_context);
+                    const amount = buffer.start - index;
+                    wrapped_context.rotate(Range.init(index, buffer.end), amount);
+                    buffer.start -= amount;
+                    buffer.end -= (amount + 1);
+                    unique -= 2;
                 }
             }
         }
@@ -756,12 +700,9 @@ pub fn block(
 }
 // merge operation without a buffer
 fn mergeInPlace(
-    comptime T: type,
-    items: []T,
     A_arg: Range,
     B_arg: Range,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) void {
     if (A_arg.length() == 0 or B_arg.length() == 0) return;
 
@@ -788,30 +729,27 @@ fn mergeInPlace(
 
     while (true) {
         // find the first place in B where the first item in A needs to be inserted
-        const mid = binaryFirst(T, items, items[A.start], B, context, lessThan);
+        const mid = binaryFirst(A.start, B, context);
 
         // rotate A into place
         const amount = mid - A.end;
-        mem.rotate(T, items[A.start..mid], A.length());
+        context.rotate(Range.init(A.start, mid), A.length());
         if (B.end == mid) break;
 
         // calculate the new A and B ranges
         B.start = mid;
         A = Range.init(A.start + amount, B.start);
-        A.start = binaryLast(T, items, items[A.start], A, context, lessThan);
+        A.start = binaryLast(A.start, A, context);
         if (A.length() == 0) break;
     }
 }
 
 // merge operation using an internal buffer
 fn mergeInternal(
-    comptime T: type,
-    items: []T,
     A: Range,
     B: Range,
     buffer: Range,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) void {
     // whenever we find a value to add to the final array, swap it with the value that's already in that spot
     // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order
@@ -821,13 +759,13 @@ fn mergeInternal(
 
     if (B.length() > 0 and A.length() > 0) {
         while (true) {
-            if (!lessThan(context, items[B.start + B_count], items[buffer.start + A_count])) {
-                mem.swap(T, &items[A.start + insert], &items[buffer.start + A_count]);
+            if (!context.lessThan(B.start + B_count, buffer.start + A_count)) {
+                context.swap(A.start + insert, buffer.start + A_count);
                 A_count += 1;
                 insert += 1;
                 if (A_count >= A.length()) break;
             } else {
-                mem.swap(T, &items[A.start + insert], &items[B.start + B_count]);
+                context.swap(A.start + insert, B.start + B_count);
                 B_count += 1;
                 insert += 1;
                 if (B_count >= B.length()) break;
@@ -836,113 +774,97 @@ fn mergeInternal(
     }
 
     // swap the remainder of A into the final array
-    blockSwap(T, items, buffer.start + A_count, A.start + insert, A.length() - A_count);
+    blockSwap(buffer.start + A_count, A.start + insert, A.length() - A_count, context);
 }
 
-fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void {
+fn blockSwap(start1: usize, start2: usize, block_size: usize, context: anytype) void {
     var index: usize = 0;
     while (index < block_size) : (index += 1) {
-        mem.swap(T, &items[start1 + index], &items[start2 + index]);
+        context.swap(start1 + index, start2 + index);
     }
 }
 
 // combine a linear search with a binary search to reduce the number of comparisons in situations
 // where have some idea as to how many unique values there are and where the next value might be
 fn findFirstForward(
-    comptime T: type,
-    items: []T,
-    value: T,
+    value_index: usize,
     range: Range,
     unique: usize,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) usize {
-    if (range.length() == 0) return range.start;
     const skip = @max(range.length() / unique, @as(usize, 1));
 
     var index = range.start + skip;
-    while (lessThan(context, items[index - 1], value)) : (index += skip) {
+    while (context.lessThan(index - 1, value_index)) : (index += skip) {
         if (index >= range.end - skip) {
-            return binaryFirst(T, items, value, Range.init(index, range.end), context, lessThan);
+            return binaryFirst(value_index, Range.init(index, range.end), context);
         }
     }
 
-    return binaryFirst(T, items, value, Range.init(index - skip, index), context, lessThan);
+    return binaryFirst(value_index, Range.init(index - skip, index), context);
 }
 
 fn findFirstBackward(
-    comptime T: type,
-    items: []T,
-    value: T,
+    value_index: usize,
     range: Range,
     unique: usize,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) usize {
     if (range.length() == 0) return range.start;
     const skip = @max(range.length() / unique, @as(usize, 1));
 
     var index = range.end - skip;
-    while (index > range.start and !lessThan(context, items[index - 1], value)) : (index -= skip) {
+    while (index > range.start and !context.lessThan(index - 1, value_index)) : (index -= skip) {
         if (index < range.start + skip) {
-            return binaryFirst(T, items, value, Range.init(range.start, index), context, lessThan);
+            return binaryFirst(value_index, Range.init(range.start, index), context);
         }
     }
 
-    return binaryFirst(T, items, value, Range.init(index, index + skip), context, lessThan);
+    return binaryFirst(value_index, Range.init(index, index + skip), context);
 }
 
 fn findLastForward(
-    comptime T: type,
-    items: []T,
-    value: T,
+    value_index: usize,
     range: Range,
     unique: usize,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) usize {
     if (range.length() == 0) return range.start;
     const skip = @max(range.length() / unique, @as(usize, 1));
 
     var index = range.start + skip;
-    while (!lessThan(context, value, items[index - 1])) : (index += skip) {
+    while (!context.lessThan(value_index, index - 1)) : (index += skip) {
         if (index >= range.end - skip) {
-            return binaryLast(T, items, value, Range.init(index, range.end), context, lessThan);
+            return binaryLast(value_index, Range.init(index, range.end), context);
         }
     }
 
-    return binaryLast(T, items, value, Range.init(index - skip, index), context, lessThan);
+    return binaryLast(value_index, Range.init(index - skip, index), context);
 }
 
 fn findLastBackward(
-    comptime T: type,
-    items: []T,
-    value: T,
+    value_index: usize,
     range: Range,
     unique: usize,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) usize {
     if (range.length() == 0) return range.start;
     const skip = @max(range.length() / unique, @as(usize, 1));
 
     var index = range.end - skip;
-    while (index > range.start and lessThan(context, value, items[index - 1])) : (index -= skip) {
+    while (index > range.start and context.lessThan(value_index, index - 1)) : (index -= skip) {
         if (index < range.start + skip) {
-            return binaryLast(T, items, value, Range.init(range.start, index), context, lessThan);
+            return binaryLast(value_index, Range.init(range.start, index), context);
         }
     }
 
-    return binaryLast(T, items, value, Range.init(index, index + skip), context, lessThan);
+    return binaryLast(value_index, Range.init(index, index + skip), context);
 }
 
 fn binaryFirst(
-    comptime T: type,
-    items: []T,
-    value: T,
+    value_index: usize,
     range: Range,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) usize {
     var curr = range.start;
     var size = range.length();
@@ -951,8 +873,8 @@ fn binaryFirst(
         const offset = size % 2;
 
         size /= 2;
-        const mid_item = items[curr + size];
-        if (lessThan(context, mid_item, value)) {
+        const mid_index = curr + size;
+        if (context.lessThan(mid_index, value_index)) {
             curr += size + offset;
         }
     }
@@ -960,12 +882,9 @@ fn binaryFirst(
 }
 
 fn binaryLast(
-    comptime T: type,
-    items: []T,
-    value: T,
+    value_index: usize,
     range: Range,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) usize {
     var curr = range.start;
     var size = range.length();
@@ -974,102 +893,23 @@ fn binaryLast(
         const offset = size % 2;
 
         size /= 2;
-        const mid_item = items[curr + size];
-        if (!lessThan(context, value, mid_item)) {
+        const mid_index = curr + size;
+        if (!context.lessThan(value_index, mid_index)) {
             curr += size + offset;
         }
     }
     return curr;
 }
 
-fn mergeInto(
-    comptime T: type,
-    from: []T,
-    A: Range,
-    B: Range,
-    into: []T,
-    context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
-) void {
-    var A_index: usize = A.start;
-    var B_index: usize = B.start;
-    const A_last = A.end;
-    const B_last = B.end;
-    var insert_index: usize = 0;
-
-    while (true) {
-        if (!lessThan(context, from[B_index], from[A_index])) {
-            into[insert_index] = from[A_index];
-            A_index += 1;
-            insert_index += 1;
-            if (A_index == A_last) {
-                // copy the remainder of B into the final array
-                const from_b = from[B_index..B_last];
-                @memcpy(into[insert_index..][0..from_b.len], from_b);
-                break;
-            }
-        } else {
-            into[insert_index] = from[B_index];
-            B_index += 1;
-            insert_index += 1;
-            if (B_index == B_last) {
-                // copy the remainder of A into the final array
-                const from_a = from[A_index..A_last];
-                @memcpy(into[insert_index..][0..from_a.len], from_a);
-                break;
-            }
-        }
-    }
-}
-
-fn mergeExternal(
-    comptime T: type,
-    items: []T,
-    A: Range,
-    B: Range,
-    cache: []T,
-    context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
-) void {
-    // A fits into the cache, so use that instead of the internal buffer
-    var A_index: usize = 0;
-    var B_index: usize = B.start;
-    var insert_index: usize = A.start;
-    const A_last = A.length();
-    const B_last = B.end;
-
-    if (B.length() > 0 and A.length() > 0) {
-        while (true) {
-            if (!lessThan(context, items[B_index], cache[A_index])) {
-                items[insert_index] = cache[A_index];
-                A_index += 1;
-                insert_index += 1;
-                if (A_index == A_last) break;
-            } else {
-                items[insert_index] = items[B_index];
-                B_index += 1;
-                insert_index += 1;
-                if (B_index == B_last) break;
-            }
-        }
-    }
-
-    // copy the remainder of A into the final array
-    const cache_a = cache[A_index..A_last];
-    @memcpy(items[insert_index..][0..cache_a.len], cache_a);
-}
-
 fn swap(
-    comptime T: type,
-    items: []T,
     order: *[8]u8,
+    start_index: usize,
     x: usize,
     y: usize,
     context: anytype,
-    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
 ) void {
-    if (lessThan(context, items[y], items[x]) or ((order.*)[x] > (order.*)[y] and !lessThan(context, items[x], items[y]))) {
-        mem.swap(T, &items[x], &items[y]);
+    if (context.lessThan(start_index + y, start_index + x) or ((order.*)[x] > (order.*)[y] and !context.lessThan(start_index + x, start_index + y))) {
+        context.swap(start_index + x, start_index + y);
         mem.swap(u8, &(order.*)[x], &(order.*)[y]);
     }
 }