crypto - threaded K12: separate context computation from thread spawning (#25793)

* threaded K12: separate context computation from thread spawning

Compute all contexts and store them in a pre-allocated array,
then spawn threads using the pre-computed contexts.

This ensures each context is fully materialized in memory with the
correct values before any thread tries to access it.
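
In sketch form (not the actual patch, which uses `LeafBatchContext` and the `Io.Group`/`group.async` API shown in the diff below): fill a heap-allocated array of contexts first, then spawn workers only once every entry is complete. `WorkerContext`, `worker`, and the batch numbers are made up for illustration, and plain `std.Thread` is used instead of the async I/O group.

```zig
const std = @import("std");

// Hypothetical stand-in for the real LeafBatchContext.
const WorkerContext = struct {
    start: usize,
    count: usize,
};

fn worker(ctx: WorkerContext) void {
    // The context was fully written out before this thread was spawned,
    // so it can be read here without further synchronization.
    std.debug.print("leaves {d}..{d}\n", .{ ctx.start, ctx.start + ctx.count });
}

pub fn main() !void {
    const allocator = std.heap.page_allocator;
    const thread_count = 4;

    // Phase 1: compute every context up front, in an array that
    // outlives all of the worker threads.
    const contexts = try allocator.alloc(WorkerContext, thread_count);
    defer allocator.free(contexts);
    for (contexts, 0..) |*ctx, i| {
        ctx.* = .{ .start = i * 16, .count = 16 };
    }

    // Phase 2: only now spawn the workers, each with a context that is
    // already fully materialized.
    const threads = try allocator.alloc(std.Thread, thread_count);
    defer allocator.free(threads);
    for (threads, contexts) |*t, ctx| {
        t.* = try std.Thread.spawn(.{}, worker, .{ctx});
    }
    for (threads) |t| t.join();
}
```

The important property is that nothing mutates a context after its thread has started, so a worker can never observe a partially initialized struct.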

* kt128: unroll the permutation rounds only twice

This appears to deliver the best performance thanks to improved cache
utilization, and it’s consistent with what we already do for SHA3.
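
As a rough illustration of the trade-off (a toy sketch, not the real permutation: `toyRound` and the constants below stand in for the θ/ρ/π/χ/ι steps and the Keccak round constants), compare full unrolling with unrolling by pairs:

```zig
const std = @import("std");

// Placeholder constants: the real code uses the 12 Keccak-p[1600,12]
// round constants.
const RC = [12]u64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };

// Placeholder round: the real code applies θ, ρ, π, χ and ι to a 5x5
// lane matrix. Only the loop structure around it matters here.
fn toyRound(state: *u64, rc: u64) void {
    state.* = std.math.rotl(u64, state.* ^ rc, 13);
}

// Fully unrolled: the compiler emits 12 copies of the round body.
fn permuteFullyUnrolled(state: *u64) void {
    inline for (RC) |rc| toyRound(state, rc);
}

// Unrolled only twice: a runtime loop over pairs of rounds, with just
// the two iterations of each pair expanded at compile time.
fn permuteUnrolledByTwo(state: *u64) void {
    var round: usize = 0;
    while (round < 12) : (round += 2) {
        inline for (0..2) |i| toyRound(state, RC[round + i]);
    }
}

pub fn main() void {
    var a: u64 = 0x0123456789abcdef;
    var b: u64 = a;
    permuteFullyUnrolled(&a);
    permuteUnrolledByTwo(&b);
    // Both orderings apply the same 12 rounds, so the results match.
    std.debug.print("0x{x} == 0x{x}\n", .{ a, b });
}
```

With `inline for` over all 12 constants the compiler emits 12 copies of the round body; the pair-wise version keeps only two copies in the hot loop, which is the cache-footprint improvement the commit message refers to.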
Author: Frank Denis, 2025-11-03 17:09:00 +01:00 (committed by GitHub)
Commit: ee4df4ad3e, parent afdd04356c


@@ -230,7 +230,9 @@ fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
         break :blk offsets;
     };
-    inline for (RC) |rc| {
+    var round: usize = 0;
+    while (round < 12) : (round += 2) {
+        inline for (0..2) |i| {
         // θ (theta)
         var C: [5]@Vector(N, u64) = undefined;
         inline for (0..5) |x| {
@@ -280,10 +282,11 @@ fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
         }
         // ι (iota)
-        const rc_splat: @Vector(N, u64) = @splat(rc);
+        const rc_splat: @Vector(N, u64) = @splat(RC[round + i]);
         states[0][0] ^= rc_splat;
     }
 }
+}
 
 /// Add lanes from data to N states in parallel with stride - optimized version
 fn addLanesAll(
@@ -323,7 +326,9 @@ fn keccakP(state: *[200]u8) void {
     }
     // Apply 12 rounds
-    inline for (RC) |rc| {
+    var round: usize = 0;
+    while (round < 12) : (round += 2) {
+        inline for (0..2) |i| {
         // θ
         var C: [5]u64 = undefined;
         inline for (0..5) |x| {
@@ -362,7 +367,8 @@ fn keccakP(state: *[200]u8) void {
         }
         // ι
-        lanes[0][0] ^= rc;
+        lanes[0][0] ^= RC[round + i];
+        }
     }
     // Store lanes back to state
@@ -759,32 +765,37 @@ fn ktMultiThreaded(
     const all_scratch = try allocator.alloc(u8, thread_count * scratch_size);
     defer allocator.free(all_scratch);
-    var group: Io.Group = .init;
+    const contexts = try allocator.alloc(LeafBatchContext, thread_count);
+    defer allocator.free(contexts);
     var leaves_assigned: usize = 0;
-    var thread_idx: usize = 0;
+    var context_count: usize = 0;
     while (leaves_assigned < total_leaves) {
         const batch_count = @min(leaves_per_thread, total_leaves - leaves_assigned);
         const batch_start = chunk_size + leaves_assigned * chunk_size;
         const cvs_offset = leaves_assigned * cv_size;
-        const ctx = LeafBatchContext{
+        contexts[context_count] = LeafBatchContext{
             .output_cvs = cvs[cvs_offset .. cvs_offset + batch_count * cv_size],
             .batch_start = batch_start,
             .batch_count = batch_count,
             .view = view,
-            .scratch_buffer = all_scratch[thread_idx * scratch_size .. (thread_idx + 1) * scratch_size],
+            .scratch_buffer = all_scratch[context_count * scratch_size .. (context_count + 1) * scratch_size],
             .total_len = total_len,
         };
+        leaves_assigned += batch_count;
+        context_count += 1;
+    }
+
+    var group: Io.Group = .init;
+    for (contexts[0..context_count]) |ctx| {
         group.async(io, struct {
             fn process(c: LeafBatchContext) void {
                 processLeafBatch(Variant, c);
             }
         }.process, .{ctx});
-        leaves_assigned += batch_count;
-        thread_idx += 1;
     }
     // Wait for all threads to complete