mirror of
https://codeberg.org/ziglang/zig.git
synced 2025-12-06 05:44:20 +00:00
crypto - threaded K12: separate context computation from thread spawning (#25793)
* threaded K12: separate context computation from thread spawning. Compute all contexts and store them in a pre-allocated array, then spawn threads using the pre-computed contexts. This ensures each context is fully materialized in memory with the correct values before any thread tries to access it. * kt128: unroll the permutation rounds only twice. This appears to deliver the best performance thanks to improved cache utilization, and it’s consistent with what we already do for SHA3.
This commit is contained in:
parent
afdd04356c
commit
ee4df4ad3e
1 changed file with 106 additions and 95 deletions
|
|
@ -230,7 +230,9 @@ fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
|
||||||
break :blk offsets;
|
break :blk offsets;
|
||||||
};
|
};
|
||||||
|
|
||||||
inline for (RC) |rc| {
|
var round: usize = 0;
|
||||||
|
while (round < 12) : (round += 2) {
|
||||||
|
inline for (0..2) |i| {
|
||||||
// θ (theta)
|
// θ (theta)
|
||||||
var C: [5]@Vector(N, u64) = undefined;
|
var C: [5]@Vector(N, u64) = undefined;
|
||||||
inline for (0..5) |x| {
|
inline for (0..5) |x| {
|
||||||
|
|
@ -280,10 +282,11 @@ fn keccakP1600timesN(comptime N: usize, states: *[5][5]@Vector(N, u64)) void {
|
||||||
}
|
}
|
||||||
|
|
||||||
// ι (iota)
|
// ι (iota)
|
||||||
const rc_splat: @Vector(N, u64) = @splat(rc);
|
const rc_splat: @Vector(N, u64) = @splat(RC[round + i]);
|
||||||
states[0][0] ^= rc_splat;
|
states[0][0] ^= rc_splat;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Add lanes from data to N states in parallel with stride - optimized version
|
/// Add lanes from data to N states in parallel with stride - optimized version
|
||||||
fn addLanesAll(
|
fn addLanesAll(
|
||||||
|
|
@ -323,7 +326,9 @@ fn keccakP(state: *[200]u8) void {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apply 12 rounds
|
// Apply 12 rounds
|
||||||
inline for (RC) |rc| {
|
var round: usize = 0;
|
||||||
|
while (round < 12) : (round += 2) {
|
||||||
|
inline for (0..2) |i| {
|
||||||
// θ
|
// θ
|
||||||
var C: [5]u64 = undefined;
|
var C: [5]u64 = undefined;
|
||||||
inline for (0..5) |x| {
|
inline for (0..5) |x| {
|
||||||
|
|
@ -362,7 +367,8 @@ fn keccakP(state: *[200]u8) void {
|
||||||
}
|
}
|
||||||
|
|
||||||
// ι
|
// ι
|
||||||
lanes[0][0] ^= rc;
|
lanes[0][0] ^= RC[round + i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store lanes back to state
|
// Store lanes back to state
|
||||||
|
|
@ -759,32 +765,37 @@ fn ktMultiThreaded(
|
||||||
const all_scratch = try allocator.alloc(u8, thread_count * scratch_size);
|
const all_scratch = try allocator.alloc(u8, thread_count * scratch_size);
|
||||||
defer allocator.free(all_scratch);
|
defer allocator.free(all_scratch);
|
||||||
|
|
||||||
var group: Io.Group = .init;
|
const contexts = try allocator.alloc(LeafBatchContext, thread_count);
|
||||||
|
defer allocator.free(contexts);
|
||||||
|
|
||||||
var leaves_assigned: usize = 0;
|
var leaves_assigned: usize = 0;
|
||||||
var thread_idx: usize = 0;
|
var context_count: usize = 0;
|
||||||
|
|
||||||
while (leaves_assigned < total_leaves) {
|
while (leaves_assigned < total_leaves) {
|
||||||
const batch_count = @min(leaves_per_thread, total_leaves - leaves_assigned);
|
const batch_count = @min(leaves_per_thread, total_leaves - leaves_assigned);
|
||||||
const batch_start = chunk_size + leaves_assigned * chunk_size;
|
const batch_start = chunk_size + leaves_assigned * chunk_size;
|
||||||
const cvs_offset = leaves_assigned * cv_size;
|
const cvs_offset = leaves_assigned * cv_size;
|
||||||
|
|
||||||
const ctx = LeafBatchContext{
|
contexts[context_count] = LeafBatchContext{
|
||||||
.output_cvs = cvs[cvs_offset .. cvs_offset + batch_count * cv_size],
|
.output_cvs = cvs[cvs_offset .. cvs_offset + batch_count * cv_size],
|
||||||
.batch_start = batch_start,
|
.batch_start = batch_start,
|
||||||
.batch_count = batch_count,
|
.batch_count = batch_count,
|
||||||
.view = view,
|
.view = view,
|
||||||
.scratch_buffer = all_scratch[thread_idx * scratch_size .. (thread_idx + 1) * scratch_size],
|
.scratch_buffer = all_scratch[context_count * scratch_size .. (context_count + 1) * scratch_size],
|
||||||
.total_len = total_len,
|
.total_len = total_len,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
leaves_assigned += batch_count;
|
||||||
|
context_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
var group: Io.Group = .init;
|
||||||
|
for (contexts[0..context_count]) |ctx| {
|
||||||
group.async(io, struct {
|
group.async(io, struct {
|
||||||
fn process(c: LeafBatchContext) void {
|
fn process(c: LeafBatchContext) void {
|
||||||
processLeafBatch(Variant, c);
|
processLeafBatch(Variant, c);
|
||||||
}
|
}
|
||||||
}.process, .{ctx});
|
}.process, .{ctx});
|
||||||
|
|
||||||
leaves_assigned += batch_count;
|
|
||||||
thread_idx += 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for all threads to complete
|
// Wait for all threads to complete
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue