Mirror of https://codeberg.org/ziglang/zig.git, synced 2025-12-06 13:54:21 +00:00
crypto.blake3: sequentially process larger small tree layers (#26046)
Improves performance by spawning fewer threads.
parent e23af9d31d
commit 5f73c01368
1 changed file with 52 additions and 8 deletions
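The change hinges on how quickly BLAKE3's merkle tree layers shrink: each layer halves the number of parent nodes, so even very large inputs have only a handful of layers above any fixed cutoff, and everything at or below the cutoff is cheaper to hash sequentially with SIMD than to farm out to threads. A minimal standalone sketch of that arithmetic (an editor's illustration, not code from this commit; the 1 GiB input is hypothetical, the 1024-byte chunk size and 1024-parent cutoff come from the diff below, and it assumes an odd trailing CV carries up to the next layer unchanged):

const std = @import("std");

pub fn main() void {
    const chunk_length = 1024; // BLAKE3 chunk size in bytes
    var num_cvs: u64 = (1 << 30) / chunk_length; // one chaining value per chunk of a 1 GiB input
    var layer: u32 = 0;
    while (num_cvs > 1) : (layer += 1) {
        const num_parents = num_cvs / 2;
        // Mirrors the new cutoff below: layers of <= 1024 parents stay sequential.
        const spawns_threads = num_parents > 1024;
        std.debug.print("layer {d}: {d} parents, parallel: {}\n", .{ layer, num_parents, spawns_threads });
        num_cvs = num_parents + (num_cvs & 1); // an odd trailing CV moves up unchanged
    }
}

For a 1 GiB input this prints twenty layers, of which only the first nine (524288 down to 2048 parents) cross the threshold and spawn workers; under the old cutoff of 16, fifteen of the twenty layers would have paid thread-spawn overhead.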
@@ -685,9 +685,9 @@ const ChunkBatch = struct {
     while (chunk_idx < ctx.end_chunk) {
         const remaining = ctx.end_chunk - chunk_idx;
-        const batch_size = @min(remaining, max_simd_degree);
+        const batch_size: usize = @min(remaining, max_simd_degree);
         const offset = chunk_idx * chunk_length;
-        const batch_len = @as(usize, batch_size) * chunk_length;
+        const batch_len = batch_size * chunk_length;
 
         const num_cvs = compressChunksParallel(
             ctx.input[offset..][0..batch_len],
@@ -723,6 +723,44 @@ fn processParentBatch(ctx: ParentBatchContext) void {
     }
 }
 
+fn processParentBatchSIMD(ctx: ParentBatchContext) void {
+    const num_parents = ctx.end_idx - ctx.start_idx;
+    if (num_parents == 0) return;
+
+    // Convert input CVs to bytes for SIMD processing
+    var input_bytes: [max_simd_degree * 2 * Blake3.digest_length]u8 = undefined;
+    var output_bytes: [max_simd_degree * Blake3.digest_length]u8 = undefined;
+    var parents_array: [max_simd_degree][*]const u8 = undefined;
+
+    var processed: usize = 0;
+    while (processed < num_parents) {
+        const batch_size: usize = @min(num_parents - processed, max_simd_degree);
+
+        // Convert CV pairs to byte blocks for this batch
+        for (0..batch_size) |i| {
+            const pair_idx = ctx.start_idx + processed + i;
+            const left_cv = ctx.input_cvs[pair_idx * 2];
+            const right_cv = ctx.input_cvs[pair_idx * 2 + 1];
+
+            // Write left CV || right CV to form 64-byte parent block
+            for (0..8) |j| {
+                store32(input_bytes[i * 64 + j * 4 ..][0..4], left_cv[j]);
+                store32(input_bytes[i * 64 + 32 + j * 4 ..][0..4], right_cv[j]);
+            }
+            parents_array[i] = input_bytes[i * 64 ..].ptr;
+        }
+
+        hashMany(parents_array[0..batch_size], batch_size, 1, ctx.key, 0, false, ctx.flags.with(.{ .parent = true }), .{}, .{}, output_bytes[0 .. batch_size * Blake3.digest_length]);
+
+        for (0..batch_size) |i| {
+            const output_idx = ctx.start_idx + processed + i;
+            ctx.output_cvs[output_idx] = loadCvWords(output_bytes[i * Blake3.digest_length ..][0..Blake3.digest_length].*);
+        }
+
+        processed += batch_size;
+    }
+}
+
 fn buildMerkleTreeLayerParallel(
     input_cvs: [][8]u32,
     output_cvs: [][8]u32,
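For reference, the inner packing loop above as a standalone function (an editor's sketch; packParentBlock is a hypothetical name, and it assumes store32 writes words little-endian, which is what the BLAKE3 specification requires):

const std = @import("std");

/// Serialize two 8-word chaining values into one 64-byte parent block
/// (left CV || right CV), words little-endian.
fn packParentBlock(left_cv: [8]u32, right_cv: [8]u32) [64]u8 {
    var block: [64]u8 = undefined;
    for (0..8) |j| {
        std.mem.writeInt(u32, block[j * 4 ..][0..4], left_cv[j], .little);
        std.mem.writeInt(u32, block[32 + j * 4 ..][0..4], right_cv[j], .little);
    }
    return block;
}

test packParentBlock {
    const left: [8]u32 = .{ 0x01020304, 0, 0, 0, 0, 0, 0, 0 };
    const right: [8]u32 = .{0} ** 8;
    const block = packParentBlock(left, right);
    try std.testing.expectEqual(@as(u8, 0x04), block[0]); // least-significant byte first
    try std.testing.expectEqual(@as(u8, 0x01), block[3]);
    try std.testing.expectEqual(@as(u8, 0x00), block[32]); // right CV starts at offset 32
}

In the diff itself the blocks are written straight into a stack buffer and addressed through parents_array, so one hashMany call can compress up to max_simd_degree parent blocks at once.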
@@ -732,11 +770,17 @@ fn buildMerkleTreeLayerParallel(
 ) void {
     const num_parents = input_cvs.len / 2;
 
-    if (num_parents <= 16) {
-        for (0..num_parents) |i| {
-            const output = parentOutputFromCvs(input_cvs[i * 2], input_cvs[i * 2 + 1], key, flags);
-            output_cvs[i] = output.chainingValue();
-        }
+    // Process sequentially with SIMD for smaller tree layers to avoid thread overhead
+    // Tree layers shrink quickly, so only parallelize the first few large layers
+    if (num_parents <= 1024) {
+        processParentBatchSIMD(ParentBatchContext{
+            .input_cvs = input_cvs,
+            .output_cvs = output_cvs,
+            .start_idx = 0,
+            .end_idx = num_parents,
+            .key = key,
+            .flags = flags,
+        });
         return;
     }
 
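The cutoff of 1024 trades thread-spawn overhead against lost parallelism: a layer of exactly 1024 parents needs at most 1024 / max_simd_degree sequential hashMany batches through processParentBatchSIMD, while larger layers are still split across workers in the code below. Worked through on a hypothetical 1 GiB input (2^20 chunks), the parent layers hold 2^19, 2^18, ... parents, so only the nine layers above 2^10 parents now spawn threads; the old cutoff of 16 parallelized fifteen of the twenty layers, including tiny ones where spawning a thread cost more than the hashing itself.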
@@ -748,7 +792,7 @@ fn buildMerkleTreeLayerParallel(
         const start_idx = worker_id * parents_per_worker;
         if (start_idx >= num_parents) break;
 
-        group.async(io, processParentBatch, .{ParentBatchContext{
+        group.async(io, processParentBatchSIMD, .{ParentBatchContext{
             .input_cvs = input_cvs,
             .output_cvs = output_cvs,
             .start_idx = start_idx,