mirror of
https://codeberg.org/ziglang/zig.git
synced 2025-12-06 13:54:21 +00:00
crypto.ghash_polyval: use the R/F algorithm
A new algorithm for efficient GHASH and POLYVAL computation was just published: https://eprint.iacr.org/2025/2171.pdf. It provides a very nice speedup on AArch64 (more than 2x) and a ~10% speedup on Intel. The downside is a slight increase in memory usage and more precomputation, but this is generally worth it.
This commit is contained in:
parent
bfe3317059
commit
b6da733833
1 changed files with 104 additions and 34 deletions
|
|
@ -6,6 +6,15 @@ const mem = std.mem;
|
||||||
|
|
||||||
const Precomp = u128;
|
const Precomp = u128;
|
||||||
|
|
||||||
|
/// R/F precomputation for a single key power.
|
||||||
|
/// Based on "Efficient GHASH and POLYVAL Implementation" (Kurdi & Möller, 2025).
/// H1/H0 denote the high/low 64-bit words of the key power; D1/D0 are the
/// derived 64-bit words computed by `initRFKey`. Each field packs one key word
/// with one derived word so that `rfMul` can use a single 128-bit operand for
/// two carry-less multiplications.
|
||||||
|
const RFPrecomp = struct {
|
||||||
|
/// pack1[127:64] = H1, pack1[63:0] = D1 (for computing R)
/// Consumed by `rfMul` to produce its `r` output.
|
||||||
|
pack1: u128,
|
||||||
|
/// pack2[127:64] = H0, pack2[63:0] = D0 (for computing F)
/// Consumed by `rfMul` to produce its `f` output.
|
||||||
|
pack2: u128,
|
||||||
|
};
|
||||||
|
|
||||||
/// GHASH is a universal hash function that uses multiplication by a fixed
|
/// GHASH is a universal hash function that uses multiplication by a fixed
|
||||||
/// parameter within a Galois field.
|
/// parameter within a Galois field.
|
||||||
///
|
///
|
||||||
|
|
@ -43,7 +52,12 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
|
||||||
// 3 multiplications with extra shifts and additions.
|
// 3 multiplications with extra shifts and additions.
|
||||||
const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .schoolbook;
|
const mul_algorithm = if (builtin.cpu.arch == .x86) .karatsuba else .schoolbook;
|
||||||
|
|
||||||
|
// P1 = x^63 + x^62 + x^57 - the reduction polynomial helper for R/F algorithm.
|
||||||
|
// This is (x^127 + x^126 + x^121) >> 64, used for efficient reduction.
// Numerically this is 0xC200000000000000 (bits 63, 62 and 57 set).
// It is the fixed multiplier used by both `initRFKey` and `rfReduce`.
|
||||||
|
const p1: u64 = (@as(u64, 1) << 63) | (@as(u64, 1) << 62) | (@as(u64, 1) << 57);
|
||||||
|
|
||||||
hx: [pc_count]Precomp,
|
hx: [pc_count]Precomp,
|
||||||
|
rf_hx: [pc_count]RFPrecomp,
|
||||||
acc: u128 = 0,
|
acc: u128 = 0,
|
||||||
|
|
||||||
leftover: usize = 0,
|
leftover: usize = 0,
|
||||||
|
|
@ -58,27 +72,46 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
|
||||||
h = (h << 1) ^ carry;
|
h = (h << 1) ^ carry;
|
||||||
}
|
}
|
||||||
var hx: [pc_count]Precomp = undefined;
|
var hx: [pc_count]Precomp = undefined;
|
||||||
|
var rf_hx: [pc_count]RFPrecomp = undefined;
|
||||||
hx[0] = h;
|
hx[0] = h;
|
||||||
hx[1] = reduce(clsq128(hx[0])); // h^2
|
hx[1] = reduce(clsq128(hx[0])); // h^2
|
||||||
|
if (use_rf) {
|
||||||
|
rf_hx[0] = initRFKey(hx[0]);
|
||||||
|
rf_hx[1] = initRFKey(hx[1]);
|
||||||
|
}
|
||||||
|
|
||||||
if (builtin.mode != .ReleaseSmall) {
|
if (builtin.mode != .ReleaseSmall) {
|
||||||
hx[2] = reduce(clmul128(hx[1], h)); // h^3
|
hx[2] = reduce(clmul128(hx[1], h)); // h^3
|
||||||
hx[3] = reduce(clsq128(hx[1])); // h^4 = h^2^2
|
hx[3] = reduce(clsq128(hx[1])); // h^4 = h^2^2
|
||||||
|
if (use_rf) {
|
||||||
|
rf_hx[2] = initRFKey(hx[2]);
|
||||||
|
rf_hx[3] = initRFKey(hx[3]);
|
||||||
|
}
|
||||||
if (block_count >= agg_8_threshold) {
|
if (block_count >= agg_8_threshold) {
|
||||||
hx[4] = reduce(clmul128(hx[3], h)); // h^5
|
hx[4] = reduce(clmul128(hx[3], h)); // h^5
|
||||||
hx[5] = reduce(clsq128(hx[2])); // h^6 = h^3^2
|
hx[5] = reduce(clsq128(hx[2])); // h^6 = h^3^2
|
||||||
hx[6] = reduce(clmul128(hx[5], h)); // h^7
|
hx[6] = reduce(clmul128(hx[5], h)); // h^7
|
||||||
hx[7] = reduce(clsq128(hx[3])); // h^8 = h^4^2
|
hx[7] = reduce(clsq128(hx[3])); // h^8 = h^4^2
|
||||||
|
if (use_rf) {
|
||||||
|
rf_hx[4] = initRFKey(hx[4]);
|
||||||
|
rf_hx[5] = initRFKey(hx[5]);
|
||||||
|
rf_hx[6] = initRFKey(hx[6]);
|
||||||
|
rf_hx[7] = initRFKey(hx[7]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (block_count >= agg_16_threshold) {
|
if (block_count >= agg_16_threshold) {
|
||||||
var i: usize = 8;
|
var i: usize = 8;
|
||||||
while (i < 16) : (i += 2) {
|
while (i < 16) : (i += 2) {
|
||||||
hx[i] = reduce(clmul128(hx[i - 1], h));
|
hx[i] = reduce(clmul128(hx[i - 1], h));
|
||||||
hx[i + 1] = reduce(clsq128(hx[i / 2]));
|
hx[i + 1] = reduce(clsq128(hx[i / 2]));
|
||||||
|
if (use_rf) {
|
||||||
|
rf_hx[i] = initRFKey(hx[i]);
|
||||||
|
rf_hx[i + 1] = initRFKey(hx[i + 1]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return Self{ .hx = hx };
|
return Self{ .hx = hx, .rf_hx = rf_hx };
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Initialize the GHASH state with a key.
|
/// Initialize the GHASH state with a key.
|
||||||
|
|
@ -296,57 +329,94 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
|
||||||
break :impl clmulSoft;
|
break :impl clmulSoft;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Use R/F algorithm only when hardware CLMUL is available.
// Enabled on x86_64 when both `has_pclmul` and `has_avx` are set, and on
// aarch64 when `has_armaes` is set; the `stage2_c` backend is excluded on
// both architectures. When this is false the key setup leaves `rf_hx`
// unfilled and `aggregate` is called with `rf == false`, so it only reads `hx`.
|
||||||
|
const use_rf = (builtin.cpu.arch == .x86_64 and builtin.zig_backend != .stage2_c and has_pclmul and has_avx) or
|
||||||
|
(builtin.cpu.arch == .aarch64 and builtin.zig_backend != .stage2_c and has_armaes);
|
||||||
|
|
||||||
|
// Initialize R/F precomputation for a single key power.
|
||||||
|
// Computes D0, D1 values that enable efficient multiplication and reduction.
//
// `h` is one 128-bit key power (the key setup passes `hx[i]`).
// Returns the packed pair consumed by `rfMul`:
//   pack1 = H1 || D1 and pack2 = H0 || D0 (high word first).
|
||||||
|
fn initRFKey(h: u128) RFPrecomp {
|
||||||
|
// Split the key power into its low (h0) and high (h1) 64-bit words.
const h0: u64 = @truncate(h);
|
||||||
|
const h1: u64 = @truncate(h >> 64);
|
||||||
|
|
||||||
|
// C = H0 * P1 (128-bit product)
// NOTE(review): `.lo` presumably selects the low 64-bit half of each
// operand, so only h0 and p1 take part - confirm against `clmul`.
|
||||||
|
const c = clmul(@as(u128, h0), @as(u128, p1), .lo);
|
||||||
|
const c0: u64 = @truncate(c);
|
||||||
|
const c1: u64 = @truncate(c >> 64);
|
||||||
|
|
||||||
|
// D0 = C0 ^ H1, D1 = C1 ^ H0
|
||||||
|
const d0 = c0 ^ h1;
|
||||||
|
const d1 = c1 ^ h0;
|
||||||
|
|
||||||
|
// Key word goes in the upper 64 bits, derived word in the lower 64 bits;
// this layout is what `rfMul` relies on.
return .{
|
||||||
|
.pack1 = (@as(u128, h1) << 64) | d1,
|
||||||
|
.pack2 = (@as(u128, h0) << 64) | d0,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// R/F multiplication - returns intermediate R and F values.
|
||||||
|
// R = M0×D1 ⊕ M1×H1, F = M0×D0 ⊕ M1×H0
//
// `m` is a 128-bit message block (M0 = low word, M1 = high word per the
// formula above); `key` is the table entry produced by `initRFKey` for the
// key power paired with this block.
// The returned `r` and `f` are not reduced: `aggregate` XORs the values of
// several blocks together and passes the sums to `rfReduce` once.
// Costs four carry-less multiplications per block.
|
||||||
|
fn rfMul(m: u128, key: RFPrecomp) struct { r: u128, f: u128 } {
|
||||||
|
return .{
|
||||||
|
// M0×D1 (low halves) XOR M1×H1 (high halves) of `m` and `pack1`.
.r = clmul(m, key.pack1, .lo) ^ clmul(m, key.pack1, .hi),
|
||||||
|
// M0×D0 (low halves) XOR M1×H0 (high halves) of `m` and `pack2`.
.f = clmul(m, key.pack2, .lo) ^ clmul(m, key.pack2, .hi),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// R/F final reduction - uses only 1 clmul instead of 2.
|
||||||
|
// Result = R ⊕ F1 ⊕ (x^64×F0) ⊕ (P1×F0)
//
// `r` and `f` are the (possibly XOR-accumulated) outputs of `rfMul`.
// Returns the 128-bit value that `aggregate` hands back as the new accumulator.
|
||||||
|
fn rfReduce(r: u128, f: u128) u128 {
|
||||||
|
// F1 = high 64 bits of F, F0 = low 64 bits of F.
const f1: u64 = @truncate(f >> 64);
|
||||||
|
const f0: u64 = @truncate(f);
|
||||||
|
// The single carry-less multiplication of this step: F0 by the constant P1.
const pf = clmul(@as(u128, f0), @as(u128, p1), .lo);
|
||||||
|
// F1 lands in the low word, F0 is moved to the high word (x^64×F0).
return r ^ @as(u128, f1) ^ (@as(u128, f0) << 64) ^ pf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Aggregate N blocks and reduce.
//
// Reads `n` consecutive 16-byte blocks from the start of `msg`, so `msg` must
// hold at least `n * 16` bytes. The running accumulator `acc` is XORed into
// the first block. Block `j` is multiplied by the table entry at index
// `n - 1 - j`, so the earliest block is paired with the highest key power.
// The unreduced products are XORed together and reduced once.
// Returns the new accumulator.
//
// `rf` selects, at compile time, between the R/F tables (`st.rf_hx` with
// `rfMul`/`rfReduce`) and the plain key powers (`st.hx` with
// `clmul128`/`xor256`/`reduce`).
|
||||||
|
fn aggregate(comptime n: usize, comptime rf: bool, acc: u128, st: *const Self, msg: []const u8) u128 {
|
||||||
|
if (rf) {
|
||||||
|
// First block: fold in the accumulator before multiplying.
const rf0 = rfMul(acc ^ mem.readInt(u128, msg[0..16], endian), st.rf_hx[n - 1]);
|
||||||
|
var r = rf0.r;
|
||||||
|
var f = rf0.f;
|
||||||
|
// Remaining blocks: unrolled at compile time, accumulated by XOR.
inline for (1..n) |j| {
|
||||||
|
const rfj = rfMul(mem.readInt(u128, msg[j * 16 ..][0..16], endian), st.rf_hx[n - 1 - j]);
|
||||||
|
r ^= rfj.r;
|
||||||
|
f ^= rfj.f;
|
||||||
|
}
|
||||||
|
// One reduction for the whole batch.
return rfReduce(r, f);
|
||||||
|
} else {
|
||||||
|
// Same structure as above, using the plain key powers in `st.hx`.
var u = clmul128(acc ^ mem.readInt(u128, msg[0..16], endian), st.hx[n - 1]);
|
||||||
|
inline for (1..n) |j| {
|
||||||
|
xor256(&u, clmul128(mem.readInt(u128, msg[j * 16 ..][0..16], endian), st.hx[n - 1 - j]));
|
||||||
|
}
|
||||||
|
return reduce(u);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Process 16 byte blocks.
|
// Process 16 byte blocks.
|
||||||
fn blocks(st: *Self, msg: []const u8) void {
|
fn blocks(st: *Self, msg: []const u8) void {
|
||||||
assert(msg.len % 16 == 0); // GHASH blocks() expects full blocks
|
assert(msg.len % 16 == 0);
|
||||||
var acc = st.acc;
|
var acc = st.acc;
|
||||||
|
|
||||||
var i: usize = 0;
|
var i: usize = 0;
|
||||||
|
|
||||||
if (builtin.mode != .ReleaseSmall and msg.len >= agg_16_threshold * block_length) {
|
if (builtin.mode != .ReleaseSmall and msg.len >= agg_16_threshold * block_length) {
|
||||||
// 16-blocks aggregated reduction
|
|
||||||
while (i + 256 <= msg.len) : (i += 256) {
|
while (i + 256 <= msg.len) : (i += 256) {
|
||||||
var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[15 - 0]);
|
acc = aggregate(16, use_rf, acc, st, msg[i..]);
|
||||||
comptime var j = 1;
|
|
||||||
inline while (j < 16) : (j += 1) {
|
|
||||||
xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[15 - j]));
|
|
||||||
}
|
|
||||||
acc = reduce(u);
|
|
||||||
}
|
}
|
||||||
} else if (builtin.mode != .ReleaseSmall and msg.len >= agg_8_threshold * block_length) {
|
} else if (builtin.mode != .ReleaseSmall and msg.len >= agg_8_threshold * block_length) {
|
||||||
// 8-blocks aggregated reduction
|
|
||||||
while (i + 128 <= msg.len) : (i += 128) {
|
while (i + 128 <= msg.len) : (i += 128) {
|
||||||
var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[7 - 0]);
|
acc = aggregate(8, use_rf, acc, st, msg[i..]);
|
||||||
comptime var j = 1;
|
|
||||||
inline while (j < 8) : (j += 1) {
|
|
||||||
xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[7 - j]));
|
|
||||||
}
|
|
||||||
acc = reduce(u);
|
|
||||||
}
|
}
|
||||||
} else if (builtin.mode != .ReleaseSmall and msg.len >= agg_4_threshold * block_length) {
|
} else if (builtin.mode != .ReleaseSmall and msg.len >= agg_4_threshold * block_length) {
|
||||||
// 4-blocks aggregated reduction
|
|
||||||
while (i + 64 <= msg.len) : (i += 64) {
|
while (i + 64 <= msg.len) : (i += 64) {
|
||||||
var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[3 - 0]);
|
acc = aggregate(4, use_rf, acc, st, msg[i..]);
|
||||||
comptime var j = 1;
|
|
||||||
inline while (j < 4) : (j += 1) {
|
|
||||||
xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[3 - j]));
|
|
||||||
}
|
|
||||||
acc = reduce(u);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 2-blocks aggregated reduction
|
|
||||||
while (i + 32 <= msg.len) : (i += 32) {
|
while (i + 32 <= msg.len) : (i += 32) {
|
||||||
var u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[1 - 0]);
|
acc = aggregate(2, use_rf, acc, st, msg[i..]);
|
||||||
comptime var j = 1;
|
|
||||||
inline while (j < 2) : (j += 1) {
|
|
||||||
xor256(&u, clmul128(mem.readInt(u128, msg[i..][j * 16 ..][0..16], endian), st.hx[1 - j]));
|
|
||||||
}
|
|
||||||
acc = reduce(u);
|
|
||||||
}
|
}
|
||||||
// remaining blocks
|
|
||||||
if (i < msg.len) {
|
if (i < msg.len) {
|
||||||
const u = clmul128(acc ^ mem.readInt(u128, msg[i..][0..16], endian), st.hx[0]);
|
acc = aggregate(1, use_rf, acc, st, msg[i..]);
|
||||||
acc = reduce(u);
|
|
||||||
i += 16;
|
i += 16;
|
||||||
}
|
}
|
||||||
assert(i == msg.len);
|
assert(i == msg.len);
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue