zig/lib/std/crypto/poly1305.zig
Andrew Kelley e7b18a7ce6 std.crypto: remove inline from most functions
To quote the language reference,

It is generally better to let the compiler decide when to inline a
function, except for these scenarios:

* To change how many stack frames are in the call stack, for debugging
  purposes.
* To force comptime-ness of the arguments to propagate to the return
  value of the function, as in the above example.
* Real world performance measurements demand it. Don't guess!

Note that inline actually restricts what the compiler is allowed to do.
This can harm binary size, compilation speed, and even runtime
performance.

`zig run lib/std/crypto/benchmark.zig -OReleaseFast`
[-before-] vs {+after+}

              md5:        [-990-]        {+998+} MiB/s
             sha1:       [-1144-]       {+1140+} MiB/s
           sha256:       [-2267-]       {+2275+} MiB/s
           sha512:        [-762-]        {+767+} MiB/s
         sha3-256:        [-680-]        {+683+} MiB/s
         sha3-512:        [-362-]        {+363+} MiB/s
        shake-128:        [-835-]        {+839+} MiB/s
        shake-256:        [-680-]        {+681+} MiB/s
   turboshake-128:       [-1567-]       {+1570+} MiB/s
   turboshake-256:       [-1276-]       {+1282+} MiB/s
          blake2s:        [-778-]        {+789+} MiB/s
          blake2b:       [-1071-]       {+1086+} MiB/s
           blake3:       [-1148-]       {+1137+} MiB/s
            ghash:      [-10044-]      {+10033+} MiB/s
          polyval:       [-9726-]      {+10033+} MiB/s
         poly1305:       [-2486-]       {+2703+} MiB/s
         hmac-md5:        [-991-]        {+998+} MiB/s
        hmac-sha1:       [-1134-]       {+1137+} MiB/s
      hmac-sha256:       [-2265-]       {+2288+} MiB/s
      hmac-sha512:        [-765-]        {+764+} MiB/s
      siphash-2-4:       [-4410-]       {+4438+} MiB/s
      siphash-1-3:       [-7144-]       {+7225+} MiB/s
   siphash128-2-4:       [-4397-]       {+4449+} MiB/s
   siphash128-1-3:       [-7281-]       {+7374+} MiB/s
  aegis-128x4 mac:      [-73385-]      {+74523+} MiB/s
  aegis-256x4 mac:      [-30160-]      {+30539+} MiB/s
  aegis-128x2 mac:      [-66662-]      {+67267+} MiB/s
  aegis-256x2 mac:      [-16812-]      {+16806+} MiB/s
   aegis-128l mac:      [-33876-]      {+34055+} MiB/s
    aegis-256 mac:       [-8993-]       {+9087+} MiB/s
         aes-cmac:       2036 MiB/s
           x25519:      [-20670-]      {+16844+} exchanges/s
          ed25519:      [-29763-]      {+29576+} signatures/s
       ecdsa-p256:       [-4762-]       {+4900+} signatures/s
       ecdsa-p384:       [-1465-]       {+1500+} signatures/s
  ecdsa-secp256k1:       [-5643-]       {+5769+} signatures/s
          ed25519:      [-21926-]      {+21721+} verifications/s
          ed25519:      [-51200-]      {+50880+} verifications/s (batch)
 chacha20Poly1305:       [-1189-]       {+1109+} MiB/s
xchacha20Poly1305:       [-1196-]       {+1107+} MiB/s
 xchacha8Poly1305:       [-1466-]       {+1555+} MiB/s
 xsalsa20Poly1305:        [-660-]        {+620+} MiB/s
      aegis-128x4:      [-76389-]      {+78181+} MiB/s
      aegis-128x2:      [-53946-]      {+53495+} MiB/s
       aegis-128l:      [-27219-]      {+25621+} MiB/s
      aegis-256x4:      [-49351-]      {+49542+} MiB/s
      aegis-256x2:      [-32390-]      {+32366+} MiB/s
        aegis-256:       [-8881-]       {+8944+} MiB/s
       aes128-gcm:       [-6095-]       {+6205+} MiB/s
       aes256-gcm:       [-5306-]       {+5427+} MiB/s
       aes128-ocb:       [-8529-]      {+13974+} MiB/s
       aes256-ocb:       [-7241-]       {+9442+} MiB/s
        isapa128a:        [-204-]        {+214+} MiB/s
    aes128-single:  [-133857882-]  {+134170944+} ops/s
    aes256-single:   [-96306962-]   {+96408639+} ops/s
         aes128-8: [-1083210101-] {+1073727253+} ops/s
         aes256-8:  [-762042466-]  {+767091778+} ops/s
           bcrypt:      0.009 s/ops
           scrypt:      [-0.018-]      {+0.017+} s/ops
           argon2:      [-0.037-]      {+0.060+} s/ops
      kyber512d00:     [-206057-]     {+205779+} encaps/s
      kyber768d00:     [-156074-]     {+150711+} encaps/s
     kyber1024d00:     [-116626-]     {+115469+} encaps/s
      kyber512d00:     [-181149-]     {+182046+} decaps/s
      kyber768d00:     [-136965-]     {+135676+} decaps/s
     kyber1024d00:     [-101307-]     {+100643+} decaps/s
      kyber512d00:     [-123624-]     {+123375+} keygen/s
      kyber768d00:      [-69465-]      {+70828+} keygen/s
     kyber1024d00:      [-43117-]      {+43208+} keygen/s
2025-07-13 18:26:13 +02:00

217 lines
7.1 KiB
Zig

const std = @import("../std.zig");
const mem = std.mem;
const mulWide = std.math.mulWide;
/// Poly1305 message authenticator.
///
/// The 32-byte key is split in two: the first half is masked into the
/// multiplier `r`, the second half is kept as `end_pad` and added to the
/// accumulator in `final`. Input is absorbed in 16-byte blocks; bytes that do
/// not yet fill a block are held in `buf` until more input arrives.
///
/// Typical use: `init`, any number of `update` (and optionally `pad`) calls,
/// then `final`. `create` wraps that sequence for a single message.
pub const Poly1305 = struct {
    /// Size in bytes of one input block.
    pub const block_length: usize = 16;
    /// Size in bytes of the tag written by `final` / `create`.
    pub const mac_length = 16;
    /// Size in bytes of the key accepted by `init` / `create`.
    pub const key_length = 32;

    /// constant multiplier (from the secret key)
    r: [2]u64,
    /// accumulated hash, as three little-endian 64-bit limbs
    h: [3]u64 = [_]u64{ 0, 0, 0 },
    /// random number added at the end (from the secret key)
    end_pad: [2]u64,
    /// how many bytes are waiting to be processed in a partial block
    leftover: usize = 0,
    /// partial block buffer
    buf: [block_length]u8 align(16) = undefined,

    /// Builds a fresh state from `key`.
    ///
    /// Bytes 0..16 are read as two little-endian u64 words and masked to form
    /// `r`; bytes 16..32 are read the same way, unmasked, to form `end_pad`.
    pub fn init(key: *const [key_length]u8) Poly1305 {
        return Poly1305{
            .r = [_]u64{
                mem.readInt(u64, key[0..8], .little) & 0x0ffffffc0fffffff,
                mem.readInt(u64, key[8..16], .little) & 0x0ffffffc0ffffffc,
            },
            .end_pad = [_]u64{
                mem.readInt(u64, key[16..24], .little),
                mem.readInt(u64, key[24..32], .little),
            },
        };
    }

    /// Add with carry: returns the low 64 bits of `a + b + c` and the carry out.
    fn add(a: u64, b: u64, c: u1) struct { u64, u1 } {
        const v1 = @addWithOverflow(a, b);
        const v2 = @addWithOverflow(v1[0], c);
        return .{ v2[0], v1[1] | v2[1] };
    }

    /// Subtract with borrow: returns the wrapped `a - b - c` and the borrow out.
    fn sub(a: u64, b: u64, c: u1) struct { u64, u1 } {
        const v1 = @subWithOverflow(a, b);
        const v2 = @subWithOverflow(v1[0], c);
        return .{ v2[0], v1[1] | v2[1] };
    }

    /// Absorbs every complete 16-byte block of `m` into the accumulator.
    ///
    /// Trailing bytes of `m` that do not fill a whole block are ignored; the
    /// callers only pass whole blocks. When `last` is false, 2^128 is added on
    /// top of each block (a 1 in the third limb). When `last` is true that bit
    /// is omitted, because `final` has already written the terminating 0x01
    /// byte into the block itself.
    fn blocks(st: *Poly1305, m: []const u8, comptime last: bool) void {
        const hibit: u64 = if (last) 0 else 1;
        const r0 = st.r[0];
        const r1 = st.r[1];

        var h0 = st.h[0];
        var h1 = st.h[1];
        var h2 = st.h[2];

        var i: usize = 0;

        while (i + block_length <= m.len) : (i += block_length) {
            const in0 = mem.readInt(u64, m[i..][0..8], .little);
            const in1 = mem.readInt(u64, m[i + 8 ..][0..8], .little);

            // Add the input message to H
            var v = @addWithOverflow(h0, in0);
            h0 = v[0];
            v = add(h1, in1, v[1]);
            h1 = v[0];
            h2 +%= v[1] +% hibit;

            // Compute H * R as 128-bit partial products, grouped by limb position.
            const m0 = mulWide(u64, h0, r0);
            const h1r0 = mulWide(u64, h1, r0);
            const h0r1 = mulWide(u64, h0, r1);
            const h2r0 = mulWide(u64, h2, r0);
            const h1r1 = mulWide(u64, h1, r1);
            const m3 = mulWide(u64, h2, r1);
            const m1 = h1r0 +% h0r1;
            const m2 = h2r0 +% h1r1;

            // Propagate carries to get the product as four 64-bit limbs t0..t3.
            const t0 = @as(u64, @truncate(m0));
            v = @addWithOverflow(@as(u64, @truncate(m1)), @as(u64, @truncate(m0 >> 64)));
            const t1 = v[0];
            v = add(@as(u64, @truncate(m2)), @as(u64, @truncate(m1 >> 64)), v[1]);
            const t2 = v[0];
            v = add(@as(u64, @truncate(m3)), @as(u64, @truncate(m2 >> 64)), v[1]);
            const t3 = v[0];

            // Partial reduction: keep the low 130 bits in h0, h1, h2.
            h0 = t0;
            h1 = t1;
            h2 = t2 & 3;

            // Add c*(4+1), where c is everything above bit 130. Modulo
            // 2^130 - 5 that excess is worth 5*c. (cclo, cchi) read as a
            // 128-bit value is 4*c; shifting it right by two gives c.
            const cclo = t2 & ~@as(u64, 3);
            const cchi = t3;
            v = @addWithOverflow(h0, cclo);
            h0 = v[0];
            v = add(h1, cchi, v[1]);
            h1 = v[0];
            h2 +%= v[1];
            const cc = (cclo | (@as(u128, cchi) << 64)) >> 2;
            v = @addWithOverflow(h0, @as(u64, @truncate(cc)));
            h0 = v[0];
            v = add(h1, @as(u64, @truncate(cc >> 64)), v[1]);
            h1 = v[0];
            h2 +%= v[1];
        }
        st.h = [_]u64{ h0, h1, h2 };
    }

    /// Absorbs `m` into the state. May be called any number of times; the tag
    /// depends only on the concatenation of all input, not on how it was split.
    ///
    /// `m` must not overlap the state's internal buffer (it is copied with
    /// `@memcpy`).
    pub fn update(st: *Poly1305, m: []const u8) void {
        var mb = m;

        // handle leftover: top up the buffered partial block first
        if (st.leftover > 0) {
            const want = @min(block_length - st.leftover, mb.len);
            @memcpy(st.buf[st.leftover..][0..want], mb[0..want]);
            mb = mb[want..];
            st.leftover += want;
            if (st.leftover < block_length) {
                // Still not a full block; wait for more input.
                return;
            }
            st.blocks(&st.buf, false);
            st.leftover = 0;
        }

        // process full blocks directly from the caller's slice
        if (mb.len >= block_length) {
            const want = mb.len & ~(block_length - 1);
            st.blocks(mb[0..want], false);
            mb = mb[want..];
        }

        // store leftover
        if (mb.len > 0) {
            @memcpy(st.buf[st.leftover..][0..mb.len], mb);
            st.leftover += mb.len;
        }
    }

    /// Zero-pad to align the next input to the first byte of a block
    pub fn pad(st: *Poly1305) void {
        if (st.leftover == 0) {
            return;
        }
        @memset(st.buf[st.leftover..], 0);
        st.blocks(&st.buf, false);
        st.leftover = 0;
    }

    /// Finishes the computation and writes the 16-byte tag to `out`.
    ///
    /// A buffered partial block is terminated with a 0x01 byte, zero-filled and
    /// absorbed. The accumulator is then fully reduced, `end_pad` is added
    /// modulo 2^128, and the result is stored little-endian. The whole state is
    /// wiped with `secureZero` before returning, so it must be re-created with
    /// `init` before any further use.
    pub fn final(st: *Poly1305, out: *[mac_length]u8) void {
        if (st.leftover > 0) {
            var i = st.leftover;
            st.buf[i] = 1;
            i += 1;
            @memset(st.buf[i..], 0);
            st.blocks(&st.buf, true);
        }

        var h0 = st.h[0];
        var h1 = st.h[1];
        const h2 = st.h[2];

        // H - (2^130 - 5)
        var v = @subWithOverflow(h0, 0xfffffffffffffffb);
        const h_p0 = v[0];
        v = sub(h1, 0xffffffffffffffff, v[1]);
        const h_p1 = v[0];
        v = sub(h2, 0x0000000000000003, v[1]);

        // Final reduction, subtract 2^130-5 from H if H >= 2^130-5.
        // A final borrow means H was smaller: mask is 0 and H is kept.
        // No borrow: mask is all ones and the difference is selected, without branching.
        const mask = @as(u64, v[1]) -% 1;
        h0 ^= mask & (h0 ^ h_p0);
        h1 ^= mask & (h1 ^ h_p1);

        // Add the first half of the key, we intentionally don't use @addWithOverflow() here.
        // The carry out of the low limb is recovered from the top bits instead.
        st.h[0] = h0 +% st.end_pad[0];
        const c = ((h0 & st.end_pad[0]) | ((h0 | st.end_pad[0]) & ~st.h[0])) >> 63;
        st.h[1] = h1 +% st.end_pad[1] +% c;

        mem.writeInt(u64, out[0..8], st.h[0], .little);
        mem.writeInt(u64, out[8..16], st.h[1], .little);

        std.crypto.secureZero(u8, @as([*]u8, @ptrCast(st))[0..@sizeOf(Poly1305)]);
    }

    /// One-shot helper: computes the tag of `msg` under `key` into `out`.
    pub fn create(out: *[mac_length]u8, msg: []const u8, key: *const [key_length]u8) void {
        var st = Poly1305.init(key);
        st.update(msg);
        st.final(out);
    }
};
test "rfc7439 vector1" {
    // Known-answer test: fixed key and message, compared against a fixed tag.
    const key = "\x85\xd6\xbe\x78\x57\x55\x6d\x33\x7f\x44\x52\xfe\x42\xd5\x06\xa8" ++
        "\x01\x03\x80\x8a\xfb\x0d\xb2\xfd\x4a\xbf\xf6\xaf\x41\x49\xf5\x1b";
    const msg = "Cryptographic Forum Research Group";
    const want = "\xa8\x06\x1d\xc1\x30\x51\x36\xc6\xc2\x2b\x8b\xaf\x0c\x01\x27\xa9";

    var tag: [Poly1305.mac_length]u8 = undefined;
    Poly1305.create(&tag, msg, key);

    try std.testing.expectEqualSlices(u8, want, &tag);
}
test "requiring a final reduction" {
    // Per the test title, this input is meant to exercise the conditional
    // subtraction of the modulus performed in `final`.
    const key = [_]u8{ 190, 63, 95, 57, 155, 103, 77, 170, 7, 98, 106, 44, 117, 186, 90, 185, 109, 118, 184, 24, 69, 41, 166, 243, 119, 132, 151, 61, 52, 43, 64, 250 };
    const msg = [_]u8{ 253, 193, 249, 146, 70, 6, 214, 226, 131, 213, 241, 116, 20, 24, 210, 224, 65, 151, 255, 104, 133 };
    const want = [_]u8{ 25, 13, 249, 42, 164, 57, 99, 60, 149, 181, 74, 74, 13, 63, 121, 6 };

    var tag: [Poly1305.mac_length]u8 = undefined;
    Poly1305.create(&tag, &msg, &key);

    try std.testing.expectEqualSlices(u8, &want, &tag);
}