Compare commits

...

3 commits

Author SHA1 Message Date
David Rubin
3b29a06830
c: support +r output constraint 2025-11-28 04:13:49 -08:00
David Rubin
eef0dd4826
codegen: fix x86-64 backend crc32 inline asm 2025-11-28 04:13:48 -08:00
David Rubin
0d44084243
hash: implement fast crc32c 2025-11-28 04:13:45 -08:00
8 changed files with 316 additions and 42 deletions

View file

@ -1,5 +1,6 @@
//! This file is auto-generated by tools/update_crc_catalog.zig. //! This file is auto-generated by tools/update_crc_catalog.zig.
const builtin = @import("builtin");
const impl = @import("crc/impl.zig"); const impl = @import("crc/impl.zig");
pub const Crc = impl.Crc; pub const Crc = impl.Crc;
@ -13,6 +14,17 @@ test {
_ = @import("crc/test.zig"); _ = @import("crc/test.zig");
} }
/// CRC-32C (Castagnoli), the checksum used by iSCSI.
///
/// The accelerated implementation in crc/crc32c.zig issues `crc32q` with 64-bit
/// register operands, which can only be encoded in 64-bit mode. The `.x86` feature
/// family also covers 32-bit x86 targets, so the CPU feature test alone is not
/// enough: the accelerated path is additionally restricted to `x86_64`. Every other
/// target uses the generic table-driven implementation.
///
/// NOTE: this file is generated; keep the template in tools/update_crc_catalog.zig
/// in sync with this condition.
pub const Crc32Iscsi = if (builtin.cpu.arch == .x86_64 and builtin.cpu.has(.x86, .crc32))
    @import("crc/crc32c.zig").Wrapper
else
    Crc(u32, .{
        .polynomial = 0x1edc6f41,
        .initial = 0xffffffff,
        .reflect_input = true,
        .reflect_output = true,
        .xor_output = 0xffffffff,
    });
pub const Crc3Gsm = Crc(u3, .{ pub const Crc3Gsm = Crc(u3, .{
.polynomial = 0x3, .polynomial = 0x3,
.initial = 0x0, .initial = 0x0,
@ -797,14 +809,6 @@ pub const Crc32Cksum = Crc(u32, .{
.xor_output = 0xffffffff, .xor_output = 0xffffffff,
}); });
pub const Crc32Iscsi = Crc(u32, .{
.polynomial = 0x1edc6f41,
.initial = 0xffffffff,
.reflect_input = true,
.reflect_output = true,
.xor_output = 0xffffffff,
});
pub const Crc32IsoHdlc = Crc(u32, .{ pub const Crc32IsoHdlc = Crc(u32, .{
.polynomial = 0x04c11db7, .polynomial = 0x04c11db7,
.initial = 0xffffffff, .initial = 0xffffffff,

239
lib/std/hash/crc/crc32c.zig Normal file
View file

@ -0,0 +1,239 @@
//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
//!
//! A couple useful links for understanding the approach taken here:
//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
//! - http://www.ross.net/crc/download/crc_v3.txt
// Reflected (bit-reversed) form of the CRC-32C polynomial 0x1edc6f41.
const POLY = 0x82f63b78;
// Number of bytes each of the three lanes consumes per iteration of the large
// interleaved loop in `crc32` (that loop runs while at least `LONG * 3` bytes remain).
const LONG = 8192;
// Number of bytes each of the three lanes consumes per iteration of the small
// interleaved loop in `crc32` (that loop runs while at least `SHORT * 3` bytes remain).
const SHORT = 256;
// Tables built by `genTable` for the two block sizes; `crc32` passes them to `shift`
// when merging the three lanes of the corresponding loop.
const long_lookup_table = genTable(LONG);
const short_lookup_table = genTable(SHORT);
/// Builds the lookup tables used to advance a CRC as if a run of zero bytes had been
/// appended to the message, which is what lets independently computed CRCs of
/// adjacent blocks be combined.
///
/// Background: the CRC state is the remainder of the message polynomial (over GF(2))
/// divided by the CRC polynomial. Appending `k` zero bits multiplies that state by
/// `x^k` modulo the CRC polynomial. This is a linear map, so it can be written as a
/// 32x32 bit matrix.
///
/// The matrix is produced by repeated squaring:
/// - start with the operator for a single zero bit,
/// - square it twice to get the operators for 2 and then 4 zero bits,
/// - keep squaring (8 bits = 1 byte, 2 bytes, 4 bytes, ...) while halving `length`,
///   and stop when `length` reaches zero.
///
/// Because only the number of halvings is tracked, the resulting operator covers the
/// largest power of two that does not exceed `length`. Both call sites in this file
/// pass powers of two.
///
/// Entry `[k][b]` of the result is the operator applied to `b << (8 * k)`, so the
/// operator can be applied to a whole 32-bit state with four lookups.
fn genTable(length: usize) [4][256]u32 {
    @setEvalBranchQuota(250000);

    // Operator for one zero bit: row 0 is the polynomial, and row `n` (for n >= 1)
    // has only bit `n - 1` set.
    var odd: [32]u32 = undefined;
    odd[0] = POLY;
    for (odd[1..], 0..) |*entry, bit| {
        entry.* = @as(u32, 1) << @intCast(bit);
    }

    var even: [32]u32 = undefined;
    // Two zero bits.
    square(&even, &odd);
    // Four zero bits.
    square(&odd, &even);

    // Alternate between the two buffers, doubling the covered length on every
    // squaring. The first squaring in the loop yields the one-byte operator.
    var remaining = length;
    const operator: [32]u32 = done: {
        while (true) {
            square(&even, &odd);
            remaining >>= 1;
            if (remaining == 0) break :done even;

            square(&odd, &even);
            remaining >>= 1;
            if (remaining == 0) break :done odd;
        }
    };

    // Pre-apply the operator to every possible value of each of the four state bytes.
    var table: [4][256]u32 = undefined;
    for (0..256) |n| {
        const byte: u32 = @intCast(n);
        inline for (0..4) |lane| {
            table[lane][n] = times(&operator, byte << (8 * lane));
        }
    }
    return table;
}
/// Multiplies the 32x32 bit matrix `mat` by the 32-bit vector `vec` over GF(2).
///
/// In GF(2) multiplication is `AND` and addition is `XOR`, so the product is simply
/// the XOR of every row `mat[i]` for which bit `i` of `vec` is set. Rows above the
/// highest set bit of `vec` are never read.
fn times(mat: *const [32]u32, vec: u32) u32 {
    var acc: u32 = 0;
    var remaining = vec;
    for (mat) |row| {
        // Nothing but zero bits left: no further row can contribute.
        if (remaining == 0) break;
        if (remaining & 1 == 1) acc ^= row;
        remaining >>= 1;
    }
    return acc;
}
/// Squares a 32x32 bit matrix over GF(2): `dst = src x src`.
///
/// Each row of `dst` is `src` applied to the matching row of `src`. If `src` is the
/// operator that advances the CRC by `k` zero bits, `dst` becomes the operator that
/// advances it by `2k` zero bits, because applying a linear operator twice is the
/// same as applying its square once.
fn square(dst: *[32]u32, src: *const [32]u32) void {
    for (0..32) |index| {
        dst[index] = times(src, src[index]);
    }
}
/// Applies a zero-advance operator to `crc` using a table built by `genTable`.
///
/// The state is split into its four bytes; byte `k` selects an entry from
/// `table[k]`, and the four entries are XORed together.
fn shift(table: *const [4][256]u32, crc: u32) u32 {
    var combined: u32 = 0;
    inline for (0..4) |lane| {
        const byte: u8 = @truncate(crc >> (8 * lane));
        combined ^= table[lane][byte];
    }
    return combined;
}
/// Folds `input` into a running CRC-32C value using the `crc32b` / `crc32q`
/// instructions and returns the updated value.
///
/// `crc` is inverted on entry and the accumulator is inverted again before it is
/// returned, so the value returned for one chunk can be passed back in as `crc` for
/// the next chunk (`Wrapper.init` starts from 0).
///
/// Operand naming in the asm statements: the operand called `[in]` / `[inN]` is bound
/// with `+r`, i.e. it is the read-write CRC accumulator, while `[out]` / `[outN]` is
/// the input data being consumed.
fn crc32(crc: u32, input: []const u8) u32 {
// 64-bit accumulator; it is the `+r` operand of both the byte-wide and the
// quadword-wide statements below.
var crc0: u64 = ~crc;
// Compute the CRC for up to seven leading bytes to bring the
// `next` pointer to an eight-byte boundary.
var next = input;
while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
asm volatile ("crc32b %[out], %[in]"
: [in] "+r" (crc0),
: [out] "rm" (next[0]),
);
next = next[1..];
}
// Compute the CRC on sets of LONG * 3 bytes, executing three independent
// CRC instructions, each on LONG bytes. This is an optimization for
// targets where the CRC instruction has a throughput of one CRC per
// cycle, but a latency of three cycles.
while (next.len >= LONG * 3) {
// Lanes 1 and 2 start from zero; they are merged into `crc0` after the inner loop.
var crc1: u64 = 0;
var crc2: u64 = 0;
// Remaining length at the start of this set, used to detect when the first
// lane has consumed LONG bytes.
const start = next.len;
while (true) {
// Safe @alignCast(), since we've aligned the pointer to 8 bytes before this loop.
const long: [*]const u64 = @ptrCast(@alignCast(next));
// One quadword from each lane; the lanes are LONG bytes apart.
asm volatile (
\\crc32q %[out0], %[in0]
\\crc32q %[out1], %[in1]
\\crc32q %[out2], %[in2]
: [in0] "+r" (crc0),
[in1] "+r" (crc1),
[in2] "+r" (crc2),
: [out0] "rm" (long[0 * LONG / 8]),
[out1] "rm" (long[1 * LONG / 8]),
[out2] "rm" (long[2 * LONG / 8]),
);
next = next[8..];
if (next.len <= start - LONG) break;
}
// Merge the lanes: pass `crc0` through the LONG table, fold in lane 1, then do
// the same again for lane 2.
crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
// The inner loop only advanced `next` over lane 0; skip the bytes of lanes 1 and 2.
next = next[LONG * 2 ..];
}
// Same thing as above, but for smaller chunks of SHORT bytes.
while (next.len >= SHORT * 3) {
var crc1: u64 = 0;
var crc2: u64 = 0;
const start = next.len;
while (true) {
// `next` is either empty or 8-byte aligned after the first loop, and every
// advance since then was a multiple of 8 bytes.
const long: [*]const u64 = @ptrCast(@alignCast(next));
asm volatile (
\\crc32q %[out0], %[in0]
\\crc32q %[out1], %[in1]
\\crc32q %[out2], %[in2]
: [in0] "+r" (crc0),
[in1] "+r" (crc1),
[in2] "+r" (crc2),
: [out0] "rm" (long[0 * SHORT / 8]),
[out1] "rm" (long[1 * SHORT / 8]),
[out2] "rm" (long[2 * SHORT / 8]),
);
next = next[8..];
if (next.len <= start - SHORT) break;
}
crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
next = next[SHORT * 2 ..];
}
// Compute via 8-byte chunks, until we're left with less than 8 bytes.
while (next.len >= 8) {
const long: [*]const u64 = @ptrCast(@alignCast(next));
asm volatile ("crc32q %[out], %[in]"
: [in] "+r" (crc0),
: [out] "rm" (long[0]),
);
next = next[8..];
}
// Finish the last bytes with just single instructions.
while (next.len > 0) {
asm volatile ("crc32b %[out], %[in]"
: [in] "+r" (crc0),
: [out] "rm" (next[0]),
);
next = next[1..];
}
// Undo the inversion applied on entry and drop the upper half of the accumulator.
return @truncate(~crc0);
}
/// Streaming front-end for the accelerated CRC-32C routine, exposing the same
/// `init` / `update` / `final` / `hash` surface as the generic type in impl.zig.
pub const Wrapper = struct {
    /// CRC of every byte fed in so far; 0 before any input.
    crc: u32,

    /// Returns a hasher that has not consumed any input yet.
    pub fn init() Wrapper {
        return Wrapper{ .crc = 0 };
    }

    /// Feeds `bytes` into the running checksum.
    pub fn update(self: *Wrapper, bytes: []const u8) void {
        const previous = self.crc;
        self.crc = crc32(previous, bytes);
    }

    /// Returns the checksum of everything passed to `update` so far.
    pub fn final(self: Wrapper) u32 {
        return self.crc;
    }

    /// One-shot helper: the checksum of `bytes` alone.
    pub fn hash(bytes: []const u8) u32 {
        var hasher = Wrapper.init();
        hasher.update(bytes);
        return hasher.crc;
    }
};

View file

@ -23,12 +23,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
const I = if (@bitSizeOf(W) < 8) u8 else W; const I = if (@bitSizeOf(W) < 8) u8 else W;
const lookup_table = blk: { const lookup_table = blk: {
@setEvalBranchQuota(2500); @setEvalBranchQuota(2500);
const poly = reflect(algorithm.polynomial);
const poly = if (algorithm.reflect_input)
@bitReverse(@as(I, algorithm.polynomial)) >> (@bitSizeOf(I) - @bitSizeOf(W))
else
@as(I, algorithm.polynomial) << (@bitSizeOf(I) - @bitSizeOf(W));
var table: [256]I = undefined; var table: [256]I = undefined;
for (&table, 0..) |*e, i| { for (&table, 0..) |*e, i| {
var crc: I = i; var crc: I = i;
@ -52,15 +47,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
crc: I, crc: I,
pub fn init() Self { pub fn init() Self {
const initial = if (algorithm.reflect_input) const initial = reflect(algorithm.initial);
@bitReverse(@as(I, algorithm.initial)) >> (@bitSizeOf(I) - @bitSizeOf(W)) return .{ .crc = initial };
else
@as(I, algorithm.initial) << (@bitSizeOf(I) - @bitSizeOf(W));
return Self{ .crc = initial };
} }
inline fn tableEntry(index: I) I { inline fn tableEntry(index: I) I {
return lookup_table[@as(u8, @intCast(index & 0xFF))]; const short: u8 = @truncate(index);
return lookup_table[short];
} }
pub fn update(self: *Self, bytes: []const u8) void { pub fn update(self: *Self, bytes: []const u8) void {
@ -90,7 +83,7 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
if (!algorithm.reflect_output) { if (!algorithm.reflect_output) {
c >>= @bitSizeOf(I) - @bitSizeOf(W); c >>= @bitSizeOf(I) - @bitSizeOf(W);
} }
return @as(W, @intCast(c ^ algorithm.xor_output)); return @intCast(c ^ algorithm.xor_output);
} }
pub fn hash(bytes: []const u8) W { pub fn hash(bytes: []const u8) W {
@ -98,5 +91,13 @@ pub fn Crc(comptime W: type, comptime algorithm: Algorithm(W)) type {
c.update(bytes); c.update(bytes);
return c.final(); return c.final();
} }
/// Positions a `W`-bit algorithm parameter inside the `I`-bit working register:
/// bit-reversed and shifted down when the algorithm reflects its input,
/// otherwise shifted up so that it is left-aligned.
fn reflect(x: I) I {
    const offset = @bitSizeOf(I) - @bitSizeOf(W);
    return if (algorithm.reflect_input)
        @bitReverse(x) >> offset
    else
        x << offset;
}
}; };
} }

View file

@ -26,6 +26,17 @@ test "crc32 koopman regression" {
try testing.expectEqual(crc32.hash("abc"), 0xba2322ac); try testing.expectEqual(crc32.hash("abc"), 0xba2322ac);
} }
test "CRC-32/ISCSI" {
    const Crc32Iscsi = crc.Crc32Iscsi;

    // Catalog check value.
    try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));

    var c = Crc32Iscsi.init();
    c.update("1234");
    c.update("56789");
    try testing.expectEqual(@as(u32, 0xe3069283), c.final());

    // The nine-byte check string never reaches the interleaved block loops of the
    // accelerated implementation, so cross-check a long buffer against the generic
    // table-driven implementation. The buffer is large enough to pass through the
    // 3 * 8192 and 3 * 256 byte loops, the 8-byte loop and the byte-wise tail, and
    // every start offset within an 8-byte word is tried.
    const Reference = crc.Crc(u32, .{
        .polynomial = 0x1edc6f41,
        .initial = 0xffffffff,
        .reflect_input = true,
        .reflect_output = true,
        .xor_output = 0xffffffff,
    });

    var buffer: [3 * 8192 + 3 * 256 + 32]u8 = undefined;
    for (&buffer, 0..) |*byte, i| byte.* = @truncate(i *% 31 +% 7);

    for (0..8) |offset| {
        const data = buffer[offset..];
        const expected = Reference.hash(data);
        try testing.expectEqual(expected, Crc32Iscsi.hash(data));

        // Splitting the input must not change the result.
        var split = Crc32Iscsi.init();
        split.update(data[0..1000]);
        split.update(data[1000..]);
        try testing.expectEqual(expected, split.final());
    }
}
test "CRC-3/GSM" { test "CRC-3/GSM" {
const Crc3Gsm = crc.Crc3Gsm; const Crc3Gsm = crc.Crc3Gsm;
@ -1104,17 +1115,6 @@ test "CRC-32/CKSUM" {
try testing.expectEqual(@as(u32, 0x765e7680), c.final()); try testing.expectEqual(@as(u32, 0x765e7680), c.final());
} }
test "CRC-32/ISCSI" {
const Crc32Iscsi = crc.Crc32Iscsi;
try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
var c = Crc32Iscsi.init();
c.update("1234");
c.update("56789");
try testing.expectEqual(@as(u32, 0xe3069283), c.final());
}
test "CRC-32/ISO-HDLC" { test "CRC-32/ISO-HDLC" {
const Crc32IsoHdlc = crc.Crc32IsoHdlc; const Crc32IsoHdlc = crc.Crc32IsoHdlc;

View file

@ -5478,14 +5478,16 @@ fn airAsm(f: *Function, inst: Air.Inst.Index) !CValue {
// for the string, we still use the next u32 for the null terminator. // for the string, we still use the next u32 for the null terminator.
extra_i += (constraint.len + name.len + (2 + 3)) / 4; extra_i += (constraint.len + name.len + (2 + 3)) / 4;
if (constraint.len < 2 or constraint[0] != '=' or // +constraint
(constraint[1] == '{' and constraint[constraint.len - 1] != '}')) // =constraint
{ if (constraint.len > 1 and
return f.fail("CBE: constraint not supported: '{s}'", .{constraint}); (constraint[0] == '=' or constraint[0] == '+') and
} constraint[1] != '{') continue;
const is_reg = constraint[1] == '{'; // ={reg}
if (is_reg) { if (std.mem.startsWith(u8, constraint, "={") and
std.mem.endsWith(u8, constraint, "}"))
{
const output_ty = if (output == .none) inst_ty else f.typeOf(output).childType(zcu); const output_ty = if (output == .none) inst_ty else f.typeOf(output).childType(zcu);
try w.writeAll("register "); try w.writeAll("register ");
const output_local = try f.allocLocalValue(.{ const output_local = try f.allocLocalValue(.{
@ -5503,7 +5505,7 @@ fn airAsm(f: *Function, inst: Air.Inst.Index) !CValue {
} }
try w.writeByte(';'); try w.writeByte(';');
try f.object.newline(); try f.object.newline();
} } else return f.fail("CBE: constraint not supported: '{s}'", .{constraint});
} }
for (inputs) |input| { for (inputs) |input| {
const extra_bytes = mem.sliceAsBytes(f.air.extra.items[extra_i..]); const extra_bytes = mem.sliceAsBytes(f.air.extra.items[extra_i..]);

View file

@ -177437,6 +177437,10 @@ fn airAsm(self: *CodeGen, inst: Air.Inst.Index) !void {
fixed_mnem_size: { fixed_mnem_size: {
const fixed_mnem_size: Memory.Size = switch (mnem_tag) { const fixed_mnem_size: Memory.Size = switch (mnem_tag) {
.clflush => .byte, .clflush => .byte,
.crc32 => {
mnem_size.op_has_size.unset(1);
break :fixed_mnem_size;
},
.fldcw, .fnstcw, .fstcw, .fnstsw, .fstsw => .word, .fldcw, .fnstcw, .fstcw, .fnstsw, .fstsw => .word,
.fldenv, .fnstenv, .fstenv => .none, .fldenv, .fnstenv, .fstenv => .none,
.frstor, .fsave, .fnsave, .fxrstor, .fxrstor64, .fxsave, .fxsave64 => .none, .frstor, .fsave, .fnsave, .fxrstor, .fxrstor64, .fxsave, .fxsave64 => .none,

View file

@ -97,7 +97,8 @@ width=32 poly=0xa833982b init=0xffffffff refin=true refout=true xorout=0xff
width=32 poly=0x04c11db7 init=0xffffffff refin=false refout=false xorout=0xffffffff check=0xfc891918 residue=0xc704dd7b name="CRC-32/BZIP2" width=32 poly=0x04c11db7 init=0xffffffff refin=false refout=false xorout=0xffffffff check=0xfc891918 residue=0xc704dd7b name="CRC-32/BZIP2"
width=32 poly=0x8001801b init=0x00000000 refin=true refout=true xorout=0x00000000 check=0x6ec2edc4 residue=0x00000000 name="CRC-32/CD-ROM-EDC" width=32 poly=0x8001801b init=0x00000000 refin=true refout=true xorout=0x00000000 check=0x6ec2edc4 residue=0x00000000 name="CRC-32/CD-ROM-EDC"
width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 residue=0xc704dd7b name="CRC-32/CKSUM" width=32 poly=0x04c11db7 init=0x00000000 refin=false refout=false xorout=0xffffffff check=0x765e7680 residue=0xc704dd7b name="CRC-32/CKSUM"
width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI" # CRC-32C implementation is defined manually, since it has an accelerated variant.
# width=32 poly=0x1edc6f41 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xe3069283 residue=0xb798b438 name="CRC-32/ISCSI"
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xcbf43926 residue=0xdebb20e3 name="CRC-32/ISO-HDLC" width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0xcbf43926 residue=0xdebb20e3 name="CRC-32/ISO-HDLC"
width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0x00000000 check=0x340bc6d9 residue=0x00000000 name="CRC-32/JAMCRC" width=32 poly=0x04c11db7 init=0xffffffff refin=true refout=true xorout=0x00000000 check=0x340bc6d9 residue=0x00000000 name="CRC-32/JAMCRC"
width=32 poly=0x741b8cd7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0x2d3dd0ae residue=0x00000000 name="CRC-32/KOOPMAN" width=32 poly=0x741b8cd7 init=0xffffffff refin=true refout=true xorout=0xffffffff check=0x2d3dd0ae residue=0x00000000 name="CRC-32/KOOPMAN"

View file

@ -36,6 +36,7 @@ pub fn main() anyerror!void {
try code_writer.writeAll( try code_writer.writeAll(
\\//! This file is auto-generated by tools/update_crc_catalog.zig. \\//! This file is auto-generated by tools/update_crc_catalog.zig.
\\ \\
\\const builtin = @import("builtin");
\\const impl = @import("crc/impl.zig"); \\const impl = @import("crc/impl.zig");
\\ \\
\\pub const Crc = impl.Crc; \\pub const Crc = impl.Crc;
@ -49,6 +50,17 @@ pub fn main() anyerror!void {
\\ _ = @import("crc/test.zig"); \\ _ = @import("crc/test.zig");
\\} \\}
\\ \\
\\pub const Crc32Iscsi = switch (builtin.cpu.has(.x86, .crc32)) {
\\ true => @import("crc/crc32c.zig").Wrapper,
\\ else => Crc(u32, .{
\\ .polynomial = 0x1edc6f41,
\\ .initial = 0xffffffff,
\\ .reflect_input = true,
\\ .reflect_output = true,
\\ .xor_output = 0xffffffff,
\\ }),
\\};
\\
); );
var zig_test_file = try crc_target_dir.createFile("test.zig", .{}); var zig_test_file = try crc_target_dir.createFile("test.zig", .{});
@ -80,12 +92,23 @@ pub fn main() anyerror!void {
\\} \\}
\\ \\
\\test "crc32 koopman regression" { \\test "crc32 koopman regression" {
\\ const crc32 = crc.Koopman; \\ const crc32 = crc.Crc32Koopman;
\\ try testing.expectEqual(crc32.hash(""), 0x00000000); \\ try testing.expectEqual(crc32.hash(""), 0x00000000);
\\ try testing.expectEqual(crc32.hash("a"), 0x0da2aa8a); \\ try testing.expectEqual(crc32.hash("a"), 0x0da2aa8a);
\\ try testing.expectEqual(crc32.hash("abc"), 0xba2322ac); \\ try testing.expectEqual(crc32.hash("abc"), 0xba2322ac);
\\} \\}
\\ \\
\\test "CRC-32/ISCSI" {
\\ const Crc32Iscsi = crc.Crc32Iscsi;
\\
\\ try testing.expectEqual(@as(u32, 0xe3069283), Crc32Iscsi.hash("123456789"));
\\
\\ var c = Crc32Iscsi.init();
\\ c.update("1234");
\\ c.update("56789");
\\ try testing.expectEqual(@as(u32, 0xe3069283), c.final());
\\}
\\
); );
var reader: std.Io.Reader = .fixed(catalog_txt); var reader: std.Io.Reader = .fixed(catalog_txt);