//! Implements CRC-32C (Castagnoli) using the SSE4.2 Intel CRC32 instruction.
//!
//! A couple useful links for understanding the approach taken here:
//! - https://github.com/madler/brotli/blob/1d428d3a9baade233ebc3ac108293256bcb813d1/crc32c.c
//! - https://github.com/madler/zlib/blob/5a82f71ed1dfc0bec044d9702463dbdf84ea3b71/crc32.c
//! - http://www.ross.net/crc/download/crc_v3.txt

// Reflected CRC-32C polynomial in binary form.
const POLY = 0x82f63b78;

const LONG = 8192;
const SHORT = 256;
const long_lookup_table = genTable(LONG);
const short_lookup_table = genTable(SHORT);

/// Generates the lookup table for efficiently combining CRCs over a block of a given length `length`.
/// This works by building an operator that advances the CRC state as if `length` zero-bytes were appended.
/// We pre-compute 4 tables of 256 entries each (one per byte offset).
///
///
/// The idea behind this table is quite interesting. The CRC state is equivalent to the
/// remainder of dividing the message polynomial (over GF(2)) by the CRC polynomial.
///
/// Advancing the CRC register by `k` zero bits is equivalent to multiplying the current
/// CRC state by `x^k` modulo the CRC polynomial. This operation can be represented
/// as a linear transformation in GF(2), i.e, a matrix.
///
/// We build up this matrix via repeated squaring:
/// - odd represents the operator for 1 zero bit (i.e, multiplication by `x^1 mod POLY`)
/// - even represents the operator for 2 zero bits (`x^2 mod POLY`)
/// - squaring again gives `x^4 mod POLY`, and so on until we get to the right size.
///
/// By squaring the shifting `len`, we build the operator for `x^l mod POLY`.
fn genTable(length: usize) [4][256]u32 {
    @setEvalBranchQuota(250000);

    var even: [32]u32 = undefined;
    zeroes: {
        var odd: [32]u32 = undefined;

        // Initialize our `odd` array with the operator for a single zero bit:
        // - odd[0] is the polynomial itself (acts on the MSB).
        // - odd[1..32] represent shifting a single bit through 31 positions.
        odd[0] = POLY;
        var row: u32 = 1;
        for (1..32) |n| {
            odd[n] = row;
            row <<= 1;
        }

        // even = odd squared: even represents `x^2 mod POLY`.
        square(&even, &odd);
        // odd = even squared: odd now represents `x^4 mod POLY`.
        square(&odd, &even);

        // Continue squaring to double the number of zeroes encoded each time:
        //
        // At each point in the process:
        // - square(even, odd): even gets the operator for twice the current length.
        // - square(odd, even): odd gets the operator for 4 times the original length.
        var len = length;
        while (true) {
            square(&even, &odd);
            len >>= 1;
            if (len == 0) break :zeroes;
            square(&odd, &even);
            len >>= 1;
            if (len == 0) break;
        }

        @memcpy(&even, &odd);
    }

    var zeroes: [4][256]u32 = undefined;
    for (0..256) |n| {
        zeroes[0][n] = times(&even, n);
        zeroes[1][n] = times(&even, n << 8);
        zeroes[2][n] = times(&even, n << 16);
        zeroes[3][n] = times(&even, n << 24);
    }
    return zeroes;
}

/// Computes `mat * vec` over `GF(2)`, where `mat` is a 32x32 binary matrix and `vec`
/// is a 32-bit vector. This somewhat "simulates" how bits propagate through the CRC register
/// during shifting.
///
/// - In GF(2) (aka a field where the only values are 0 and 1, aka binary), multiplication is
/// an `AND`, and addition is `XOR`.
/// - This dot product determines how each bit in the input vector "contributes" to
/// the final CRC state, by XORing (adding) rows of the matrix where `vec` has 1s.
fn times(mat: *const [32]u32, vec: u32) u32 {
    var sum: u32 = 0;
    var v = vec;
    var i: u32 = 0;
    while (v != 0) {
        if (v & 1 != 0) sum ^= mat[i];
        v >>= 1;
        i += 1;
    }
    return sum;
}

/// Computes the square of a matrix in GF(2), i.e `dst = dst x src`.
///
/// This produces the operator for doubling the number of zeroes:
/// if `src` represents advancing the CRC by `k` zeroes, then `dest` will
/// represent advancing by 2k zeroes.
///
/// Since polynomial multiplication mod POLY is linear, `mat(mat(x)) = mat^2(x)`
/// gives the effect of two sequential applications of the operator.
fn square(dst: *[32]u32, src: *const [32]u32) void {
    for (dst, src) |*d, s| {
        d.* = times(src, s);
    }
}

fn shift(table: *const [4][256]u32, crc: u32) u32 {
    return table[0][crc & 0xFF] ^ table[1][(crc >> 8) & 0xFF] ^ table[2][(crc >> 16) & 0xFF] ^ table[3][crc >> 24];
}

fn crc32(crc: u32, input: []const u8) u32 {
    var crc0: u64 = ~crc;

    // Compute the CRC for up to seven leading bytes to bring the
    // `next` pointer to an eight-byte boundary.
    var next = input;
    while (next.len > 0 and @intFromPtr(next.ptr) & 7 != 0) {
        asm volatile ("crc32b %[out], %[in]"
            : [in] "+r" (crc0),
            : [out] "rm" (next[0]),
        );
        next = next[1..];
    }

    // Compute the CRC on sets of LONG * 3 bytes, executing three independent
    // CRC instructions, each on LONG bytes. This is an optimization for
    // targets where the CRC instruction has a throughput of one CRC per
    // cycle, but a latency of three cycles.
    while (next.len >= LONG * 3) {
        var crc1: u64 = 0;
        var crc2: u64 = 0;

        const start = next.len;
        while (true) {
            // Safe @alignCast(), since we've aligned the pointer to 8 bytes before this loop.
            const long: [*]const u64 = @ptrCast(@alignCast(next));
            asm volatile (
                \\crc32q %[out0], %[in0]
                \\crc32q %[out1], %[in1]
                \\crc32q %[out2], %[in2]
                : [in0] "+r" (crc0),
                  [in1] "+r" (crc1),
                  [in2] "+r" (crc2),
                : [out0] "rm" (long[0 * LONG / 8]),
                  [out1] "rm" (long[1 * LONG / 8]),
                  [out2] "rm" (long[2 * LONG / 8]),
            );
            next = next[8..];
            if (next.len <= start - LONG) break;
        }

        crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc1;
        crc0 = shift(&long_lookup_table, @truncate(crc0)) ^ crc2;
        next = next[LONG * 2 ..];
    }

    // Same thing as above, but for smaller chunks of SHORT bytes.
    while (next.len >= SHORT * 3) {
        var crc1: u64 = 0;
        var crc2: u64 = 0;

        const start = next.len;
        while (true) {
            const long: [*]const u64 = @ptrCast(@alignCast(next));
            asm volatile (
                \\crc32q %[out0], %[in0]
                \\crc32q %[out1], %[in1]
                \\crc32q %[out2], %[in2]
                : [in0] "+r" (crc0),
                  [in1] "+r" (crc1),
                  [in2] "+r" (crc2),
                : [out0] "rm" (long[0 * SHORT / 8]),
                  [out1] "rm" (long[1 * SHORT / 8]),
                  [out2] "rm" (long[2 * SHORT / 8]),
            );
            next = next[8..];
            if (next.len <= start - SHORT) break;
        }

        crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc1;
        crc0 = shift(&short_lookup_table, @truncate(crc0)) ^ crc2;
        next = next[SHORT * 2 ..];
    }

    // Compute via 8-byte chunks, until we're left with less than 8 bytes.
    while (next.len >= 8) {
        const long: [*]const u64 = @ptrCast(@alignCast(next));
        asm volatile ("crc32q %[out], %[in]"
            : [in] "+r" (crc0),
            : [out] "rm" (long[0]),
        );
        next = next[8..];
    }

    // Finish the last bytes with just single instructions.
    while (next.len > 0) {
        asm volatile ("crc32b %[out], %[in]"
            : [in] "+r" (crc0),
            : [out] "rm" (next[0]),
        );
        next = next[1..];
    }

    return @truncate(~crc0);
}

// Wrapper around the accelerated implementation to match the one in impl.zig.
pub const Wrapper = struct {
    crc: u32,

    pub fn init() Wrapper {
        return .{ .crc = 0 };
    }

    pub fn update(w: *Wrapper, bytes: []const u8) void {
        w.crc = crc32(w.crc, bytes);
    }

    pub fn final(w: Wrapper) u32 {
        return w.crc;
    }

    pub fn hash(bytes: []const u8) u32 {
        var c = init();
        c.update(bytes);
        return c.final();
    }
};