// zig run -O ReleaseFast --zig-lib-dir ../.. benchmark.zig

const std = @import("std");
const builtin = @import("builtin");
const time = std.time;
const Timer = time.Timer;
const hash = std.hash;

const KiB = 1024;
const MiB = 1024 * KiB;
const GiB = 1024 * MiB;

var prng = std.Random.DefaultPrng.init(0);
const random = prng.random();

const Hash = struct {
    ty: type,
    name: []const u8,
    has_iterative_api: bool = true,
    has_crypto_api: bool = false,
    has_anytype_api: ?[]const comptime_int = null,
    init_u8s: ?[]const u8 = null,
    init_u64: ?u64 = null,
};

const hashes = [_]Hash{
    Hash{
        .ty = hash.XxHash3,
        .name = "xxh3",
        .init_u64 = 0,
        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
    },
    Hash{
        .ty = hash.XxHash64,
        .name = "xxhash64",
        .init_u64 = 0,
        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
    },
    Hash{
        .ty = hash.XxHash32,
        .name = "xxhash32",
        .init_u64 = 0,
        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
    },
    Hash{
        .ty = hash.Wyhash,
        .name = "wyhash",
        .init_u64 = 0,
    },
    Hash{
        .ty = hash.Fnv1a_64,
        .name = "fnv1a",
    },
    Hash{
        .ty = hash.Adler32,
        .name = "adler32",
    },
    Hash{
        .ty = hash.crc.Crc32,
        .name = "crc32",
    },
    Hash{
        .ty = hash.CityHash32,
        .name = "cityhash-32",
        .has_iterative_api = false,
    },
    Hash{
        .ty = hash.CityHash64,
        .name = "cityhash-64",
        .has_iterative_api = false,
    },
    Hash{
        .ty = hash.Murmur2_32,
        .name = "murmur2-32",
        .has_iterative_api = false,
    },
    Hash{
        .ty = hash.Murmur2_64,
        .name = "murmur2-64",
        .has_iterative_api = false,
    },
    Hash{
        .ty = hash.Murmur3_32,
        .name = "murmur3-32",
        .has_iterative_api = false,
    },
    Hash{
        .ty = hash.SipHash64(1, 3),
        .name = "siphash64",
        .has_crypto_api = true,
        .init_u8s = &[_]u8{0} ** 16,
    },
    Hash{
        .ty = hash.SipHash128(1, 3),
        .name = "siphash128",
        .has_crypto_api = true,
        .init_u8s = &[_]u8{0} ** 16,
    },
};

const Result = struct {
    hash: u64,
    throughput: u64,
};

const block_size: usize = 8 * 8192;

pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result {
    var blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const block_count = bytes / block_size;

    var h = blk: {
        if (H.init_u8s) |init| {
            break :blk H.ty.init(init[0..H.ty.key_length]);
        }
        if (H.init_u64) |init| {
            break :blk H.ty.init(init);
        }
        break :blk H.ty.init();
    };

    var timer = try Timer.start();
    for (0..block_count) |i| {
        h.update(blocks[i * block_size ..][0..block_size]);
    }
    const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final();
    std.mem.doNotOptimizeAway(final);

    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    const size_float: f64 = @floatFromInt(block_size * block_count);
    const throughput: u64 = @intFromFloat(size_float / elapsed_s);

    return Result{
        .hash = final,
        .throughput = throughput,
    };
}

pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
    var blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    var timer = try Timer.start();

    var sum: u64 = 0;
    for (0..key_count) |i| {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @as(u64, @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length])));
                } else {
                    break :blk H.ty.hash(init, small_key);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hash(init, small_key);
            }
            break :blk H.ty.hash(small_key);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    const size_float: f64 = @floatFromInt(key_count * key_size);
    const throughput: u64 = @intFromFloat(size_float / elapsed_s);

    std.mem.doNotOptimizeAway(sum);

    return Result{
        .hash = sum,
        .throughput = throughput,
    };
}

// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
// if you see strange performance changes consider using `.never_inline` or `.always_inline`
// to ensure the changes are not only due to the optimiser inlining the benchmark differently
pub fn benchmarkHashSmallKeysArrayPtr(
    comptime H: anytype,
    comptime key_size: usize,
    bytes: usize,
    allocator: std.mem.Allocator,
) !Result {
    var blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    var timer = try Timer.start();

    var sum: u64 = 0;
    for (0..key_count) |i| {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final: u64 = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
                } else {
                    break :blk H.ty.hash(init, small_key);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hash(init, small_key);
            }
            break :blk H.ty.hash(small_key);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);

    std.mem.doNotOptimizeAway(sum);

    return Result{
        .hash = sum,
        .throughput = throughput,
    };
}

// the array and array pointer benchmarks for xxhash are very sensitive to in-lining,
// if you see strange performance changes consider using `.never_inline` or `.always_inline`
// to ensure the changes are not only due to the optimiser inlining the benchmark differently
pub fn benchmarkHashSmallKeysArray(
    comptime H: anytype,
    comptime key_size: usize,
    bytes: usize,
    allocator: std.mem.Allocator,
) !Result {
    var blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    var i: usize = 0;
    var timer = try Timer.start();

    var sum: u64 = 0;
    while (i < key_count) : (i += 1) {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final: u64 = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
                } else {
                    break :blk H.ty.hash(init, small_key.*);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hash(init, small_key.*);
            }
            break :blk H.ty.hash(small_key.*);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);

    std.mem.doNotOptimizeAway(sum);

    return Result{
        .hash = sum,
        .throughput = throughput,
    };
}

pub fn benchmarkHashSmallApi(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
    var blocks = try allocator.alloc(u8, bytes);
    defer allocator.free(blocks);
    random.bytes(blocks);

    const key_count = bytes / key_size;

    var timer = try Timer.start();

    var sum: u64 = 0;
    for (0..key_count) |i| {
        const small_key = blocks[i * key_size ..][0..key_size];
        const final: u64 = blk: {
            if (H.init_u8s) |init| {
                if (H.has_crypto_api) {
                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
                } else {
                    break :blk H.ty.hashSmall(init, small_key);
                }
            }
            if (H.init_u64) |init| {
                break :blk H.ty.hashSmall(init, small_key);
            }
            break :blk H.ty.hashSmall(small_key);
        };
        sum +%= final;
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);

    std.mem.doNotOptimizeAway(sum);

    return Result{
        .throughput = throughput,
        .hash = sum,
    };
}

fn usage() void {
    std.debug.print(
        \\throughput_test [options]
        \\
        \\Options:
        \\  --filter    [test-name]
        \\  --seed      [int]
        \\  --count     [int]
        \\  --key-size  [int]
        \\  --iterative-only
        \\  --small-key-only
        \\  --help
        \\
    , .{});
}

fn mode(comptime x: comptime_int) comptime_int {
    return if (builtin.mode == .Debug) x / 64 else x;
}

pub fn main() !void {
    const stdout = std.fs.File.stdout().deprecatedWriter();

    var buffer: [1024]u8 = undefined;
    var fixed = std.heap.FixedBufferAllocator.init(buffer[0..]);
    const args = try std.process.argsAlloc(fixed.allocator());

    var filter: ?[]u8 = "";
    var count: usize = mode(128 * MiB);
    var key_size: ?usize = null;
    var seed: u32 = 0;
    var test_small_key_only = false;
    var test_iterative_only = false;
    var test_arrays = false;

    const default_small_key_size = 32;

    var i: usize = 1;
    while (i < args.len) : (i += 1) {
        if (std.mem.eql(u8, args[i], "--mode")) {
            try stdout.print("{}\n", .{builtin.mode});
            return;
        } else if (std.mem.eql(u8, args[i], "--seed")) {
            i += 1;
            if (i == args.len) {
                usage();
                std.process.exit(1);
            }

            seed = try std.fmt.parseUnsigned(u32, args[i], 10);
            // we seed later
        } else if (std.mem.eql(u8, args[i], "--filter")) {
            i += 1;
            if (i == args.len) {
                usage();
                std.process.exit(1);
            }

            filter = args[i];
        } else if (std.mem.eql(u8, args[i], "--count")) {
            i += 1;
            if (i == args.len) {
                usage();
                std.process.exit(1);
            }

            const c = try std.fmt.parseUnsigned(usize, args[i], 10);
            count = c * MiB;
        } else if (std.mem.eql(u8, args[i], "--key-size")) {
            i += 1;
            if (i == args.len) {
                usage();
                std.process.exit(1);
            }

            key_size = try std.fmt.parseUnsigned(usize, args[i], 10);
            if (key_size.? > block_size) {
                try stdout.print("key_size cannot exceed block size of {}\n", .{block_size});
                std.process.exit(1);
            }
        } else if (std.mem.eql(u8, args[i], "--iterative-only")) {
            test_iterative_only = true;
        } else if (std.mem.eql(u8, args[i], "--small-key-only")) {
            test_small_key_only = true;
        } else if (std.mem.eql(u8, args[i], "--include-array")) {
            test_arrays = true;
        } else if (std.mem.eql(u8, args[i], "--help")) {
            usage();
            return;
        } else {
            usage();
            std.process.exit(1);
        }
    }

    if (test_iterative_only and test_small_key_only) {
        try stdout.print("Cannot use iterative-only and small-key-only together!\n", .{});
        usage();
        std.process.exit(1);
    }

    var gpa: std.heap.GeneralPurposeAllocator(.{}) = .init;
    defer std.testing.expect(gpa.deinit() == .ok) catch @panic("leak");
    const allocator = gpa.allocator();

    inline for (hashes) |H| {
        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) hash: {
            if (!test_iterative_only or H.has_iterative_api) {
                try stdout.print("{s}\n", .{H.name});

                // Always reseed prior to every call so we are hashing the same buffer contents.
                // This allows easier comparison between different implementations.
                if (H.has_iterative_api and !test_small_key_only) {
                    prng.seed(seed);
                    const result = try benchmarkHash(H, count, allocator);
                    try stdout.print("   iterative: {:5} MiB/s [{x:0<16}]\n", .{ result.throughput / (1 * MiB), result.hash });
                }

                if (!test_iterative_only) {
                    if (key_size) |size| {
                        prng.seed(seed);
                        const result_small = try benchmarkHashSmallKeys(H, size, count, allocator);
                        try stdout.print("  small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
                            size,
                            result_small.throughput / (1 * MiB),
                            result_small.throughput / size,
                            result_small.hash,
                        });

                        if (!test_arrays) break :hash;
                        if (H.has_anytype_api) |sizes| {
                            inline for (sizes) |exact_size| {
                                if (size == exact_size) {
                                    prng.seed(seed);
                                    const result_array = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
                                    prng.seed(seed);
                                    const result_ptr = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
                                    try stdout.print("       array: {:5} MiB/s [{x:0<16}]\n", .{
                                        result_array.throughput / (1 * MiB),
                                        result_array.hash,
                                    });
                                    try stdout.print("   array ptr: {:5} MiB/s [{x:0<16}]\n", .{
                                        result_ptr.throughput / (1 * MiB),
                                        result_ptr.hash,
                                    });
                                }
                            }
                        }
                    } else {
                        prng.seed(seed);
                        const result_small = try benchmarkHashSmallKeys(H, default_small_key_size, count, allocator);
                        try stdout.print("  small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
                            default_small_key_size,
                            result_small.throughput / (1 * MiB),
                            result_small.throughput / default_small_key_size,
                            result_small.hash,
                        });

                        if (!test_arrays) break :hash;
                        if (H.has_anytype_api) |sizes| {
                            try stdout.print("       array:\n", .{});
                            inline for (sizes) |exact_size| {
                                prng.seed(seed);
                                const result = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
                                try stdout.print("       {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
                                    exact_size,
                                    result.throughput / (1 * MiB),
                                    result.hash,
                                });
                            }
                            try stdout.print("   array ptr: \n", .{});
                            inline for (sizes) |exact_size| {
                                prng.seed(seed);
                                const result = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
                                try stdout.print("       {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
                                    exact_size,
                                    result.throughput / (1 * MiB),
                                    result.hash,
                                });
                            }
                        }
                    }
                }
            }
        }
    }
}