zig/src/link/SpirV/BinaryModule.zig

const std = @import("std");
const assert = std.debug.assert;
const Allocator = std.mem.Allocator;
const log = std.log.scoped(.spirv_parse);

const spec = @import("../../codegen/spirv/spec.zig");
const Opcode = spec.Opcode;
const Word = spec.Word;
const InstructionSet = spec.InstructionSet;
const ResultId = spec.Id;

const BinaryModule = @This();

pub const header_words = 5;

/// The module SPIR-V version.
version: spec.Version,

/// The generator magic number.
generator_magic: u32,

/// The result-id bound of this SPIR-V module.
id_bound: u32,

/// The instructions of this module. This does not contain the header.
instructions: []const Word,

/// Maps OpExtInstImport result-ids to their InstructionSet.
ext_inst_map: std.AutoHashMapUnmanaged(ResultId, InstructionSet),

/// This map contains the width of arithmetic types (OpTypeInt and
/// OpTypeFloat). We need this information to correctly parse the operands
/// of Op(Spec)Constant and OpSwitch.
arith_type_width: std.AutoHashMapUnmanaged(ResultId, u16),

/// The starting offsets of some sections
sections: struct {
    functions: usize,
},

pub fn deinit(self: *BinaryModule, a: Allocator) void {
    self.ext_inst_map.deinit(a);
    self.arith_type_width.deinit(a);
    self.* = undefined;
}

pub fn iterateInstructions(self: BinaryModule) Instruction.Iterator {
    return Instruction.Iterator.init(self.instructions, 0);
}

pub fn iterateInstructionsFrom(self: BinaryModule, offset: usize) Instruction.Iterator {
    return Instruction.Iterator.init(self.instructions, offset);
}

pub fn instructionAt(self: BinaryModule, offset: usize) Instruction {
    var it = self.iterateInstructionsFrom(offset);
    return it.next().?;
}

pub fn finalize(self: BinaryModule, a: Allocator) ![]Word {
    const result = try a.alloc(Word, 5 + self.instructions.len);
    errdefer a.free(result);

    result[0] = spec.magic_number;
    result[1] = @bitCast(self.version);
    result[2] = spec.zig_generator_id;
    result[3] = self.id_bound;
    result[4] = 0; // Schema

    @memcpy(result[5..], self.instructions);
    return result;
}

/// Errors that can be raised when the module is not correct.
/// Note that the parser doesn't validate SPIR-V modules by a
/// long shot. It only yields errors that critically prevent
/// further analysis of the module.
pub const ParseError = error{
    /// Raised when the module doesn't start with the SPIR-V magic.
    /// This usually means that the module isn't actually SPIR-V.
    InvalidMagic,
    /// Raised when the module has an invalid "physical" format:
    /// For example when the header is incomplete, or an instruction
    /// has an illegal format.
    InvalidPhysicalFormat,
    /// OpExtInstImport was used with an unknown extension string.
    InvalidExtInstImport,
    /// The module had an instruction with an invalid (unknown) opcode.
    InvalidOpcode,
    /// An instruction's operands did not conform to the SPIR-V specification
    /// for that instruction.
    InvalidOperands,
    /// A result-id was declared more than once.
    DuplicateId,
    /// Some ID did not resolve.
    InvalidId,
    /// This opcode or instruction is not supported yet.
    UnsupportedOperation,
    /// Parser ran out of memory.
    OutOfMemory,
};

pub const Instruction = struct {
    pub const Iterator = struct {
        words: []const Word,
        index: usize = 0,
        offset: usize = 0,

        pub fn init(words: []const Word, start_offset: usize) Iterator {
            return .{ .words = words, .offset = start_offset };
        }

        pub fn next(self: *Iterator) ?Instruction {
            if (self.offset >= self.words.len) return null;

            const instruction_len = self.words[self.offset] >> 16;
            defer self.offset += instruction_len;
            defer self.index += 1;
            assert(instruction_len != 0);
            assert(self.offset < self.words.len);

            return Instruction{
                .opcode = @enumFromInt(self.words[self.offset] & 0xFFFF),
                .index = self.index,
                .offset = self.offset,
                .operands = self.words[self.offset..][1..instruction_len],
            };
        }
    };

    /// The opcode for this instruction.
    opcode: Opcode,
    /// The instruction's index.
    index: usize,
    /// The instruction's word offset in the module.
    offset: usize,
    /// The raw (unparsed) operands for this instruction.
    operands: []const Word,
};

/// This parser contains information (acceleration tables)
/// that can be persisted across different modules. This is
/// used to initialize the module, and is also used when
/// further analyzing it.
pub const Parser = struct {
    /// The allocator used to allocate this parser's structures,
    /// and also the structures of any parsed module.
    a: Allocator,

    /// Maps (instruction set, opcode) => instruction index (for instruction set)
    opcode_table: std.AutoHashMapUnmanaged(u32, u16) = .empty,

    pub fn init(a: Allocator) !Parser {
        var self = Parser{
            .a = a,
        };
        errdefer self.deinit();

        inline for (std.meta.tags(InstructionSet)) |set| {
            const instructions = set.instructions();
            try self.opcode_table.ensureUnusedCapacity(a, @intCast(instructions.len));
            for (instructions, 0..) |inst, i| {
                // Note: Some instructions may alias another. In this case we don't really care
                // which one is first: they all (should) have the same operands anyway. Just pick
                // the first, which is usually the core, KHR or EXT variant.
                const entry = self.opcode_table.getOrPutAssumeCapacity(mapSetAndOpcode(set, @intCast(inst.opcode)));
                if (!entry.found_existing) {
                    entry.value_ptr.* = @intCast(i);
                }
            }
        }

        return self;
    }

    pub fn deinit(self: *Parser) void {
        self.opcode_table.deinit(self.a);
    }

    fn mapSetAndOpcode(set: InstructionSet, opcode: u16) u32 {
        return (@as(u32, @intFromEnum(set)) << 16) | opcode;
    }

    pub fn getInstSpec(self: Parser, opcode: Opcode) ?spec.Instruction {
        const index = self.opcode_table.get(mapSetAndOpcode(.core, @intFromEnum(opcode))) orelse return null;
        return InstructionSet.core.instructions()[index];
    }

    pub fn parse(self: *Parser, module: []const u32) ParseError!BinaryModule {
        if (module[0] != spec.magic_number) {
            return error.InvalidMagic;
        } else if (module.len < header_words) {
            log.err("module only has {}/{} header words", .{ module.len, header_words });
            return error.InvalidPhysicalFormat;
        }

        var binary = BinaryModule{
            .version = @bitCast(module[1]),
            .generator_magic = module[2],
            .id_bound = module[3],
            .instructions = module[header_words..],
            .ext_inst_map = .{},
            .arith_type_width = .{},
            .sections = undefined,
        };

        var maybe_function_section: ?usize = null;

        // First pass through the module to verify basic structure and
        // to gather some initial stuff for more detailed analysis.
        // We want to check some stuff that Instruction.Iterator is no good for,
        // so just iterate manually.
        var offset: usize = 0;
        while (offset < binary.instructions.len) {
            const len = binary.instructions[offset] >> 16;
            if (len == 0 or len + offset > binary.instructions.len) {
                log.err("invalid instruction format: len={}, end={}, module len={}", .{ len, len + offset, binary.instructions.len });
                return error.InvalidPhysicalFormat;
            }
            defer offset += len;

            // We can't really efficiently use non-exhaustive enums here, because we would
            // need to manually write out all valid cases. Since we have this map anyway, just
            // use that.
            const opcode: Opcode = @enumFromInt(@as(u16, @truncate(binary.instructions[offset])));
            const inst_spec = self.getInstSpec(opcode) orelse {
                log.err("invalid opcode for core set: {}", .{@intFromEnum(opcode)});
                return error.InvalidOpcode;
            };

            const operands = binary.instructions[offset..][1..len];
            switch (opcode) {
                .OpExtInstImport => {
                    const set_name = std.mem.sliceTo(std.mem.sliceAsBytes(operands[1..]), 0);
                    const set = std.meta.stringToEnum(InstructionSet, set_name) orelse {
                        log.err("invalid instruction set '{s}'", .{set_name});
                        return error.InvalidExtInstImport;
                    };
                    if (set == .core) return error.InvalidExtInstImport;
                    try binary.ext_inst_map.put(self.a, @enumFromInt(operands[0]), set);
                },
                .OpTypeInt, .OpTypeFloat => {
                    const entry = try binary.arith_type_width.getOrPut(self.a, @enumFromInt(operands[0]));
                    if (entry.found_existing) return error.DuplicateId;
                    entry.value_ptr.* = std.math.cast(u16, operands[1]) orelse return error.InvalidOperands;
                },
                .OpFunction => if (maybe_function_section == null) {
                    maybe_function_section = offset;
                },
                else => {},
            }

            // OpSwitch takes a value as argument, not an OpType... hence we need to populate arith_type_width
            // with ALL operations that return an int or float.
            const spec_operands = inst_spec.operands;
            if (spec_operands.len >= 2 and
                spec_operands[0].kind == .id_result_type and
                spec_operands[1].kind == .id_result)
            {
                if (operands.len < 2) return error.InvalidOperands;
                if (binary.arith_type_width.get(@enumFromInt(operands[0]))) |width| {
                    const entry = try binary.arith_type_width.getOrPut(self.a, @enumFromInt(operands[1]));
                    if (entry.found_existing) return error.DuplicateId;
                    entry.value_ptr.* = width;
                }
            }
        }

        binary.sections = .{
            .functions = maybe_function_section orelse binary.instructions.len,
        };

        return binary;
    }

    /// Parse offsets in the instruction that contain result-ids.
    /// Returned offsets are relative to inst.operands.
    /// Returns in an arraylist to armortize allocations.
    pub fn parseInstructionResultIds(
        self: *Parser,
        binary: BinaryModule,
        inst: Instruction,
        offsets: *std.ArrayList(u16),
    ) !void {
        const index = self.opcode_table.get(mapSetAndOpcode(.core, @intFromEnum(inst.opcode))).?;
        const operands = InstructionSet.core.instructions()[index].operands;

        var offset: usize = 0;
        switch (inst.opcode) {
            .OpSpecConstantOp => {
                assert(operands[0].kind == .id_result_type);
                assert(operands[1].kind == .id_result);
                offset = try self.parseOperandsResultIds(binary, inst, operands[0..2], offset, offsets);

                if (offset >= inst.operands.len) return error.InvalidPhysicalFormat;
                const spec_opcode = std.math.cast(u16, inst.operands[offset]) orelse return error.InvalidPhysicalFormat;
                const spec_index = self.opcode_table.get(mapSetAndOpcode(.core, spec_opcode)) orelse
                    return error.InvalidPhysicalFormat;
                const spec_operands = InstructionSet.core.instructions()[spec_index].operands;
                assert(spec_operands[0].kind == .id_result_type);
                assert(spec_operands[1].kind == .id_result);
                offset = try self.parseOperandsResultIds(binary, inst, spec_operands[2..], offset + 1, offsets);
            },
            .OpExtInst => {
                assert(operands[0].kind == .id_result_type);
                assert(operands[1].kind == .id_result);
                offset = try self.parseOperandsResultIds(binary, inst, operands[0..2], offset, offsets);

                if (offset + 1 >= inst.operands.len) return error.InvalidPhysicalFormat;
                const set_id: ResultId = @enumFromInt(inst.operands[offset]);
                try offsets.append(@intCast(offset));
                const set = binary.ext_inst_map.get(set_id) orelse {
                    log.err("invalid instruction set {}", .{@intFromEnum(set_id)});
                    return error.InvalidId;
                };
                const ext_opcode = std.math.cast(u16, inst.operands[offset + 1]) orelse return error.InvalidPhysicalFormat;
                const ext_index = self.opcode_table.get(mapSetAndOpcode(set, ext_opcode)) orelse
                    return error.InvalidPhysicalFormat;
                const ext_operands = set.instructions()[ext_index].operands;
                offset = try self.parseOperandsResultIds(binary, inst, ext_operands, offset + 2, offsets);
            },
            else => {
                offset = try self.parseOperandsResultIds(binary, inst, operands, offset, offsets);
            },
        }

        if (offset != inst.operands.len) return error.InvalidPhysicalFormat;
    }

    fn parseOperandsResultIds(
        self: *Parser,
        binary: BinaryModule,
        inst: Instruction,
        operands: []const spec.Operand,
        start_offset: usize,
        offsets: *std.ArrayList(u16),
    ) !usize {
        var offset = start_offset;
        for (operands) |operand| {
            offset = try self.parseOperandResultIds(binary, inst, operand, offset, offsets);
        }
        return offset;
    }

    fn parseOperandResultIds(
        self: *Parser,
        binary: BinaryModule,
        inst: Instruction,
        operand: spec.Operand,
        start_offset: usize,
        offsets: *std.ArrayList(u16),
    ) !usize {
        var offset = start_offset;
        switch (operand.quantifier) {
            .variadic => while (offset < inst.operands.len) {
                offset = try self.parseOperandKindResultIds(binary, inst, operand.kind, offset, offsets);
            },
            .optional => if (offset < inst.operands.len) {
                offset = try self.parseOperandKindResultIds(binary, inst, operand.kind, offset, offsets);
            },
            .required => {
                offset = try self.parseOperandKindResultIds(binary, inst, operand.kind, offset, offsets);
            },
        }
        return offset;
    }

    fn parseOperandKindResultIds(
        self: *Parser,
        binary: BinaryModule,
        inst: Instruction,
        kind: spec.OperandKind,
        start_offset: usize,
        offsets: *std.ArrayList(u16),
    ) !usize {
        var offset = start_offset;
        if (offset >= inst.operands.len) return error.InvalidPhysicalFormat;

        switch (kind.category()) {
            .bit_enum => {
                const mask = inst.operands[offset];
                offset += 1;
                for (kind.enumerants()) |enumerant| {
                    if ((mask & enumerant.value) != 0) {
                        for (enumerant.parameters) |param_kind| {
                            offset = try self.parseOperandKindResultIds(binary, inst, param_kind, offset, offsets);
                        }
                    }
                }
            },
            .value_enum => {
                const value = inst.operands[offset];
                offset += 1;
                for (kind.enumerants()) |enumerant| {
                    if (value == enumerant.value) {
                        for (enumerant.parameters) |param_kind| {
                            offset = try self.parseOperandKindResultIds(binary, inst, param_kind, offset, offsets);
                        }
                        break;
                    }
                }
            },
            .id => {
                try offsets.append(@intCast(offset));
                offset += 1;
            },
            else => switch (kind) {
                .literal_integer, .literal_float => offset += 1,
                .literal_string => while (true) {
                    if (offset >= inst.operands.len) return error.InvalidPhysicalFormat;
                    const word = inst.operands[offset];
                    offset += 1;

                    if (word & 0xFF000000 == 0 or
                        word & 0x00FF0000 == 0 or
                        word & 0x0000FF00 == 0 or
                        word & 0x000000FF == 0)
                    {
                        break;
                    }
                },
                .literal_context_dependent_number => {
                    assert(inst.opcode == .OpConstant or inst.opcode == .OpSpecConstantOp);
                    const bit_width = binary.arith_type_width.get(@enumFromInt(inst.operands[0])) orelse {
                        log.err("invalid LiteralContextDependentNumber type {}", .{inst.operands[0]});
                        return error.InvalidId;
                    };
                    offset += switch (bit_width) {
                        1...32 => 1,
                        33...64 => 2,
                        else => unreachable,
                    };
                },
                .literal_ext_inst_integer => unreachable,
                .literal_spec_constant_op_integer => unreachable,
                .pair_literal_integer_id_ref => { // Switch case
                    assert(inst.opcode == .OpSwitch);
                    const bit_width = binary.arith_type_width.get(@enumFromInt(inst.operands[0])) orelse {
                        log.err("invalid OpSwitch type {}", .{inst.operands[0]});
                        return error.InvalidId;
                    };
                    offset += switch (bit_width) {
                        1...32 => 1,
                        33...64 => 2,
                        else => unreachable,
                    };
                    try offsets.append(@intCast(offset));
                    offset += 1;
                },
                .pair_id_ref_literal_integer => {
                    try offsets.append(@intCast(offset));
                    offset += 2;
                },
                .pair_id_ref_id_ref => {
                    try offsets.append(@intCast(offset));
                    try offsets.append(@intCast(offset + 1));
                    offset += 2;
                },
                else => unreachable,
            },
        }
        return offset;
    }
};