zig/lib/compiler/resinator/literals.zig

const std = @import("std");
const code_pages = @import("code_pages.zig");
const SupportedCodePage = code_pages.SupportedCodePage;
const windows1252 = @import("windows1252.zig");
const ErrorDetails = @import("errors.zig").ErrorDetails;
const DiagnosticsContext = @import("errors.zig").DiagnosticsContext;
const Token = @import("lex.zig").Token;

/// Reports whether `str` is acceptable as a number literal in a data value.
/// rc is maximally liberal here: only the first byte is inspected, and it
/// merely has to be a decimal digit, `-`, or `~`. An empty string is rejected.
pub fn isValidNumberDataLiteral(str: []const u8) bool {
    if (str.len == 0) return false;
    const first = str[0];
    const is_digit = first >= '0' and first <= '9';
    return is_digit or first == '-' or first == '~';
}

/// A run of raw source bytes paired with the code page needed to decode them.
pub const SourceBytes = struct {
    /// The raw bytes. When handed to the string parsers in this file this is the
    /// complete literal, i.e. it still includes the surrounding double quotes and
    /// the optional `L`/`l` prefix (those are stripped by `IterativeStringParser.init`).
    slice: []const u8,
    /// The code page used to decode codepoints out of `slice`.
    code_page: SupportedCodePage,
};

/// The two kinds of quoted string literal: `.ascii` for an unprefixed `"..."`
/// literal and `.wide` for an `L"..."` / `l"..."` literal.
pub const StringType = enum { ascii, wide };

/// Incrementally decodes the contents of a quoted string literal, yielding one
/// `ParsedCodepoint` per call to `next`/`nextUnchecked` with the escape and
/// whitespace rules below already applied.
///
/// Valid escapes:
///  "" -> "
///  \a, \A => 0x08 (not 0x07 like in C)
///  \n => 0x0A
///  \r => 0x0D
///  \t, \T => 0x09
///  \\ => \
///  \nnn => byte with numeric value given by nnn interpreted as octal
///          (wraps on overflow, number of digits can be 1-3 for ASCII strings
///          and 1-7 for wide strings)
///  \xhh => byte with numeric value given by hh interpreted as hex
///          (number of digits can be 0-2 for ASCII strings and 0-4 for
///          wide strings)
///  \<\r+> => \
///  \<[\r\n\t ]+> => <nothing>
///
/// Special cases:
///  <\t> => 1-8 spaces, dependent on columns in the source rc file itself
///  <\r> => <nothing>
///  <\n+><\w+?\n?> => <space><\n>
///
/// Special, especially weird case:
///  \"" => "
/// NOTE: This leads to footguns because the preprocessor can start parsing things
///       out-of-sync with the RC compiler, expanding macros within string literals, etc.
///       This parse function handles this case the same as the Windows RC compiler, but
///       \" within a string literal is treated as an error by the lexer, so the relevant
///       branches should never actually be hit during this function.
pub const IterativeStringParser = struct {
    /// The literal's contents with the surrounding double quotes and the
    /// `L`/`l` prefix (if any) removed; see `init`.
    source: []const u8,
    /// Code page used to decode codepoints out of `source`.
    code_page: SupportedCodePage,
    /// The type of the string inferred by the prefix (L"" or "")
    /// This is what matters for things like the maximum digits in an
    /// escape sequence, whether or not invalid escape sequences are skipped, etc.
    declared_string_type: StringType,
    /// A codepoint queued up to be returned by a later call. Used to emit the
    /// `\n` of the `<space><\n>` pair produced for literal newlines.
    pending_codepoint: ?u21 = null,
    /// Number of spaces that still have to be returned by later calls to
    /// finish expanding a tab character.
    num_pending_spaces: u8 = 0,
    /// Byte offset into `source` of the next codepoint to decode.
    index: usize = 0,
    /// Running column position, used to work out how many spaces a tab expands to.
    /// Advances by the byte length of each consumed non-tab codepoint and to the
    /// next tab stop (every 8 columns) for a tab.
    column: usize = 0,
    /// When non-null, warnings and notes are appended to its `diagnostics` list.
    diagnostics: ?DiagnosticsContext = null,
    /// Set once the tab-to-spaces warning has been emitted so that it is only
    /// reported once per parser instance.
    seen_tab: bool = false,

    /// States of the per-call scanning loop in `nextUnchecked`. The state is a
    /// local of that function, so every call starts out in `.normal`.
    const State = enum {
        /// Not inside any escape or special sequence.
        normal,
        /// Saw a `"`; the following codepoint must be a second `"`.
        quote,
        /// Saw a literal `\n`; skipping the whitespace that follows it.
        newline,
        /// Saw a `\`.
        escaped,
        /// Saw a `\` followed by one or more `\r`.
        escaped_cr,
        /// Saw a `\` followed by a newline; skipping the whitespace that follows it.
        escaped_newlines,
        /// Accumulating the digits of a `\nnn` escape.
        escaped_octal,
        /// Accumulating the digits of a `\xhh` escape.
        escaped_hex,
    };

    /// Creates a parser over the literal in `bytes.slice`.
    ///
    /// `bytes.slice` is indexed without bounds checks here, so it must be the full
    /// literal: opening and closing quote, plus the `L`/`l` prefix for wide strings.
    /// Only `options.start_column` and `options.diagnostics` are read;
    /// `options.output_code_page` is not used by the parser itself.
    pub fn init(bytes: SourceBytes, options: StringParseOptions) IterativeStringParser {
        const declared_string_type: StringType = switch (bytes.slice[0]) {
            'L', 'l' => .wide,
            else => .ascii,
        };
        var source = bytes.slice[1 .. bytes.slice.len - 1]; // remove ""
        var column = options.start_column + 1; // for the removed "
        if (declared_string_type == .wide) {
            source = source[1..]; // remove L
            column += 1; // for the removed L
        }
        return .{
            .source = source,
            .code_page = bytes.code_page,
            .declared_string_type = declared_string_type,
            .column = column,
            .diagnostics = options.diagnostics,
        };
    }

    /// One unit of output from the parser.
    pub const ParsedCodepoint = struct {
        /// The decoded value. May be `code_pages.Codepoint.invalid`; the consumers in
        /// this file check for that value explicitly.
        codepoint: u21,
        /// Note: If this is true, `codepoint` will have an effective maximum value
        /// of 0xFFFF, as `codepoint` is calculated using wrapping arithmetic on a u16.
        /// If the value needs to be truncated to a smaller integer (e.g. for ASCII string
        /// literals), then that must be done by the caller.
        from_escaped_integer: bool = false,
        /// Denotes that the codepoint is:
        /// - Escaped (has a \ in front of it), and
        /// - Has a value >= U+10000, meaning it would be encoded as a surrogate
        ///   pair in UTF-16, and
        /// - Is part of a wide string literal
        ///
        /// Normally in wide string literals, invalid escapes are omitted
        /// during parsing (the codepoints are not returned at all during
        /// the `next` call), but this is a special case in which the
        /// escape only applies to the high surrogate pair of the codepoint.
        ///
        /// TODO: Maybe just return the low surrogate codepoint by itself in this case.
        escaped_surrogate_pair: bool = false,
    };

    /// Returns the next parsed codepoint, or `null` once the source is exhausted.
    ///
    /// This is `nextUnchecked` plus extra diagnostics: when `diagnostics` is set and
    /// the codepoint did not come from an escaped integer, a warning is appended for
    /// a handful of specific codepoints (reported with the
    /// `rc_would_miscompile_codepoint_*` error tags). The returned value itself is
    /// never altered. The only possible error is an allocation failure while
    /// appending a diagnostic.
    pub fn next(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
        const result = try self.nextUnchecked();
        if (self.diagnostics != null and result != null and !result.?.from_escaped_integer) {
            switch (result.?.codepoint) {
                0x0900, 0x0A00, 0x0A0D, 0x2000, 0x0D00 => {
                    // U+0D00 is reported with the 'skip' tag; the rest of this
                    // group is reported with the 'whitespace' tag.
                    const err: ErrorDetails.Error = if (result.?.codepoint == 0xD00)
                        .rc_would_miscompile_codepoint_skip
                    else
                        .rc_would_miscompile_codepoint_whitespace;
                    try self.diagnostics.?.diagnostics.append(ErrorDetails{
                        .err = err,
                        .type = .warning,
                        .code_page = self.code_page,
                        .token = self.diagnostics.?.token,
                        .extra = .{ .number = result.?.codepoint },
                    });
                },
                0xFFFE, 0xFFFF => {
                    // Reported as a warning followed by a note for the same token.
                    try self.diagnostics.?.diagnostics.append(ErrorDetails{
                        .err = .rc_would_miscompile_codepoint_bom,
                        .type = .warning,
                        .code_page = self.code_page,
                        .token = self.diagnostics.?.token,
                        .extra = .{ .number = result.?.codepoint },
                    });
                    try self.diagnostics.?.diagnostics.append(ErrorDetails{
                        .err = .rc_would_miscompile_codepoint_bom,
                        .type = .note,
                        .code_page = self.code_page,
                        .token = self.diagnostics.?.token,
                        .print_source_line = false,
                        .extra = .{ .number = result.?.codepoint },
                    });
                },
                else => {},
            }
        }
        return result;
    }

    /// Returns the next parsed codepoint, or `null` once the source is exhausted,
    /// without the codepoint-specific warnings added by `next`. The tab-to-spaces
    /// warning is still appended from here when `diagnostics` is set, which is why
    /// this can fail with an allocation error.
    pub fn nextUnchecked(self: *IterativeStringParser) std.mem.Allocator.Error!?ParsedCodepoint {
        // Output queued by a previous call takes priority over reading more source.
        if (self.num_pending_spaces > 0) {
            // Ensure that we don't get into this predicament so we can ensure that
            // the order of processing any pending stuff doesn't matter
            std.debug.assert(self.pending_codepoint == null);
            self.num_pending_spaces -= 1;
            return .{ .codepoint = ' ' };
        }
        if (self.pending_codepoint) |pending_codepoint| {
            self.pending_codepoint = null;
            return .{ .codepoint = pending_codepoint };
        }
        if (self.index >= self.source.len) return null;

        var state: State = .normal;
        // Accumulated value and digit count of the integer escape being parsed.
        var string_escape_n: u16 = 0;
        var string_escape_i: u8 = 0;
        const max_octal_escape_digits: u8 = switch (self.declared_string_type) {
            .ascii => 3,
            .wide => 7,
        };
        const max_hex_escape_digits: u8 = switch (self.declared_string_type) {
            .ascii => 2,
            .wide => 4,
        };

        // Set to true by a branch that wants the current codepoint to be looked at
        // again (either by the next loop iteration or by the next call). It suppresses
        // both the index advance in the continue expression and the column advance in
        // the `defer` below. It is reset at the top of every iteration.
        var backtrack: bool = undefined;
        while (self.code_page.codepointAt(self.index, self.source)) |codepoint| : ({
            if (!backtrack) self.index += codepoint.byte_len;
        }) {
            backtrack = false;
            const c = codepoint.value;
            // Runs when leaving this iteration, including via `return`, so the column
            // stays in sync with every codepoint that was actually consumed.
            defer {
                if (!backtrack) {
                    if (c == '\t') {
                        self.column += columnsUntilTabStop(self.column, 8);
                    } else {
                        self.column += codepoint.byte_len;
                    }
                }
            }
            // Note: branches that `return` after consuming the current codepoint
            // advance `self.index` themselves, since the loop's continue expression
            // does not run on return.
            switch (state) {
                .normal => switch (c) {
                    '\\' => state = .escaped,
                    '"' => state = .quote,
                    '\r' => {},
                    '\n' => state = .newline,
                    '\t' => {
                        // Only warn about a tab getting converted to spaces once per string
                        if (self.diagnostics != null and !self.seen_tab) {
                            try self.diagnostics.?.diagnostics.append(ErrorDetails{
                                .err = .tab_converted_to_spaces,
                                .type = .warning,
                                .code_page = self.code_page,
                                .token = self.diagnostics.?.token,
                            });
                            try self.diagnostics.?.diagnostics.append(ErrorDetails{
                                .err = .tab_converted_to_spaces,
                                .type = .note,
                                .code_page = self.code_page,
                                .token = self.diagnostics.?.token,
                                .print_source_line = false,
                            });
                            self.seen_tab = true;
                        }
                        // One space is returned right now, the rest are queued up.
                        const cols = columnsUntilTabStop(self.column, 8);
                        self.num_pending_spaces = @intCast(cols - 1);
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = ' ' };
                    },
                    else => {
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = c };
                    },
                },
                .quote => switch (c) {
                    '"' => {
                        // "" => "
                        self.index += codepoint.byte_len;
                        return .{ .codepoint = '"' };
                    },
                    else => unreachable, // this is a bug in the lexer
                },
                .newline => switch (c) {
                    // Whitespace following a literal newline is swallowed.
                    '\r', ' ', '\t', '\n', '\x0b', '\x0c', '\xa0' => {},
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // <space><newline>
                        self.pending_codepoint = '\n';
                        return .{ .codepoint = ' ' };
                    },
                },
                .escaped => switch (c) {
                    '\r' => state = .escaped_cr,
                    '\n' => state = .escaped_newlines,
                    '0'...'7' => {
                        // The first octal digit has already been seen, hence the count of 1.
                        string_escape_n = std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                        string_escape_i = 1;
                        state = .escaped_octal;
                    },
                    'x', 'X' => {
                        // No hex digit has been seen yet, hence the count of 0.
                        string_escape_n = 0;
                        string_escape_i = 0;
                        state = .escaped_hex;
                    },
                    else => {
                        switch (c) {
                            'a', 'A' => {
                                self.index += codepoint.byte_len;
                                // might be a bug in RC, but matches its behavior
                                return .{ .codepoint = '\x08' };
                            },
                            'n' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\n' };
                            },
                            'r' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\r' };
                            },
                            't', 'T' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\t' };
                            },
                            '\\' => {
                                self.index += codepoint.byte_len;
                                return .{ .codepoint = '\\' };
                            },
                            '"' => {
                                // \" is a special case that doesn't get the \ included,
                                // so backtrack and let the " be re-processed in the
                                // .normal state (set below), dropping the \.
                                backtrack = true;
                            },
                            else => switch (self.declared_string_type) {
                                .wide => {
                                    // All invalid escape sequences are skipped in wide strings,
                                    // but there is a special case around \<tab> where the \
                                    // is skipped but the tab character is processed.
                                    // It's actually a bit weirder than that, though, since
                                    // the preprocessor is the one that does the <tab> -> spaces
                                    // conversion, so it goes something like this:
                                    //
                                    // Before preprocessing: L"\<tab>"
                                    // After preprocessing:  L"\     "
                                    //
                                    // So the parser only sees an escaped space character followed
                                    // by some other number of spaces >= 0.
                                    //
                                    // However, our preprocessor keeps tab characters intact, so we emulate
                                    // the above behavior by skipping the \ and then outputting one less
                                    // space than normal for the <tab> character.
                                    if (c == '\t') {
                                        // Only warn about a tab getting converted to spaces once per string
                                        if (self.diagnostics != null and !self.seen_tab) {
                                            try self.diagnostics.?.diagnostics.append(ErrorDetails{
                                                .err = .tab_converted_to_spaces,
                                                .type = .warning,
                                                .code_page = self.code_page,
                                                .token = self.diagnostics.?.token,
                                            });
                                            try self.diagnostics.?.diagnostics.append(ErrorDetails{
                                                .err = .tab_converted_to_spaces,
                                                .type = .note,
                                                .code_page = self.code_page,
                                                .token = self.diagnostics.?.token,
                                                .print_source_line = false,
                                            });
                                            self.seen_tab = true;
                                        }

                                        const cols = columnsUntilTabStop(self.column, 8);
                                        // If the tab character would only be converted to a single space,
                                        // then we can just skip both the \ and the <tab> and move on.
                                        if (cols > 1) {
                                            // cols - 1 spaces in total: one returned now, cols - 2 queued.
                                            self.num_pending_spaces = @intCast(cols - 2);
                                            self.index += codepoint.byte_len;
                                            return .{ .codepoint = ' ' };
                                        }
                                    }
                                    // There's a second special case when the codepoint would be encoded
                                    // as a surrogate pair in UTF-16, as the escape 'applies' to the
                                    // high surrogate pair only in this instance. This is a side-effect
                                    // of the Win32 RC compiler preprocessor outputting UTF-16 and the
                                    // compiler itself seemingly working on code units instead of code points
                                    // in this particular instance.
                                    //
                                    // We emulate this behavior by emitting the codepoint, but with a marker
                                    // that indicates that it needs to be handled specially.
                                    if (c >= 0x10000 and c != code_pages.Codepoint.invalid) {
                                        self.index += codepoint.byte_len;
                                        return .{ .codepoint = c, .escaped_surrogate_pair = true };
                                    }
                                    // Otherwise fall through with the codepoint consumed, meaning
                                    // both the \ and the escaped codepoint are left out of the output.
                                },
                                .ascii => {
                                    // we intentionally avoid incrementing self.index
                                    // to handle the current char in the next call,
                                    // and we set backtrack so column count is handled correctly
                                    backtrack = true;
                                    return .{ .codepoint = '\\' };
                                },
                            },
                        }
                        state = .normal;
                    },
                },
                .escaped_cr => switch (c) {
                    '\r' => {},
                    '\n' => state = .escaped_newlines,
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;
                        return .{ .codepoint = '\\' };
                    },
                },
                .escaped_newlines => switch (c) {
                    // Whitespace following an escaped newline is swallowed.
                    '\r', '\n', '\t', ' ', '\x0b', '\x0c', '\xa0' => {},
                    else => {
                        // backtrack so that we handle the current char properly
                        backtrack = true;
                        state = .normal;
                    },
                },
                .escaped_octal => switch (c) {
                    '0'...'7' => {
                        // Note: We use wrapping arithmetic on a u16 here since there's been no observed
                        // string parsing scenario where an escaped integer with a value >= the u16
                        // max is interpreted as anything but the truncated u16 value.
                        string_escape_n *%= 8;
                        string_escape_n +%= std.fmt.charToDigit(@intCast(c), 8) catch unreachable;
                        string_escape_i += 1;
                        // The escape ends as soon as the maximum number of digits is reached.
                        if (string_escape_i == max_octal_escape_digits) {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                        }
                    },
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // write out whatever byte we have parsed so far
                        return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                    },
                },
                .escaped_hex => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        // At most 4 hex digits are accumulated (see max_hex_escape_digits),
                        // so the value always fits in the u16.
                        string_escape_n *= 16;
                        string_escape_n += std.fmt.charToDigit(@intCast(c), 16) catch unreachable;
                        string_escape_i += 1;
                        // The escape ends as soon as the maximum number of digits is reached.
                        if (string_escape_i == max_hex_escape_digits) {
                            self.index += codepoint.byte_len;
                            return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
                        }
                    },
                    else => {
                        // we intentionally avoid incrementing self.index
                        // to handle the current char in the next call,
                        // and we set backtrack so column count is handled correctly
                        backtrack = true;

                        // write out whatever byte we have parsed so far
                        // (even with 0 actual digits, \x alone parses to 0)
                        const escaped_value = string_escape_n;
                        return .{ .codepoint = escaped_value, .from_escaped_integer = true };
                    },
                },
            }
        }

        // The source ran out in the middle of a sequence: flush whatever the state
        // we ended up in implies.
        switch (state) {
            .normal, .escaped_newlines => {},
            .newline => {
                // <space><newline>
                self.pending_codepoint = '\n';
                return .{ .codepoint = ' ' };
            },
            .escaped, .escaped_cr => return .{ .codepoint = '\\' },
            .escaped_octal, .escaped_hex => {
                return .{ .codepoint = string_escape_n, .from_escaped_integer = true };
            },
            .quote => unreachable, // this is a bug in the lexer
        }

        return null;
    }
};

/// Options shared by the string parsing functions in this file.
pub const StringParseOptions = struct {
    /// Column at which the literal starts. The parser adds to this for the stripped
    /// opening quote (and `L` prefix) and uses the result to compute tab stops.
    /// NOTE(review): presumably the column of the literal's first byte within its
    /// source line; confirm against callers.
    start_column: usize = 0,
    /// When non-null, warnings and notes produced while parsing are appended to it.
    diagnostics: ?DiagnosticsContext = null,
    /// Code page of the parsed output. Selects how ASCII string output is encoded and
    /// how escaped integers are interpreted when an ASCII string is parsed as a wide
    /// string. Not used by `IterativeStringParser` itself.
    output_code_page: SupportedCodePage,
};

/// Parses the quoted string literal in `bytes` into a newly allocated buffer.
///
/// - `.ascii` produces bytes encoded in `options.output_code_page`.
/// - `.wide` produces little-endian UTF-16 code units with a 0 sentinel. The
///   literal must be declared wide (`L"..."`); use `parseQuotedStringAsWideString`
///   to parse any string type as a wide string.
///
/// `bytes.slice` must be the full literal including its quotes. The caller owns
/// the returned slice. Fails only if allocation fails.
pub fn parseQuotedString(
    comptime literal_type: StringType,
    allocator: std.mem.Allocator,
    bytes: SourceBytes,
    options: StringParseOptions,
) !(switch (literal_type) {
    .ascii => []u8,
    .wide => [:0]u16,
}) {
    const T = if (literal_type == .ascii) u8 else u16;
    // U+FFFD REPLACEMENT CHARACTER, emitted in place of anything that cannot be
    // represented. Spelled as an escape so that the source stays pure ASCII and
    // cannot be mangled into an invalid multi-character literal.
    const replacement_character = '\u{FFFD}';
    std.debug.assert(bytes.slice.len >= 2); // must at least have 2 double quote chars

    var buf = try std.array_list.Managed(T).initCapacity(allocator, bytes.slice.len);
    errdefer buf.deinit();

    var iterative_parser = IterativeStringParser.init(bytes, options);

    while (try iterative_parser.next()) |parsed| {
        const c = parsed.codepoint;
        switch (literal_type) {
            .ascii => switch (options.output_code_page) {
                .windows1252 => {
                    if (parsed.from_escaped_integer) {
                        // Escaped integers are written as-is, truncated to a byte
                        try buf.append(@truncate(c));
                    } else if (windows1252.bestFitFromCodepoint(c)) |best_fit| {
                        try buf.append(best_fit);
                    } else if (c < 0x10000 or c == code_pages.Codepoint.invalid) {
                        try buf.append('?');
                    } else {
                        // Codepoints that would need a UTF-16 surrogate pair become two '?'
                        try buf.appendSlice("??");
                    }
                },
                .utf8 => {
                    var codepoint_to_encode = c;
                    if (parsed.from_escaped_integer) {
                        codepoint_to_encode = @as(T, @truncate(c));
                    }
                    // A lone byte > 0x7F is never valid UTF-8 on its own
                    const escaped_integer_outside_ascii_range = parsed.from_escaped_integer and codepoint_to_encode > 0x7F;
                    if (escaped_integer_outside_ascii_range or c == code_pages.Codepoint.invalid) {
                        codepoint_to_encode = replacement_character;
                    }
                    var utf8_buf: [4]u8 = undefined;
                    const utf8_len = std.unicode.utf8Encode(codepoint_to_encode, &utf8_buf) catch unreachable;
                    try buf.appendSlice(utf8_buf[0..utf8_len]);
                },
            },
            .wide => {
                // Parsing any string type as a wide string is handled separately, see parseQuotedStringAsWideString
                std.debug.assert(iterative_parser.declared_string_type == .wide);
                if (parsed.from_escaped_integer) {
                    try buf.append(std.mem.nativeToLittle(u16, @truncate(c)));
                } else if (c == code_pages.Codepoint.invalid) {
                    try buf.append(std.mem.nativeToLittle(u16, replacement_character));
                } else if (c < 0x10000) {
                    const short: u16 = @intCast(c);
                    try buf.append(std.mem.nativeToLittle(u16, short));
                } else {
                    // Encode as a UTF-16 surrogate pair. For an escaped codepoint the
                    // escape 'consumes' the high surrogate, so only the low one is written.
                    if (!parsed.escaped_surrogate_pair) {
                        const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800;
                        try buf.append(std.mem.nativeToLittle(u16, high));
                    }
                    const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
                    try buf.append(std.mem.nativeToLittle(u16, low));
                }
            },
        }
    }

    if (literal_type == .wide) {
        return buf.toOwnedSliceSentinel(0);
    } else {
        return buf.toOwnedSlice();
    }
}

/// Parses the quoted string literal in `bytes` as an ASCII (non-wide) string.
/// Convenience wrapper around `parseQuotedString(.ascii, ...)`; the caller owns
/// the returned slice.
pub fn parseQuotedAsciiString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![]u8 {
    // The shortest possible literal is just the two quote characters: ""
    const min_literal_len = 2;
    std.debug.assert(bytes.slice.len >= min_literal_len);
    const parsed = try parseQuotedString(.ascii, allocator, bytes, options);
    return parsed;
}

/// Parses the quoted wide string literal (`L"..."`) in `bytes` into 0-terminated
/// UTF-16. Convenience wrapper around `parseQuotedString(.wide, ...)`; the caller
/// owns the returned slice.
pub fn parseQuotedWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 {
    // The shortest possible literal is the prefix plus the two quote characters: L""
    const min_literal_len = 3;
    std.debug.assert(bytes.slice.len >= min_literal_len);
    const parsed = try parseQuotedString(.wide, allocator, bytes, options);
    return parsed;
}

/// Parses any string type into a wide string.
/// If the string is declared as a wide string (L""), then it is handled normally.
/// Otherwise, things are fairly normal with the exception of escaped integers.
/// Escaped integers are handled by:
/// - Truncating the escape to a u8
/// - Reinterpreting the u8 as a byte from the *output* code page
/// - Outputting the codepoint that corresponds to the interpreted byte, or U+FFFD
///   (the replacement character) if no such interpretation is possible
/// For example, if the code page is UTF-8, then while \x80 is a valid start byte, it's
/// interpreted as a single byte, so it ends up being seen as invalid and U+FFFD is outputted.
/// If the code page is Windows-1252, then \x80 is interpreted to be € which has the
/// codepoint U+20AC, so the UTF-16 encoding of U+20AC is outputted.
///
/// The result is little-endian UTF-16 with a 0 sentinel; the caller owns the
/// returned slice. Fails only if allocation fails.
pub fn parseQuotedStringAsWideString(allocator: std.mem.Allocator, bytes: SourceBytes, options: StringParseOptions) ![:0]u16 {
    std.debug.assert(bytes.slice.len >= 2); // ""

    if (bytes.slice[0] == 'l' or bytes.slice[0] == 'L') {
        return parseQuotedWideString(allocator, bytes, options);
    }

    // Note: We're only handling the case of parsing an ASCII string into a wide string from here on out.
    // TODO: The logic below is similar to that in AcceleratorKeyCodepointTranslator, might be worth merging the two

    // U+FFFD REPLACEMENT CHARACTER, emitted in place of anything that cannot be
    // represented. Spelled as an escape so that the source stays pure ASCII and
    // cannot be mangled into an invalid multi-character literal.
    const replacement_character = '\u{FFFD}';

    var buf = try std.array_list.Managed(u16).initCapacity(allocator, bytes.slice.len);
    errdefer buf.deinit();

    var iterative_parser = IterativeStringParser.init(bytes, options);

    while (try iterative_parser.next()) |parsed| {
        const c = parsed.codepoint;
        if (parsed.from_escaped_integer) {
            std.debug.assert(c != code_pages.Codepoint.invalid);
            const byte_to_interpret: u8 = @truncate(c);
            const code_unit_to_encode: u16 = switch (options.output_code_page) {
                .windows1252 => windows1252.toCodepoint(byte_to_interpret),
                // A lone byte > 0x7F is never valid UTF-8 on its own
                .utf8 => if (byte_to_interpret > 0x7F) replacement_character else byte_to_interpret,
            };
            try buf.append(std.mem.nativeToLittle(u16, code_unit_to_encode));
        } else if (c == code_pages.Codepoint.invalid) {
            try buf.append(std.mem.nativeToLittle(u16, replacement_character));
        } else if (c < 0x10000) {
            const short: u16 = @intCast(c);
            try buf.append(std.mem.nativeToLittle(u16, short));
        } else {
            // Encode as a UTF-16 surrogate pair, leaving out the high surrogate
            // when the parser flagged the codepoint as an escaped surrogate pair.
            if (!parsed.escaped_surrogate_pair) {
                const high = @as(u16, @intCast((c - 0x10000) >> 10)) + 0xD800;
                try buf.append(std.mem.nativeToLittle(u16, high));
            }
            const low = @as(u16, @intCast(c & 0x3FF)) + 0xDC00;
            try buf.append(std.mem.nativeToLittle(u16, low));
        }
    }

    return buf.toOwnedSliceSentinel(0);
}

test "parse quoted ascii string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    try std.testing.expectEqualSlices(u8, "hello", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"hello"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // hex with 0 digits
    try std.testing.expectEqualSlices(u8, "\x00", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\x"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // hex max of 2 digits
    try std.testing.expectEqualSlices(u8, "\xFFf", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\XfFf"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // octal with invalid octal digit
    try std.testing.expectEqualSlices(u8, "\x019", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\19"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // escaped quotes
    try std.testing.expectEqualSlices(u8, " \" ", try parseQuotedAsciiString(arena, .{
        .slice =
        \\" "" "
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // backslash right before escaped quotes
    try std.testing.expectEqualSlices(u8, "\"", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\"""
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // octal overflow
    try std.testing.expectEqualSlices(u8, "\x01", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\401"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // escapes
    try std.testing.expectEqualSlices(u8, "\x08\n\r\t\\", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\a\n\r\t\\"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // uppercase escapes
    try std.testing.expectEqualSlices(u8, "\x08\\N\\R\t\\", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\A\N\R\T\\"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // backslash on its own
    try std.testing.expectEqualSlices(u8, "\\", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // unrecognized escapes
    try std.testing.expectEqualSlices(u8, "\\b", try parseQuotedAsciiString(arena, .{
        .slice =
        \\"\b"
        ,
        .code_page = .windows1252,
    }, .{
        .output_code_page = .windows1252,
    }));
    // escaped carriage returns
    try std.testing.expectEqualSlices(u8, "\\", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\r\r\r\r\r\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // escaped newlines
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\n\n\n\n\n\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // escaped CRLF pairs
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\r\n\r\n\r\n\r\n\r\n\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // escaped newlines with other whitespace
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\n    \t\r\n \r\t\n  \t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // literal tab characters get converted to spaces (dependent on source file columns)
    try std.testing.expectEqualSlices(u8, "       ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    try std.testing.expectEqualSlices(u8, "abc    ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"abc\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    try std.testing.expectEqualSlices(u8, "abcdefg        ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"abcdefg\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    try std.testing.expectEqualSlices(u8, "\\      ", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\t\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // literal CR's get dropped
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\r\r\r\r\r\"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
    // contiguous newlines and whitespace get collapsed to <space><newline>
    try std.testing.expectEqualSlices(u8, " \n", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\n\r\r  \r\n \t  \"", .code_page = .windows1252 },
        .{ .output_code_page = .windows1252 },
    ));
}

test "parse quoted ascii string with utf8 code page" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // An empty literal produces an empty result
    try std.testing.expectEqualSlices(u8, "", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Codepoints that don't have a Windows-1252 representation get converted to ?
    try std.testing.expectEqualSlices(u8, "?????????", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Codepoints that have a best fit mapping get converted accordingly,
    // these are box drawing codepoints
    try std.testing.expectEqualSlices(u8, "\x2b\x2d\x2b", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"┌─┐\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Invalid UTF-8 gets converted to ? depending on well-formedness
    try std.testing.expectEqualSlices(u8, "????", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Codepoints that would require a UTF-16 surrogate pair get converted to ??
    try std.testing.expectEqualSlices(u8, "??", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xF2\xAF\xBA\xB4\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));

    // Output code page changes how invalid UTF-8 gets converted, since it
    // now encodes the result as UTF-8 so it can write replacement characters.
    // The expected U+FFFD codepoints are written as \u{FFFD} escapes so that
    // the expectation does not depend on how this source file is encoded.
    try std.testing.expectEqualSlices(u8, "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .utf8 },
    ));
    // A valid 4-byte sequence is passed through unchanged when the output is UTF-8
    try std.testing.expectEqualSlices(u8, "\xF2\xAF\xBA\xB4", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\xF2\xAF\xBA\xB4\"", .code_page = .utf8 },
        .{ .output_code_page = .utf8 },
    ));

    // This used to cause integer overflow when reconsuming the 4-byte long codepoint
    // after the escaped CRLF pair.
    try std.testing.expectEqualSlices(u8, "\u{10348}", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\\\r\n\u{10348}\"", .code_page = .utf8 },
        .{ .output_code_page = .utf8 },
    ));
}

test "parse quoted string with different input/output code pages" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // Windows-1252 input, UTF-8 output: the literal 0x80 byte becomes the euro
    // sign, the next three escapes each come out as U+FFFD, and the last two
    // escapes come out as the bytes 0x60 and 0x7F.
    // U+FFFD is written as \u{FFFD} so that the expectation does not depend on
    // how this source file is encoded.
    try std.testing.expectEqualSlices(u8, "€\u{FFFD}\u{FFFD}\u{FFFD}\x60\x7F", try parseQuotedAsciiString(
        arena,
        .{ .slice = "\"\x80\\x8a\\600\\612\\540\\577\"", .code_page = .windows1252 },
        .{ .output_code_page = .utf8 },
    ));
}

test "parse quoted wide string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // Each case below builds the expected UTF-16LE units first, then parses
    // the literal, then compares the two (including the 0 sentinel).

    // plain text
    {
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("hello");
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"hello\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }
    // hex with 0 digits
    {
        const expected = [_:0]u16{0x0};
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"\\x\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, &expected, actual);
    }
    // hex max of 4 digits
    {
        const expected = [_:0]u16{
            std.mem.nativeToLittle(u16, 0xFFFF),
            std.mem.nativeToLittle(u16, 'f'),
        };
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"\\XfFfFf\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, &expected, actual);
    }
    // octal max of 7 digits
    {
        const expected = [_:0]u16{
            std.mem.nativeToLittle(u16, 0x9493),
            std.mem.nativeToLittle(u16, '3'),
            std.mem.nativeToLittle(u16, '3'),
        };
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"\\111222333\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, &expected, actual);
    }
    // octal overflow
    {
        const expected = [_:0]u16{std.mem.nativeToLittle(u16, 0xFF01)};
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"\\777401\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, &expected, actual);
    }
    // literal tab characters get converted to spaces (dependent on source file columns)
    {
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("abcdefg       ");
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"abcdefg\t\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }
    // Windows-1252 conversion
    {
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("ðð€€€");
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }
    // Invalid escape sequences are skipped
    {
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("");
        const actual = try parseQuotedWideString(
            arena,
            .{ .slice = "L\"\\H\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }
}

test "parse quoted wide string with utf8 code page" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // An empty literal produces only the sentinel
    try std.testing.expectEqualSentinel(u16, 0, &[_:0]u16{}, try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Valid UTF-8 is converted to the matching UTF-16 code units
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("кириллица"), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"кириллица\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
    // Invalid UTF-8 gets converted to U+FFFD (the replacement character)
    // depending on well-formedness. U+FFFD is written as \u{FFFD} so that the
    // expectation does not depend on how this source file is encoded.
    try std.testing.expectEqualSentinel(u16, 0, std.unicode.utf8ToUtf16LeStringLiteral("\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}"), try parseQuotedWideString(
        arena,
        .{ .slice = "L\"\xf0\xf0\x80\x80\x80\"", .code_page = .utf8 },
        .{ .output_code_page = .windows1252 },
    ));
}

test "parse quoted ascii string as wide string" {
    var arena_allocator = std.heap.ArenaAllocator.init(std.testing.allocator);
    defer arena_allocator.deinit();
    const arena = arena_allocator.allocator();

    // A literal without the L prefix is still returned as UTF-16
    {
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("кириллица");
        const actual = try parseQuotedStringAsWideString(
            arena,
            .{ .slice = "\"кириллица\"", .code_page = .utf8 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }

    // Whether or not invalid escapes are skipped is still determined by the L prefix
    {
        // no prefix: the unrecognized escape is kept as-is
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("\\H");
        const actual = try parseQuotedStringAsWideString(
            arena,
            .{ .slice = "\"\\H\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }
    {
        // L prefix: the unrecognized escape is skipped
        const expected = std.unicode.utf8ToUtf16LeStringLiteral("");
        const actual = try parseQuotedStringAsWideString(
            arena,
            .{ .slice = "L\"\\H\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, expected, actual);
    }

    // Maximum escape sequence value is also determined by the L prefix
    {
        // no prefix: only two hex digits are consumed, the rest are plain characters
        const expected = [_:0]u16{
            std.mem.nativeToLittle(u16, 0x12),
            std.mem.nativeToLittle(u16, '3'),
            std.mem.nativeToLittle(u16, '4'),
        };
        const actual = try parseQuotedStringAsWideString(
            arena,
            .{ .slice = "\"\\x1234\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, &expected, actual);
    }
    {
        // L prefix: all four hex digits are consumed
        const expected = [_:0]u16{std.mem.nativeToLittle(u16, 0x1234)};
        const actual = try parseQuotedStringAsWideString(
            arena,
            .{ .slice = "L\"\\x1234\"", .code_page = .windows1252 },
            .{ .output_code_page = .windows1252 },
        );
        try std.testing.expectEqualSentinel(u16, 0, &expected, actual);
    }
}

/// Returns the number of columns between `column` and the next tab stop,
/// where tab stops occur every `tab_columns` columns. A column that is
/// already on a tab stop yields a full `tab_columns`, never 0.
///
/// With `tab_columns` = 8:
///   0 => 8, 1 => 7, 2 => 6, 3 => 5, 4 => 4,
///   5 => 3, 6 => 2, 7 => 1, 8 => 8
pub fn columnsUntilTabStop(column: usize, tab_columns: usize) usize {
    const columns_past_previous_stop = column % tab_columns;
    return tab_columns - columns_past_previous_stop;
}

/// Returns how many columns the byte `c` occupies when it appears at
/// `cur_column`: a tab extends to the next tab stop (as computed by
/// `columnsUntilTabStop` with `tab_columns`), any other byte is one column wide.
pub fn columnWidth(cur_column: usize, c: u8, tab_columns: usize) usize {
    if (c == '\t') return columnsUntilTabStop(cur_column, tab_columns);
    return 1;
}

/// A 32-bit numeric value plus a flag recording whether it is marked as long.
pub const Number = struct {
    /// The numeric value; all arithmetic on it wraps at 32 bits.
    value: u32,
    /// Whether the number carries the long marker (e.g. an `L`/`l` suffix on
    /// a parsed literal).
    is_long: bool = false,

    /// Returns the low 16 bits of `value`; the high 16 bits are discarded.
    pub fn asWord(self: Number) u16 {
        const low_bits: u16 = @truncate(self.value);
        return low_bits;
    }

    /// Applies the binary operator `operator_char` (one of `+`, `-`, `|`, `&`)
    /// to `lhs` and `rhs`. Addition and subtraction wrap on overflow. The
    /// result is long if either operand is long.
    ///
    /// Any other `operator_char` is a bug in the caller (lexer/parser) and is
    /// treated as unreachable.
    pub fn evaluateOperator(lhs: Number, operator_char: u8, rhs: Number) Number {
        const a = lhs.value;
        const b = rhs.value;
        return .{
            .value = switch (operator_char) {
                '+' => a +% b,
                '-' => a -% b,
                '&' => a & b,
                '|' => a | b,
                else => unreachable, // invalid operator, this would be a lexer/parser bug
            },
            .is_long = lhs.is_long or rhs.is_long,
        };
    }
};

/// Parses a number literal into a `Number`. Parsing never fails: it simply
/// stops at the first character that cannot be part of the number.
///
/// The literal is read as, in order:
///  - an optional single `-` (negate) or `~` (bitwise complement) prefix
///  - an optional radix prefix, only recognized when more input follows it:
///    `0o` (octal, lowercase only) or `0x`/`0X` (hex); otherwise radix 10
///  - digits of that radix, accumulated into a u32 that wraps on overflow
///  - an `L` or `l`, which marks the result as long and ends parsing
///
/// `bytes.slice` must not be empty (asserted).
///
/// Assumes that number literals normally rejected by RC's preprocessor
/// are similarly rejected before being parsed.
///
/// Relevant RC preprocessor errors:
///  RC2021: expected exponent value, not '<digit>'
///   example that is rejected: 1e1
///   example that is accepted: 1ea
///   (this function will parse the two examples above the same)
pub fn parseNumberLiteral(bytes: SourceBytes) Number {
    std.debug.assert(bytes.slice.len > 0);

    const Sign = enum { none, minus, complement };
    const sign: Sign = switch (bytes.slice[0]) {
        '-' => .minus,
        '~' => .complement,
        else => .none,
    };
    // Everything after the (optional) one-byte sign prefix.
    const unsigned = if (sign == .none) bytes.slice else bytes.slice[1..];

    var radix: u8 = 10;
    var digits = unsigned;
    if (unsigned.len > 2 and unsigned[0] == '0') {
        const prefix_radix: ?u8 = switch (unsigned[1]) {
            'o' => 8, // octal radix prefix is case-sensitive
            'x', 'X' => 16,
            else => null,
        };
        if (prefix_radix) |r| {
            radix = r;
            digits = unsigned[2..];
        }
    }

    var value: u32 = 0;
    var is_long = false;
    var offset: usize = 0;
    while (bytes.code_page.codepointAt(offset, digits)) |codepoint| : (offset += codepoint.byte_len) {
        const c = codepoint.value;
        if (c == 'L' or c == 'l') {
            is_long = true;
            break;
        }
        // Only ASCII can be a digit; on anything that is not a valid digit
        // for the radix, just stop parsing but don't fail.
        if (c > 0x7F) break;
        const digit = std.fmt.charToDigit(@intCast(c), radix) catch break;

        value = value *% radix +% digit;
    }

    return .{
        .value = switch (sign) {
            .none => value,
            .minus => 0 -% value,
            .complement => ~value,
        },
        .is_long = is_long,
    };
}

test "parse number literal" {
    // Table of literals and the `Number` each one must parse to. All inputs
    // are parsed with the Windows-1252 code page.
    const Case = struct {
        input: []const u8,
        value: u32,
        is_long: bool = false,
    };
    const cases = [_]Case{
        .{ .input = "0", .value = 0 },
        .{ .input = "1", .value = 1 },
        .{ .input = "1L", .value = 1, .is_long = true },
        .{ .input = "1l", .value = 1, .is_long = true },
        // parsing stops at the first invalid digit, so the trailing L is never reached
        .{ .input = "1garbageL", .value = 1 },
        // values wrap at 32 bits
        .{ .input = "4294967295", .value = 4294967295 },
        .{ .input = "4294967296", .value = 0 },
        .{ .input = "4294967297L", .value = 1, .is_long = true },

        // hex, with either case of prefix and digits
        .{ .input = "0x20", .value = 0x20 },
        .{ .input = "0x2AL", .value = 0x2A, .is_long = true },
        .{ .input = "0x2aL", .value = 0x2A, .is_long = true },
        .{ .input = "0X2aL", .value = 0x2A, .is_long = true },

        // octal; the prefix is only recognized in lowercase
        .{ .input = "0o20", .value = 0o20 },
        .{ .input = "0o20L", .value = 0o20, .is_long = true },
        .{ .input = "0o29", .value = 0o2 },
        .{ .input = "0O29", .value = 0 },

        // - and ~ prefixes
        .{ .input = "-1", .value = 0xFFFFFFFF },
        .{ .input = "~1", .value = 0xFFFFFFFE },
        .{ .input = "-4294967297L", .value = 0xFFFFFFFF, .is_long = true },
        .{ .input = "~4294967297L", .value = 0xFFFFFFFE, .is_long = true },
        .{ .input = "-0X3", .value = 0xFFFFFFFD },

        // anything after L is ignored
        .{ .input = "0x2aL5", .value = 0x2A, .is_long = true },
    };
    for (cases) |case| {
        const parsed = parseNumberLiteral(.{ .slice = case.input, .code_page = .windows1252 });
        try std.testing.expectEqual(Number{ .value = case.value, .is_long = case.is_long }, parsed);
    }

    // can handle any length of number, wraps on overflow appropriately
    const big_overflow = parseNumberLiteral(.{ .slice = "1000000000000000000000000000000000000000000000000000000000000000000000000000000090000000001", .code_page = .windows1252 });
    try std.testing.expectEqual(Number{ .value = 4100654081, .is_long = false }, big_overflow);
    try std.testing.expectEqual(@as(u16, 1025), big_overflow.asWord());
}