zig/lib/compiler/resinator/lex.zig
Andrew Kelley 0e37ff0d59 std.fmt: breaking API changes
added adapter to AnyWriter and GenericWriter to help bridge the gap
between old and new API

make std.testing.expectFmt work at compile-time

std.fmt no longer has a dependency on std.unicode. Formatted printing
was never properly unicode-aware. Now it no longer pretends to be.

Breakage/deprecations:
* std.fs.File.reader -> std.fs.File.deprecatedReader
* std.fs.File.writer -> std.fs.File.deprecatedWriter
* std.io.GenericReader -> std.io.Reader
* std.io.GenericWriter -> std.io.Writer
* std.io.AnyReader -> std.io.Reader
* std.io.AnyWriter -> std.io.Writer
* std.fmt.format -> std.fmt.deprecatedFormat
* std.fmt.fmtSliceEscapeLower -> std.ascii.hexEscape
* std.fmt.fmtSliceEscapeUpper -> std.ascii.hexEscape
* std.fmt.fmtSliceHexLower -> {x}
* std.fmt.fmtSliceHexUpper -> {X}
* std.fmt.fmtIntSizeDec -> {B}
* std.fmt.fmtIntSizeBin -> {Bi}
* std.fmt.fmtDuration -> {D}
* std.fmt.fmtDurationSigned -> {D}
* {} -> {f} when there is a format method
* format method signature
  - anytype -> *std.io.Writer
  - inferred error set -> error{WriteFailed}
  - options -> (deleted)
* std.fmt.Formatted
  - now takes context type explicitly
  - no fmt string
2025-07-07 22:43:51 -07:00

1129 lines
48 KiB
Zig

//! Expects to be run after the C preprocessor and after `removeComments`.
//! This means that the lexer assumes that:
//! - Splices ('\' at the end of a line) have been handled/collapsed.
//! - Preprocessor directives and macros have been expanded (any remaining should be skipped with the exception of `#pragma code_page`).
//! - All comments have been removed.
const std = @import("std");
const ErrorDetails = @import("errors.zig").ErrorDetails;
const columnWidth = @import("literals.zig").columnWidth;
const code_pages = @import("code_pages.zig");
const SupportedCodePage = code_pages.SupportedCodePage;
const SourceMappings = @import("source_mapping.zig").SourceMappings;
const isNonAsciiDigit = @import("utils.zig").isNonAsciiDigit;
// Debug aid: when true, each lexed token is printed to stderr while the tests run.
const dumpTokensDuringTests = false;
/// Default cap on the (pre-parsed) length of a string literal; 4097 matches the
/// Win32 RC compiler's default limit (see the note in `nextNormalWithContext`).
pub const default_max_string_literal_codepoints = 4097;
/// A single token produced by the Lexer: a tag plus a byte range into the
/// source buffer and the (1-based) line number it started on.
pub const Token = struct {
    id: Id,
    start: usize,
    end: usize,
    line_number: usize,

    pub const Id = enum {
        literal,
        number,
        quoted_ascii_string,
        quoted_wide_string,
        operator,
        begin,
        end,
        comma,
        open_paren,
        close_paren,
        /// This Id is only used for errors, the Lexer will never return one
        /// of these from a `next` call.
        preprocessor_command,
        invalid,
        eof,

        /// Human-readable name of the token kind, for use in error messages.
        pub fn nameForErrorDisplay(self: Id) []const u8 {
            return switch (self) {
                .literal => "<literal>",
                .number => "<number>",
                .quoted_ascii_string => "<quoted ascii string>",
                .quoted_wide_string => "<quoted wide string>",
                .operator => "<operator>",
                .begin => "<'{' or BEGIN>",
                .end => "<'}' or END>",
                .comma => ",",
                .open_paren => "(",
                .close_paren => ")",
                .preprocessor_command => "<preprocessor command>",
                .invalid => unreachable,
                .eof => "<eof>",
            };
        }
    };

    /// The bytes of this token within `buffer`.
    pub fn slice(self: Token, buffer: []const u8) []const u8 {
        return buffer[self.start..self.end];
    }

    /// Returns 0-based column of the token's start, accounting for tab stops
    /// (`tab_columns` columns per tab). If `maybe_line_start` is null, the line
    /// start is located by scanning backwards from the token.
    pub fn calculateColumn(token: Token, source: []const u8, tab_columns: usize, maybe_line_start: ?usize) usize {
        const from = maybe_line_start orelse token.getLineStartForColumnCalc(source);
        var column: usize = 0;
        for (source[from..token.start]) |byte| {
            column += columnWidth(column, byte, tab_columns);
        }
        return column;
    }

    // TODO: More testing is needed to determine if this can be merged with getLineStartForErrorDisplay
    // (the TODO in currentIndexFormsLineEndingPair should be taken into account as well)
    /// Index of the first byte after the nearest preceding '\n' (or 0 if none).
    pub fn getLineStartForColumnCalc(token: Token, source: []const u8) usize {
        if (token.start == 0) return 0;
        // Scan backwards starting at the byte before the token.
        var i: usize = token.start - 1;
        while (true) : (i -= 1) {
            if (source[i] == '\n') return @min(source.len - 1, i + 1);
            if (i == 0) return 0;
        }
    }

    /// Index of the first byte after the nearest preceding '\r' or '\n' (or 0 if none).
    pub fn getLineStartForErrorDisplay(token: Token, source: []const u8) usize {
        if (token.start == 0) return 0;
        // Scan backwards starting at the byte before the token.
        var i: usize = token.start - 1;
        while (true) : (i -= 1) {
            switch (source[i]) {
                '\r', '\n' => return @min(source.len - 1, i + 1),
                else => {},
            }
            if (i == 0) return 0;
        }
    }

    /// The full line of `source` containing the start of this token,
    /// without any trailing line ending.
    pub fn getLineForErrorDisplay(token: Token, source: []const u8, maybe_line_start: ?usize) []const u8 {
        const begin = maybe_line_start orelse token.getLineStartForErrorDisplay(source);
        var cursor = begin;
        while (cursor < source.len) : (cursor += 1) {
            switch (source[cursor]) {
                '\r', '\n' => break,
                else => {},
            }
        }
        return source[begin..cursor];
    }

    /// True for both ASCII and wide quoted string tokens.
    pub fn isStringLiteral(token: Token) bool {
        return switch (token.id) {
            .quoted_ascii_string, .quoted_wide_string => true,
            else => false,
        };
    }
};
/// Tracks the current 1-based line number while scanning `buffer`, treating
/// mixed CRLF / LFCR sequences as a single line ending.
pub const LineHandler = struct {
    line_number: usize = 1,
    buffer: []const u8,
    last_line_ending_index: ?usize = null,

    /// Like incrementLineNumber but checks that the current char is a line ending first.
    /// Returns the new line number if it was incremented, null otherwise.
    pub fn maybeIncrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
        return switch (self.buffer[cur_index]) {
            '\r', '\n' => self.incrementLineNumber(cur_index),
            else => null,
        };
    }

    /// Increments line_number appropriately (handling line ending pairs)
    /// and returns the new line number if it was incremented, or null otherwise.
    pub fn incrementLineNumber(self: *LineHandler, cur_index: usize) ?usize {
        if (self.currentIndexFormsLineEndingPair(cur_index)) {
            // Second half of a \r\n or \n\r pair: consume it without counting a new line.
            self.last_line_ending_index = null;
            return null;
        }
        self.line_number += 1;
        self.last_line_ending_index = cur_index;
        return self.line_number;
    }

    /// \r\n and \n\r pairs are treated as a single line ending (but not \r\r \n\n)
    /// expects self.index and last_line_ending_index (if non-null) to contain line endings
    ///
    /// TODO: This is not really how the Win32 RC compiler handles line endings. Instead, it
    /// seems to drop all carriage returns during preprocessing and then replace all
    /// remaining line endings with well-formed CRLF pairs (e.g. `<CR>a<CR>b<LF>c` becomes `ab<CR><LF>c`).
    /// Handling this the same as the Win32 RC compiler would need control over the preprocessor,
    /// since Clang converts unpaired <CR> into unpaired <LF>.
    pub fn currentIndexFormsLineEndingPair(self: *const LineHandler, cur_index: usize) bool {
        const last_index = self.last_line_ending_index orelse return false;
        // The previous line ending must immediately precede the current index; we know
        // cur_index must be >= 1 since last_line_ending_index is non-null (so if the
        // subtraction overflows it is a bug at the callsite of this function).
        if (last_index != cur_index - 1) return false;
        const cur_line_ending = self.buffer[cur_index];
        const last_line_ending = self.buffer[last_index];
        // sanity check
        std.debug.assert(cur_line_ending == '\r' or cur_line_ending == '\n');
        std.debug.assert(last_line_ending == '\r' or last_line_ending == '\n');
        // \n\n and \r\r are two separate line endings; only mixed pairs collapse.
        return last_line_ending != cur_line_ending;
    }
};
/// Errors returned by the `Lexer`. Details about the offending token are
/// available via `Lexer.getErrorDetails` after one of these is returned.
pub const LexError = error{
    UnfinishedStringLiteral,
    StringLiteralTooLong,
    InvalidNumberWithExponent,
    InvalidDigitCharacterInNumberLiteral,
    IllegalByte,
    IllegalByteOutsideStringLiterals,
    IllegalCodepointOutsideStringLiterals,
    IllegalByteOrderMark,
    IllegalPrivateUseCharacter,
    FoundCStyleEscapedQuote,
    CodePagePragmaMissingLeftParen,
    CodePagePragmaMissingRightParen,
    /// Can be caught and ignored
    CodePagePragmaInvalidCodePage,
    CodePagePragmaNotInteger,
    CodePagePragmaOverflow,
    CodePagePragmaUnsupportedCodePage,
    /// Can be caught and ignored
    CodePagePragmaInIncludedFile,
};
/// Tokenizer for .rc source. Codepoints are decoded via `current_code_page`,
/// which can change mid-lex when a `#pragma code_page` directive is encountered.
pub const Lexer = struct {
    const Self = @This();

    buffer: []const u8,
    index: usize,
    line_handler: LineHandler,
    at_start_of_line: bool = true,
    /// Set whenever a LexError is returned; consumed by `getErrorDetails`.
    error_context_token: ?Token = null,
    current_code_page: SupportedCodePage,
    default_code_page: SupportedCodePage,
    source_mappings: ?*SourceMappings,
    max_string_literal_codepoints: u15,
    /// Needed to determine whether or not the output code page should
    /// be set in the parser. Saturates (`+|=`) instead of wrapping.
    seen_pragma_code_pages: u2 = 0,
    last_pragma_code_page_token: ?Token = null,

    pub const Error = LexError;

    pub const LexerOptions = struct {
        default_code_page: SupportedCodePage = .windows1252,
        source_mappings: ?*SourceMappings = null,
        max_string_literal_codepoints: u15 = default_max_string_literal_codepoints,
    };

    pub fn init(buffer: []const u8, options: LexerOptions) Self {
        return Self{
            .buffer = buffer,
            .index = 0,
            .current_code_page = options.default_code_page,
            .default_code_page = options.default_code_page,
            .source_mappings = options.source_mappings,
            .max_string_literal_codepoints = options.max_string_literal_codepoints,
            .line_handler = .{ .buffer = buffer },
        };
    }

    /// Debug helper: writes `<tag>:<line>: <hex-escaped slice>` to stderr.
    pub fn dump(self: *Self, token: *const Token) void {
        std.debug.print("{s}:{d}: {f}\n", .{
            @tagName(token.id), token.line_number, std.ascii.hexEscape(token.slice(self.buffer), .lower),
        });
    }

    pub const LexMethod = enum {
        whitespace_delimiter_only,
        normal,
        normal_expect_operator,
    };

    /// Returns the next token, dispatching to the lexing strategy chosen at comptime.
    pub fn next(self: *Self, comptime method: LexMethod) LexError!Token {
        switch (method) {
            .whitespace_delimiter_only => return self.nextWhitespaceDelimeterOnly(),
            .normal => return self.nextNormal(),
            .normal_expect_operator => return self.nextNormalWithContext(.expect_operator),
        }
    }

    const StateWhitespaceDelimiterOnly = enum {
        start,
        literal,
        preprocessor,
        semicolon,
    };

    /// Lexes a token that is delimited only by whitespace (no operator, quote,
    /// or punctuation handling). Note: the "Delimeter" spelling is kept as-is
    /// to preserve the public API.
    pub fn nextWhitespaceDelimeterOnly(self: *Self) LexError!Token {
        const start_index = self.index;
        var result = Token{
            .id = .eof,
            .start = start_index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state = StateWhitespaceDelimiterOnly.start;

        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            const c = codepoint.value;
            try self.checkForIllegalCodepoint(codepoint, false);
            switch (state) {
                .start => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        result.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    // TODO: This should either be removed, or it should also include
                    //       the codepoints listed in disjoint_code_page.zig
                    '\xA0' => if (self.at_start_of_line) {
                        result.start = self.index + codepoint.byte_len;
                    } else {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                    '#' => {
                        if (self.at_start_of_line) {
                            state = .preprocessor;
                        } else {
                            state = .literal;
                        }
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    else => {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .literal => switch (c) {
                    '\r', '\n', ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.id = .literal;
                        break;
                    },
                    else => {},
                },
                .preprocessor => switch (c) {
                    '\r', '\n' => {
                        try self.evaluatePreprocessorCommand(result.start, self.index);
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                .semicolon => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
            }
        } else { // got EOF
            switch (state) {
                .start => {},
                .semicolon => {
                    // Skip past everything up to the EOF
                    result.start = self.index;
                },
                .literal => {
                    result.id = .literal;
                },
                .preprocessor => {
                    try self.evaluatePreprocessorCommand(result.start, self.index);
                    result.start = self.index;
                },
            }
        }

        result.end = self.index;
        // EOF tokens must have their start index match the end index
        std.debug.assert(result.id != .eof or result.start == result.end);
        return result;
    }

    const StateNormal = enum {
        start,
        literal_or_quoted_wide_string,
        quoted_ascii_string,
        quoted_wide_string,
        quoted_ascii_string_escape,
        quoted_wide_string_escape,
        quoted_ascii_string_maybe_end,
        quoted_wide_string_maybe_end,
        literal,
        number_literal,
        preprocessor,
        semicolon,
        // end
        e,
        en,
        // begin
        b,
        be,
        beg,
        begi,
    };

    /// TODO: A not-terrible name
    pub fn nextNormal(self: *Self) LexError!Token {
        return self.nextNormalWithContext(.any);
    }

    /// Lexes the next token. When `context` is `.expect_operator`, a leading `-`
    /// is lexed as an operator instead of the start of a number literal.
    pub fn nextNormalWithContext(self: *Self, context: enum { expect_operator, any }) LexError!Token {
        const start_index = self.index;
        var result = Token{
            .id = .eof,
            .start = start_index,
            .end = undefined,
            .line_number = self.line_handler.line_number,
        };
        var state = StateNormal.start;

        // Note: The Windows RC compiler uses a non-standard method of computing
        // length for its 'string literal too long' errors; it isn't easily
        // explained or intuitive (it's sort-of pre-parsed byte length but with
        // a few of exceptions/edge cases).
        //
        // It also behaves strangely with non-ASCII codepoints, e.g. even though the default
        // limit is 4097, you can only have 4094 € codepoints (1 UTF-16 code unit each),
        // and 2048 𐐷 codepoints (2 UTF-16 code units each).
        //
        // TODO: Understand this more, bring it more in line with how the Win32 limits work.
        //       Alternatively, do something that makes more sense but may be more permissive.
        var string_literal_length: usize = 0;
        // Keeping track of the string literal column prevents pathological edge cases when
        // there are tons of tab stop characters within a string literal.
        var string_literal_column: usize = 0;
        var string_literal_collapsing_whitespace: bool = false;
        var still_could_have_exponent: bool = true;
        var exponent_index: ?usize = null;

        while (self.current_code_page.codepointAt(self.index, self.buffer)) |codepoint| : (self.index += codepoint.byte_len) {
            const c = codepoint.value;
            const in_string_literal = switch (state) {
                .quoted_ascii_string,
                .quoted_wide_string,
                .quoted_ascii_string_escape,
                .quoted_wide_string_escape,
                .quoted_ascii_string_maybe_end,
                .quoted_wide_string_maybe_end,
                =>
                // If the current line is not the same line as the start of the string literal,
                // then we want to treat the current codepoint as 'not in a string literal'
                // for the purposes of detecting illegal codepoints. This means that we will
                // error on illegal-outside-string-literal characters that are outside string
                // literals from the perspective of a C preprocessor, but that may be
                // inside string literals from the perspective of the RC lexer. For example,
                //   "hello
                //   @"
                // will be treated as a single string literal by the RC lexer but the Win32
                // preprocessor will consider this an unclosed string literal followed by
                // the character @ and ", and will therefore error since the Win32 RC preprocessor
                // errors on the @ character outside string literals.
                //
                // By doing this here, we can effectively emulate the Win32 RC preprocessor behavior
                // at lex-time, and avoid the need for a separate step that checks for this edge-case
                // specifically.
                result.line_number == self.line_handler.line_number,
                else => false,
            };
            try self.checkForIllegalCodepoint(codepoint, in_string_literal);
            switch (state) {
                .start => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        result.line_number = self.incrementLineNumber();
                    },
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F' => {
                        result.start = self.index + 1;
                    },
                    // NBSP only counts as whitespace at the start of a line (but
                    // can be intermixed with other whitespace). Who knows why.
                    '\xA0' => if (self.at_start_of_line) {
                        result.start = self.index + codepoint.byte_len;
                    } else {
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                    'L', 'l' => {
                        state = .literal_or_quoted_wide_string;
                        self.at_start_of_line = false;
                    },
                    'E', 'e' => {
                        state = .e;
                        self.at_start_of_line = false;
                    },
                    'B', 'b' => {
                        state = .b;
                        self.at_start_of_line = false;
                    },
                    '"' => {
                        state = .quoted_ascii_string;
                        self.at_start_of_line = false;
                        string_literal_collapsing_whitespace = false;
                        string_literal_length = 0;
                        // Never mutated, so this must be `const` (a `var` here is a compile error).
                        const dummy_token = Token{
                            .start = self.index,
                            .end = self.index,
                            .line_number = self.line_handler.line_number,
                            .id = .invalid,
                        };
                        string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
                    },
                    '+', '&', '|' => {
                        self.index += 1;
                        result.id = .operator;
                        self.at_start_of_line = false;
                        break;
                    },
                    '-' => {
                        if (context == .expect_operator) {
                            self.index += 1;
                            result.id = .operator;
                            self.at_start_of_line = false;
                            break;
                        } else {
                            state = .number_literal;
                            still_could_have_exponent = true;
                            exponent_index = null;
                            self.at_start_of_line = false;
                        }
                    },
                    '0'...'9', '~' => {
                        state = .number_literal;
                        still_could_have_exponent = true;
                        exponent_index = null;
                        self.at_start_of_line = false;
                    },
                    '#' => {
                        if (self.at_start_of_line) {
                            state = .preprocessor;
                        } else {
                            state = .literal;
                        }
                        self.at_start_of_line = false;
                    },
                    ';' => {
                        state = .semicolon;
                        self.at_start_of_line = false;
                    },
                    '{', '}' => {
                        self.index += 1;
                        result.id = if (c == '{') .begin else .end;
                        self.at_start_of_line = false;
                        break;
                    },
                    '(', ')' => {
                        self.index += 1;
                        result.id = if (c == '(') .open_paren else .close_paren;
                        self.at_start_of_line = false;
                        break;
                    },
                    ',' => {
                        self.index += 1;
                        result.id = .comma;
                        self.at_start_of_line = false;
                        break;
                    },
                    else => {
                        if (isNonAsciiDigit(c)) {
                            self.error_context_token = .{
                                .id = .number,
                                .start = result.start,
                                .end = self.index + 1,
                                .line_number = self.line_handler.line_number,
                            };
                            return error.InvalidDigitCharacterInNumberLiteral;
                        }
                        state = .literal;
                        self.at_start_of_line = false;
                    },
                },
                .preprocessor => switch (c) {
                    '\r', '\n' => {
                        try self.evaluatePreprocessorCommand(result.start, self.index);
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                // Semi-colon acts as a line-terminator--everything is skipped until
                // the next line.
                .semicolon => switch (c) {
                    '\r', '\n' => {
                        result.start = self.index + 1;
                        state = .start;
                        result.line_number = self.incrementLineNumber();
                    },
                    else => {},
                },
                .number_literal => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    => {
                    // zig fmt: on
                        result.id = .number;
                        break;
                    },
                    '0'...'9' => {
                        if (exponent_index) |exp_i| {
                            // A digit directly following the 'e'/'E' of an exponent is an error.
                            if (self.index - 1 == exp_i) {
                                // Note: This being an error is a quirk of the preprocessor used by
                                // the Win32 RC compiler.
                                self.error_context_token = .{
                                    .id = .number,
                                    .start = result.start,
                                    .end = self.index + 1,
                                    .line_number = self.line_handler.line_number,
                                };
                                return error.InvalidNumberWithExponent;
                            }
                        }
                    },
                    'e', 'E' => {
                        if (still_could_have_exponent) {
                            exponent_index = self.index;
                            still_could_have_exponent = false;
                        }
                    },
                    else => {
                        if (isNonAsciiDigit(c)) {
                            self.error_context_token = .{
                                .id = .number,
                                .start = result.start,
                                .end = self.index + 1,
                                .line_number = self.line_handler.line_number,
                            };
                            return error.InvalidDigitCharacterInNumberLiteral;
                        }
                        still_could_have_exponent = false;
                    },
                },
                .literal_or_quoted_wide_string => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    // zig fmt: on
                    => {
                        result.id = .literal;
                        break;
                    },
                    '"' => {
                        state = .quoted_wide_string;
                        string_literal_collapsing_whitespace = false;
                        string_literal_length = 0;
                        // Never mutated, so this must be `const` (a `var` here is a compile error).
                        const dummy_token = Token{
                            .start = self.index,
                            .end = self.index,
                            .line_number = self.line_handler.line_number,
                            .id = .invalid,
                        };
                        string_literal_column = dummy_token.calculateColumn(self.buffer, 8, null);
                    },
                    else => {
                        state = .literal;
                    },
                },
                .literal => switch (c) {
                    // zig fmt: off
                    ' ', '\t', '\x05'...'\x08', '\x0B'...'\x0C', '\x0E'...'\x1F',
                    '\r', '\n', '"', ',', '{', '}', '+', '-', '|', '&', '~', '(', ')',
                    '\'', ';', '=',
                    => {
                    // zig fmt: on
                        result.id = .literal;
                        break;
                    },
                    else => {},
                },
                .e => switch (c) {
                    'N', 'n' => {
                        state = .en;
                    },
                    else => {
                        state = .literal;
                        // Re-lex this codepoint in the .literal state.
                        self.index -= 1;
                    },
                },
                .en => switch (c) {
                    'D', 'd' => {
                        result.id = .end;
                        self.index += 1;
                        break;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .b => switch (c) {
                    'E', 'e' => {
                        state = .be;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .be => switch (c) {
                    'G', 'g' => {
                        state = .beg;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .beg => switch (c) {
                    'I', 'i' => {
                        state = .begi;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .begi => switch (c) {
                    'N', 'n' => {
                        result.id = .begin;
                        self.index += 1;
                        break;
                    },
                    else => {
                        state = .literal;
                        self.index -= 1;
                    },
                },
                .quoted_ascii_string, .quoted_wide_string => switch (c) {
                    '"' => {
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string) .quoted_ascii_string_maybe_end else .quoted_wide_string_maybe_end;
                    },
                    '\\' => {
                        string_literal_length += 1;
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string) .quoted_ascii_string_escape else .quoted_wide_string_escape;
                    },
                    '\r' => {
                        string_literal_column = 0;
                        // \r doesn't count towards string literal length
                        // Increment line number but don't affect the result token's line number
                        _ = self.incrementLineNumber();
                    },
                    '\n' => {
                        string_literal_column = 0;
                        // first \n expands to <space><\n>
                        if (!string_literal_collapsing_whitespace) {
                            string_literal_length += 2;
                            string_literal_collapsing_whitespace = true;
                        }
                        // the rest are collapsed into the <space><\n>
                        // Increment line number but don't affect the result token's line number
                        _ = self.incrementLineNumber();
                    },
                    // only \t, space, Vertical Tab, and Form Feed count as whitespace when collapsing
                    '\t', ' ', '\x0b', '\x0c' => {
                        if (!string_literal_collapsing_whitespace) {
                            // Literal tab characters are counted as the number of space characters
                            // needed to reach the next 8-column tab stop.
                            const width = columnWidth(string_literal_column, @intCast(c), 8);
                            string_literal_length += width;
                            string_literal_column += width;
                        }
                    },
                    else => {
                        string_literal_collapsing_whitespace = false;
                        string_literal_length += 1;
                        string_literal_column += 1;
                    },
                },
                .quoted_ascii_string_escape, .quoted_wide_string_escape => switch (c) {
                    '"' => {
                        self.error_context_token = .{
                            .id = .invalid,
                            .start = self.index - 1,
                            .end = self.index + 1,
                            .line_number = self.line_handler.line_number,
                        };
                        return error.FoundCStyleEscapedQuote;
                    },
                    else => {
                        string_literal_length += 1;
                        string_literal_column += 1;
                        state = if (state == .quoted_ascii_string_escape) .quoted_ascii_string else .quoted_wide_string;
                    },
                },
                .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => switch (c) {
                    '"' => {
                        state = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                        // Escaped quotes count as 1 char for string literal length checks.
                        // Since we did not increment on the first " (because it could have been
                        // the end of the quoted string), we increment here
                        string_literal_length += 1;
                        string_literal_column += 1;
                    },
                    else => {
                        result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                        break;
                    },
                },
            }
        } else { // got EOF
            switch (state) {
                .start => {},
                .semicolon => {
                    // Skip past everything up to the EOF
                    result.start = self.index;
                },
                .literal_or_quoted_wide_string, .literal, .e, .en, .b, .be, .beg, .begi => {
                    result.id = .literal;
                },
                .preprocessor => {
                    try self.evaluatePreprocessorCommand(result.start, self.index);
                    result.start = self.index;
                },
                .number_literal => {
                    result.id = .number;
                },
                .quoted_ascii_string_maybe_end, .quoted_wide_string_maybe_end => {
                    result.id = if (state == .quoted_ascii_string_maybe_end) .quoted_ascii_string else .quoted_wide_string;
                },
                .quoted_ascii_string,
                .quoted_wide_string,
                .quoted_ascii_string_escape,
                .quoted_wide_string_escape,
                => {
                    self.error_context_token = .{
                        .id = .eof,
                        .start = self.index,
                        .end = self.index,
                        .line_number = self.line_handler.line_number,
                    };
                    return LexError.UnfinishedStringLiteral;
                },
            }
        }

        result.end = self.index;
        if (result.id == .quoted_ascii_string or result.id == .quoted_wide_string) {
            if (string_literal_length > self.max_string_literal_codepoints) {
                self.error_context_token = result;
                return LexError.StringLiteralTooLong;
            }
        }
        // EOF tokens must have their start index match the end index
        std.debug.assert(result.id != .eof or result.start == result.end);
        return result;
    }

    /// Increments line_number appropriately (handling line ending pairs)
    /// and returns the new line number.
    fn incrementLineNumber(self: *Self) usize {
        _ = self.line_handler.incrementLineNumber(self.index);
        self.at_start_of_line = true;
        return self.line_handler.line_number;
    }

    /// Returns an error (and sets `error_context_token`) if the codepoint is
    /// disallowed in the current context; returns normally otherwise.
    fn checkForIllegalCodepoint(self: *Self, codepoint: code_pages.Codepoint, in_string_literal: bool) LexError!void {
        const err = switch (codepoint.value) {
            // 0x00 = NUL
            // 0x1A = Substitute (treated as EOF)
            //        NOTE: 0x1A gets treated as EOF by the clang preprocessor so after a .rc file
            //        is run through the clang preprocessor it will no longer have 0x1A characters in it.
            // 0x7F = DEL (treated as a context-specific terminator by the Windows RC compiler)
            0x00, 0x1A, 0x7F => error.IllegalByte,
            // 0x01...0x03 result in strange 'macro definition too big' errors when used outside of string literals
            // 0x04 is valid but behaves strangely (sort of acts as a 'skip the next character' instruction)
            0x01...0x04 => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
            // @ and ` both result in error RC2018: unknown character '0x60' (and subsequently
            // fatal error RC1116: RC terminating after preprocessor errors) if they are ever used
            // outside of string literals. Not exactly sure why this would be the case, though.
            // TODO: Make sure there aren't any exceptions
            '@', '`' => if (!in_string_literal) error.IllegalByteOutsideStringLiterals else return,
            // The Byte Order Mark is mostly skipped over by the Windows RC compiler, but
            // there are edge cases where it leads to cryptic 'compiler limit : macro definition too big'
            // errors (e.g. a BOM within a number literal). By making this illegal we avoid having to
            // deal with a lot of edge cases and remove the potential footgun of the bytes of a BOM
            // being 'missing' when included in a string literal (the Windows RC compiler acts as
            // if the codepoint was never part of the string literal).
            '\u{FEFF}' => error.IllegalByteOrderMark,
            // Similar deal with this private use codepoint, it gets skipped/ignored by the
            // RC compiler (but without the cryptic errors). Silently dropping bytes still seems like
            // enough of a footgun with no real use-cases that it's still worth erroring instead of
            // emulating the RC compiler's behavior, though.
            '\u{E000}' => error.IllegalPrivateUseCharacter,
            // These codepoints lead to strange errors when used outside of string literals,
            // and miscompilations when used within string literals. We avoid the miscompilation
            // within string literals and emit a warning, but outside of string literals it makes
            // more sense to just disallow these codepoints.
            0x900, 0xA00, 0xA0D, 0x2000, 0xD00, 0xFFFE, 0xFFFF => if (!in_string_literal) error.IllegalCodepointOutsideStringLiterals else return,
            else => return,
        };
        self.error_context_token = .{
            .id = .invalid,
            .start = self.index,
            .end = self.index + codepoint.byte_len,
            .line_number = self.line_handler.line_number,
        };
        return err;
    }

    /// If `buffer[start..end]` is a `#pragma code_page` directive, validates it and
    /// switches `current_code_page` accordingly. All other preprocessor commands
    /// are ignored.
    fn evaluatePreprocessorCommand(self: *Self, start: usize, end: usize) !void {
        const token = Token{
            .id = .preprocessor_command,
            .start = start,
            .end = end,
            .line_number = self.line_handler.line_number,
        };
        errdefer self.error_context_token = token;
        const full_command = self.buffer[start..end];
        const code_page = (parsePragmaCodePage(full_command) catch |err| switch (err) {
            error.NotPragma, error.NotCodePagePragma => return,
            else => |e| return e,
        }) orelse self.default_code_page;
        // https://learn.microsoft.com/en-us/windows/win32/menurc/pragma-directives
        // > This pragma is not supported in an included resource file (.rc)
        //
        // Even though the Win32 behavior is to just ignore such directives silently,
        // this is an error in the lexer to allow for emitting warnings/errors when
        // such directives are found if that's wanted. The intention is for the lexer
        // to still be able to work correctly after this error is returned.
        if (self.source_mappings) |source_mappings| {
            if (!source_mappings.isRootFile(token.line_number)) {
                return error.CodePagePragmaInIncludedFile;
            }
        }
        self.seen_pragma_code_pages +|= 1;
        self.last_pragma_code_page_token = token;
        self.current_code_page = code_page;
    }

    /// Converts a LexError into ErrorDetails using the stored `error_context_token`.
    /// Must only be called after a LexError has been returned (asserts that the
    /// context token is set).
    pub fn getErrorDetails(self: Self, lex_err: LexError) ErrorDetails {
        const err = switch (lex_err) {
            error.UnfinishedStringLiteral => ErrorDetails.Error.unfinished_string_literal,
            error.StringLiteralTooLong => return .{
                .err = .string_literal_too_long,
                .code_page = self.current_code_page,
                .token = self.error_context_token.?,
                .extra = .{ .number = self.max_string_literal_codepoints },
            },
            error.InvalidNumberWithExponent => ErrorDetails.Error.invalid_number_with_exponent,
            error.InvalidDigitCharacterInNumberLiteral => ErrorDetails.Error.invalid_digit_character_in_number_literal,
            error.IllegalByte => ErrorDetails.Error.illegal_byte,
            error.IllegalByteOutsideStringLiterals => ErrorDetails.Error.illegal_byte_outside_string_literals,
            error.IllegalCodepointOutsideStringLiterals => ErrorDetails.Error.illegal_codepoint_outside_string_literals,
            error.IllegalByteOrderMark => ErrorDetails.Error.illegal_byte_order_mark,
            error.IllegalPrivateUseCharacter => ErrorDetails.Error.illegal_private_use_character,
            error.FoundCStyleEscapedQuote => ErrorDetails.Error.found_c_style_escaped_quote,
            error.CodePagePragmaMissingLeftParen => ErrorDetails.Error.code_page_pragma_missing_left_paren,
            error.CodePagePragmaMissingRightParen => ErrorDetails.Error.code_page_pragma_missing_right_paren,
            error.CodePagePragmaInvalidCodePage => ErrorDetails.Error.code_page_pragma_invalid_code_page,
            error.CodePagePragmaNotInteger => ErrorDetails.Error.code_page_pragma_not_integer,
            error.CodePagePragmaOverflow => ErrorDetails.Error.code_page_pragma_overflow,
            error.CodePagePragmaUnsupportedCodePage => ErrorDetails.Error.code_page_pragma_unsupported_code_page,
            error.CodePagePragmaInIncludedFile => ErrorDetails.Error.code_page_pragma_in_included_file,
        };
        return .{
            .err = err,
            .code_page = self.current_code_page,
            .token = self.error_context_token.?,
        };
    }
};
/// Parses a decimal code page number, erroring on non-digit characters
/// (`error.InvalidCharacter`) and on u32 overflow (`error.Overflow`).
/// An empty string yields 0.
fn parseCodePageNum(str: []const u8) !u32 {
    var acc: u32 = 0;
    for (str) |byte| {
        const digit = try std.fmt.charToDigit(byte, 10);
        // Skip the (checked) multiply while the accumulator is still zero,
        // e.g. while consuming leading zeroes.
        if (acc != 0) acc = try std.math.mul(u32, acc, 10);
        acc = try std.math.add(u32, acc, digit);
    }
    return acc;
}
/// Parses a `#pragma code_page(N)` directive and returns the corresponding
/// supported code page. Returns `null` when the code_page is set to DEFAULT.
/// Returns `error.NotPragma` / `error.NotCodePagePragma` when the command is a
/// different (ignorable) directive, and a `CodePagePragma*` error when the
/// directive is malformed or names an invalid/unsupported code page.
pub fn parsePragmaCodePage(full_command: []const u8) !?SupportedCodePage {
    var command = full_command;
    // Anything besides exactly this is ignored by the Windows RC implementation
    const expected_directive = "#pragma";
    if (!std.mem.startsWith(u8, command, expected_directive)) return error.NotPragma;
    command = command[expected_directive.len..];

    // At least one whitespace character must separate `#pragma` from the extension name.
    if (command.len == 0 or !std.ascii.isWhitespace(command[0])) return error.NotCodePagePragma;
    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }

    // Note: CoDe_PaGeZ is also treated as "code_page" by the Windows RC implementation,
    // and it will error with 'Missing left parenthesis in code_page #pragma'
    const expected_extension = "code_page";
    if (!std.ascii.startsWithIgnoreCase(command, expected_extension)) return error.NotCodePagePragma;
    command = command[expected_extension.len..];

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }
    if (command.len == 0 or command[0] != '(') {
        return error.CodePagePragmaMissingLeftParen;
    }
    command = command[1..];

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }
    // Collect the argument by growing a slice anchored at the current position.
    // Note: This must be `[]const u8` since it views into the (const) command;
    // only the slice's bounds are mutated, never the bytes.
    var num_str: []const u8 = command[0..0];
    while (command.len > 0 and (command[0] != ')' and !std.ascii.isWhitespace(command[0]))) {
        command = command[1..];
        num_str.len += 1;
    }
    if (num_str.len == 0) {
        return error.CodePagePragmaNotInteger;
    }

    while (command.len > 0 and std.ascii.isWhitespace(command[0])) {
        command = command[1..];
    }
    if (command.len == 0 or command[0] != ')') {
        return error.CodePagePragmaMissingRightParen;
    }

    const code_page: ?SupportedCodePage = code_page: {
        if (std.ascii.eqlIgnoreCase("DEFAULT", num_str)) {
            break :code_page null;
        }
        // The Win32 compiler behaves fairly strangely around maxInt(u32):
        // - If the overflowed u32 wraps and becomes a known code page ID, then
        //   it will error/warn with "Codepage not valid:  ignored" (depending on /w)
        // - If the overflowed u32 wraps and does not become a known code page ID,
        //   then it will error with 'constant too big' and 'Codepage not integer'
        //
        // Instead of that, we just have a separate error specifically for overflow.
        const num = parseCodePageNum(num_str) catch |err| switch (err) {
            error.InvalidCharacter => return error.CodePagePragmaNotInteger,
            error.Overflow => return error.CodePagePragmaOverflow,
        };
        // Anything that starts with 0 but does not resolve to 0 is treated as invalid, e.g. 01252
        if (num_str[0] == '0' and num != 0) {
            return error.CodePagePragmaInvalidCodePage;
        }
        // Anything that resolves to 0 is treated as 'not an integer' by the Win32 implementation.
        else if (num == 0) {
            return error.CodePagePragmaNotInteger;
        }
        // Anything above u16 max is not going to be found since our CodePage enum is backed by a u16.
        if (num > std.math.maxInt(u16)) {
            return error.CodePagePragmaInvalidCodePage;
        }
        break :code_page code_pages.getByIdentifierEnsureSupported(@intCast(num)) catch |err| switch (err) {
            error.InvalidCodePage => return error.CodePagePragmaInvalidCodePage,
            error.UnsupportedCodePage => return error.CodePagePragmaUnsupportedCodePage,
        };
    };
    return code_page;
}
/// Test helper: lexes `source` with `nextNormal` and checks that the token Ids
/// match `expected_tokens`, followed by a final `.eof` token.
fn testLexNormal(source: []const u8, expected_tokens: []const Token.Id) !void {
    var lexer = Lexer.init(source, .{});
    if (dumpTokensDuringTests) std.debug.print("\n----------------------\n{s}\n----------------------\n", .{lexer.buffer});
    for (expected_tokens) |expected_id| {
        const actual = try lexer.nextNormal();
        if (dumpTokensDuringTests) lexer.dump(&actual);
        try std.testing.expectEqual(expected_id, actual.id);
    }
    // All expected tokens consumed; the lexer must now be at EOF.
    const trailing = try lexer.nextNormal();
    try std.testing.expectEqual(Token.Id.eof, trailing.id);
}
/// Test helper: asserts that `actual` (a lex result) failed with `expected`,
/// optionally dumping the result for debugging.
fn expectLexError(expected: LexError, actual: anytype) !void {
    try std.testing.expectError(expected, actual);
    if (dumpTokensDuringTests) std.debug.print("{!}\n", .{actual});
}
test "normal: numbers" {
    try testLexNormal("1", &.{.number});
    // A leading '-' is part of the number in .any context
    try testLexNormal("-1", &.{.number});
    // ...but a detached '-' lexes as its own (number) token
    try testLexNormal("- 1", &.{ .number, .number });
    // A '-' followed by a non-digit still starts a number token
    try testLexNormal("-a", &.{.number});
}
test "normal: string literals" {
    // Empty string literal
    try testLexNormal("\"\"", &.{.quoted_ascii_string});
    // "" is an escaped "
    try testLexNormal("\" \"\" \"", &.{.quoted_ascii_string});
}
test "superscript chars and code pages" {
    const firstToken = struct {
        pub fn firstToken(source: []const u8, default_code_page: SupportedCodePage, comptime lex_method: Lexer.LexMethod) LexError!Token {
            var lexer = Lexer.init(source, .{ .default_code_page = default_code_page });
            return lexer.next(lex_method);
        }
    }.firstToken;
    // The same superscript-two character in two encodings: 2 bytes as UTF-8,
    // 1 byte as Windows-1252.
    const utf8_source = "²";
    const windows1252_source = "\xB2";

    // When decoded as the superscript-two codepoint, it is rejected as a
    // non-ASCII digit character (see isNonAsciiDigit handling in the lexer).
    const windows1252_encoded_as_windows1252 = firstToken(windows1252_source, .windows1252, .normal);
    try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, windows1252_encoded_as_windows1252);

    // Decoding the UTF-8 bytes as Windows-1252 yields two other codepoints,
    // which lex as a plain 2-byte literal.
    const utf8_encoded_as_windows1252 = try firstToken(utf8_source, .windows1252, .normal);
    try std.testing.expectEqual(Token{
        .id = .literal,
        .start = 0,
        .end = 2,
        .line_number = 1,
    }, utf8_encoded_as_windows1252);

    // UTF-8 bytes decoded as UTF-8: superscript-two again, so same rejection.
    const utf8_encoded_as_utf8 = firstToken(utf8_source, .utf8, .normal);
    try std.testing.expectError(error.InvalidDigitCharacterInNumberLiteral, utf8_encoded_as_utf8);

    // A lone 0xB2 byte is not valid UTF-8; it decodes to a replacement value
    // and lexes as a 1-byte literal rather than erroring.
    const windows1252_encoded_as_utf8 = try firstToken(windows1252_source, .utf8, .normal);
    try std.testing.expectEqual(Token{
        .id = .literal,
        .start = 0,
        .end = 1,
        .line_number = 1,
    }, windows1252_encoded_as_utf8);
}