Tokenizer bug fixes and improvements

Fixes many error messages for invalid bytes that previously reported the wrong byte. Additionally improves handling of UTF-8 in some places (see the sketch below).
gooncreeper 2024-07-09 11:20:04 -04:00 committed by Veikka Tuominen
parent 9d38e82b5c
commit c50f300387
5 changed files with 214 additions and 155 deletions
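A minimal sketch of the new behavior (illustrative only, not part of the diff; the source string and assertion are assumptions, while std.zig.Tokenizer.init, next(), and Token.loc are the API exercised by this change): after this commit, an invalid byte inside a literal produces a single .invalid token whose loc.end is the index of the offending byte, so the error path can print that exact byte.

    const std = @import("std");

    test "invalid byte is reported at the offending byte" {
        // "\x01" is an invalid byte inside a string literal.
        const source: [:0]const u8 = "const s = \"abc\x01def\";";
        var tokenizer = std.zig.Tokenizer.init(source);
        while (true) {
            const token = tokenizer.next();
            if (token.tag == .eof) break;
            if (token.tag == .invalid) {
                // loc.end is the offset of the bad byte, so source[token.loc.end]
                // is the byte that the "invalid byte: ..." note should display.
                try std.testing.expectEqual(@as(u8, 0x01), source[token.loc.end]);
                break;
            }
        }
    }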

@@ -188,9 +188,8 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
var tokenizer: std.zig.Tokenizer = .{
.buffer = tree.source,
.index = token_starts[token_index],
.pending_invalid_token = null,
};
const token = tokenizer.findTagAtCurrentIndex(token_tag);
const token = tokenizer.next();
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
}

@@ -13824,10 +13824,10 @@ fn lowerAstErrors(astgen: *AstGen) !void {
var notes: std.ArrayListUnmanaged(u32) = .{};
defer notes.deinit(gpa);
if (token_tags[parse_err.token + @intFromBool(parse_err.token_is_prev)] == .invalid) {
const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
const bad_off: u32 = @intCast(tree.tokenSlice(parse_err.token + @intFromBool(parse_err.token_is_prev)).len);
const byte_abs = token_starts[parse_err.token + @intFromBool(parse_err.token_is_prev)] + bad_off;
const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
if (token_tags[tok] == .invalid) {
const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
const byte_abs = token_starts[tok] + bad_off;
try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
}));
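A worked example of the offset arithmetic above (illustrative, using the "invalid byte in string" case added near the end of this commit): for the source _ = "\x01Q"; the invalid token starts at the opening quote (byte offset 4) and its slice is just the quote (length 1), so bad_off = 1 and byte_abs = 4 + 1 = 5, the offset of \x01. That is why the new test expects :1:5 for the token and :1:6 for the invalid-byte note.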

@@ -337,7 +337,6 @@ pub const Token = struct {
pub const Tokenizer = struct {
buffer: [:0]const u8,
index: usize,
pending_invalid_token: ?Token,
/// For debugging purposes
pub fn dump(self: *Tokenizer, token: *const Token) void {
@@ -350,7 +349,6 @@ pub const Tokenizer = struct {
return Tokenizer{
.buffer = buffer,
.index = src_start,
.pending_invalid_token = null,
};
}
@@ -366,8 +364,6 @@ pub const Tokenizer = struct {
char_literal_hex_escape,
char_literal_unicode_escape_saw_u,
char_literal_unicode_escape,
char_literal_unicode_invalid,
char_literal_unicode,
char_literal_end,
backslash,
equal,
@@ -406,43 +402,7 @@ pub const Tokenizer = struct {
saw_at_sign,
};
/// This is a workaround to the fact that the tokenizer can queue up
/// 'pending_invalid_token's when parsing literals, which means that we need
/// to scan from the start of the current line to find a matching tag - just
/// in case it was an invalid character generated during literal
/// tokenization. Ideally this processing of this would be pushed to the AST
/// parser or another later stage, both to give more useful error messages
/// with that extra context and in order to be able to remove this
/// workaround.
pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
if (tag == .invalid) {
const target_index = self.index;
var starting_index = target_index;
while (starting_index > 0) {
if (self.buffer[starting_index] == '\n') {
break;
}
starting_index -= 1;
}
self.index = starting_index;
while (self.index <= target_index or self.pending_invalid_token != null) {
const result = self.next();
if (result.loc.start == target_index and result.tag == tag) {
return result;
}
}
unreachable;
} else {
return self.next();
}
}
pub fn next(self: *Tokenizer) Token {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
return token;
}
var state: State = .start;
var result = Token{
.tag = .eof,
@@ -452,7 +412,6 @@ pub const Tokenizer = struct {
},
};
var seen_escape_digits: usize = undefined;
var remaining_code_units: usize = undefined;
while (true) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@@ -460,9 +419,8 @@ pub const Tokenizer = struct {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.start = self.index;
self.index += 1;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
@@ -589,7 +547,7 @@ pub const Tokenizer = struct {
else => {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
self.index += std.unicode.utf8ByteSequenceLength(c) catch 1;
return result;
},
},
@@ -762,6 +720,14 @@ pub const Tokenizer = struct {
},
},
.string_literal => switch (c) {
0, '\n' => {
result.tag = .invalid;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'\\' => {
state = .string_literal_backslash;
},
@@ -769,68 +735,75 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
0 => {
if (self.index == self.buffer.len) {
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
break;
} else {
self.checkLiteralCharacter();
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
'\n' => {
result.tag = .invalid;
break;
},
else => self.checkLiteralCharacter(),
},
.string_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
break;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
else => {
state = .string_literal;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal => switch (c) {
0 => {
0, '\n', '\'' => {
result.tag = .invalid;
break;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'\\' => {
state = .char_literal_backslash;
},
'\'', 0x80...0xbf, 0xf8...0xff => {
result.tag = .invalid;
break;
},
0xc0...0xdf => { // 110xxxxx
remaining_code_units = 1;
state = .char_literal_unicode;
},
0xe0...0xef => { // 1110xxxx
remaining_code_units = 2;
state = .char_literal_unicode;
},
0xf0...0xf7 => { // 11110xxx
remaining_code_units = 3;
state = .char_literal_unicode;
},
'\n' => {
result.tag = .invalid;
break;
},
else => {
state = .char_literal_end;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
break;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'x' => {
state = .char_literal_hex_escape;
@@ -841,6 +814,15 @@ pub const Tokenizer = struct {
},
else => {
state = .char_literal_end;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
@@ -858,42 +840,26 @@ pub const Tokenizer = struct {
},
.char_literal_unicode_escape_saw_u => switch (c) {
0 => {
result.tag = .invalid;
break;
},
'{' => {
state = .char_literal_unicode_escape;
},
else => {
result.tag = .invalid;
state = .char_literal_unicode_invalid;
break;
},
},
.char_literal_unicode_escape => switch (c) {
0 => {
result.tag = .invalid;
break;
},
'0'...'9', 'a'...'f', 'A'...'F' => {},
'}' => {
state = .char_literal_end; // too many/few digits handled later
},
else => {
result.tag = .invalid;
state = .char_literal_unicode_invalid;
break;
},
},
.char_literal_unicode_invalid => switch (c) {
// Keep consuming characters until an obvious stopping point.
// This consolidates e.g. `u{0ab1Q}` into a single invalid token
// instead of creating the tokens `u{0ab1`, `Q`, `}`
'0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
else => break,
},
.char_literal_end => switch (c) {
'\'' => {
result.tag = .char_literal;
@@ -906,27 +872,31 @@ pub const Tokenizer = struct {
},
},
.char_literal_unicode => switch (c) {
0x80...0xbf => {
remaining_code_units -= 1;
if (remaining_code_units == 0) {
state = .char_literal_end;
.multiline_string_literal_line => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
},
else => {
result.tag = .invalid;
break;
},
},
.multiline_string_literal_line => switch (c) {
0 => break,
'\n' => {
self.index += 1;
break;
},
'\t' => {},
else => self.checkLiteralCharacter(),
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.bang => switch (c) {
@@ -1144,7 +1114,9 @@ pub const Tokenizer = struct {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
},
@@ -1159,17 +1131,37 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t' => state = .line_comment,
'\t' => {
state = .line_comment;
},
else => {
state = .line_comment;
self.checkLiteralCharacter();
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.doc_comment_start => switch (c) {
'/' => {
state = .line_comment;
},
0, '\n' => {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
result.tag = .doc_comment;
break;
},
'\n' => {
result.tag = .doc_comment;
break;
},
@@ -1180,14 +1172,24 @@ pub const Tokenizer = struct {
else => {
state = .doc_comment;
result.tag = .doc_comment;
self.checkLiteralCharacter();
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.line_comment => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
},
@@ -1196,12 +1198,30 @@ pub const Tokenizer = struct {
result.loc.start = self.index + 1;
},
'\t' => {},
else => self.checkLiteralCharacter(),
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.doc_comment => switch (c) {
0, '\n' => break,
'\t' => {},
else => self.checkLiteralCharacter(),
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.int => switch (c) {
'.' => state = .int_period,
@@ -1244,10 +1264,6 @@ pub const Tokenizer = struct {
}
if (result.tag == .eof) {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
return token;
}
result.loc.start = self.index;
}
@@ -1255,27 +1271,14 @@ pub const Tokenizer = struct {
return result;
}
fn checkLiteralCharacter(self: *Tokenizer) void {
if (self.pending_invalid_token != null) return;
const invalid_length = self.getInvalidCharacterLength();
if (invalid_length == 0) return;
self.pending_invalid_token = .{
.tag = .invalid,
.loc = .{
.start = self.index,
.end = self.index + invalid_length,
},
};
}
fn getInvalidCharacterLength(self: *Tokenizer) u3 {
fn invalidCharacterLength(self: *Tokenizer) ?u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isAscii(c0)) {
if (c0 == '\r') {
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
return 0;
return null;
} else {
return 1;
}
@@ -1285,7 +1288,7 @@ pub const Tokenizer = struct {
return 1;
}
// looks fine to me.
return 0;
return null;
} else {
// check utf8-encoded character.
const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
@@ -1308,8 +1311,7 @@ pub const Tokenizer = struct {
},
else => unreachable,
}
self.index += length - 1;
return 0;
return null;
}
}
};
@@ -1394,27 +1396,37 @@ test "code point literal with unicode escapes" {
// Invalid unicode escapes
try testTokenize(
\\'\u'
, &.{.invalid});
, &.{ .invalid, .invalid });
try testTokenize(
\\'\u{{'
, &.{ .invalid, .invalid });
, &.{ .invalid, .l_brace, .invalid });
try testTokenize(
\\'\u{}'
, &.{.char_literal});
try testTokenize(
\\'\u{s}'
, &.{ .invalid, .invalid });
, &.{
.invalid,
.identifier,
.r_brace,
.invalid,
});
try testTokenize(
\\'\u{2z}'
, &.{ .invalid, .invalid });
, &.{
.invalid,
.identifier,
.r_brace,
.invalid,
});
try testTokenize(
\\'\u{4a'
, &.{.invalid});
, &.{ .invalid, .invalid }); // 4a is valid
// Test old-style unicode literals
try testTokenize(
\\'\u0333'
, &.{ .invalid, .invalid });
, &.{ .invalid, .number_literal, .invalid });
try testTokenize(
\\'\U0333'
, &.{ .invalid, .number_literal, .invalid });
@@ -1453,13 +1465,14 @@ test "invalid token characters" {
try testTokenize("`", &.{.invalid});
try testTokenize("'c", &.{.invalid});
try testTokenize("'", &.{.invalid});
try testTokenize("''", &.{ .invalid, .invalid });
try testTokenize("''", &.{.invalid});
try testTokenize("'\n'", &.{ .invalid, .invalid });
}
test "invalid literal/comment characters" {
try testTokenize("\"\x00\"", &.{
.string_literal,
.invalid,
.invalid, // Incomplete string literal starting after invalid
});
try testTokenize("//\x00", &.{
.invalid,
@@ -1910,10 +1923,10 @@ test "saturating operators" {
test "null byte before eof" {
try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal });
try testTokenize("//\x00", &.{.invalid});
try testTokenize("\\\\\x00", &.{ .multiline_string_literal_line, .invalid });
try testTokenize("\\\\\x00", &.{.invalid});
try testTokenize("\x00", &.{.invalid});
try testTokenize("// NUL\x00\n", &.{.invalid});
try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
try testTokenize("///\x00\n", &.{.invalid});
try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
}
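The expectations above run through the tokenizer's test helper. A minimal sketch of such a helper (an assumption about its exact shape; it relies on the surrounding tokenizer.zig scope for std, Tokenizer, and Token): it checks the produced tags in order, then requires a clean eof token at the end of the buffer, which is why trailing .eof tags are never listed in the cases above.

    fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
        var tokenizer = Tokenizer.init(source);
        for (expected_token_tags) |expected_token_tag| {
            const token = tokenizer.next();
            try std.testing.expectEqual(expected_token_tag, token.tag);
        }
        // The last token should always be eof, even when the expected list is empty.
        const last_token = tokenizer.next();
        try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
        try std.testing.expectEqual(source.len, last_token.loc.start);
        try std.testing.expectEqual(source.len, last_token.loc.end);
    }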

@@ -0,0 +1,11 @@
export fn entry() void {
const a = '\u{12z34}';
}
// error
// backend=stage2
// target=native
//
// :2:15: error: expected expression, found 'invalid bytes'
// :2:21: note: invalid byte: 'z'

@@ -42,8 +42,8 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
":1:19: error: expected ';' after declaration",
":1:20: note: invalid byte: '\\r'",
":1:13: error: expected expression, found 'invalid bytes'",
":1:19: note: invalid byte: '\\r'",
});
}
@@ -217,4 +217,40 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
\\pub fn anytypeFunction(_: anytype) void {}
);
}
{
const case = ctx.obj("invalid byte in string", b.graph.host);
case.addError("_ = \"\x01Q\";", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:6: note: invalid byte: '\\x01'",
});
}
{
const case = ctx.obj("invalid byte in comment", b.graph.host);
case.addError("//\x01Q", &[_][]const u8{
":1:1: error: expected type expression, found 'invalid bytes'",
":1:3: note: invalid byte: '\\x01'",
});
}
{
const case = ctx.obj("control character in character literal", b.graph.host);
case.addError("const c = '\x01';", &[_][]const u8{
":1:11: error: expected expression, found 'invalid bytes'",
":1:12: note: invalid byte: '\\x01'",
});
}
{
const case = ctx.obj("invalid byte at start of token", b.graph.host);
case.addError("x = \x00Q", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:5: note: invalid byte: '\\x00'",
});
}
}