Tokenizer bug fixes and improvements

Fixes many error messages for invalid bytes that previously reported the wrong byte. Additionally improves handling of UTF-8 in some places (see the sketch below).
gooncreeper 2024-07-09 11:20:04 -04:00 committed by Veikka Tuominen
parent 9d38e82b5c
commit c50f300387
5 changed files with 214 additions and 155 deletions
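A minimal sketch of the new behavior (illustrative only, not part of the diff; the source string and assertion are assumptions, while std.zig.Tokenizer.init, next(), and Token.loc are the API exercised by this change): after this commit, an invalid byte inside a literal produces a single .invalid token whose loc.end is the index of the offending byte, so the error path can print that exact byte.

    const std = @import("std");

    test "invalid byte is reported at the offending byte" {
        // "\x01" is an invalid byte inside a string literal.
        const source: [:0]const u8 = "const s = \"abc\x01def\";";
        var tokenizer = std.zig.Tokenizer.init(source);
        while (true) {
            const token = tokenizer.next();
            if (token.tag == .eof) break;
            if (token.tag == .invalid) {
                // loc.end is the offset of the bad byte, so source[token.loc.end]
                // is the byte that the "invalid byte: ..." note should display.
                try std.testing.expectEqual(@as(u8, 0x01), source[token.loc.end]);
                break;
            }
        }
    }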

@@ -188,9 +188,8 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
var tokenizer: std.zig.Tokenizer = .{
.buffer = tree.source,
.index = token_starts[token_index],
.pending_invalid_token = null,
};
const token = tokenizer.findTagAtCurrentIndex(token_tag);
const token = tokenizer.next();
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
}

@@ -13824,10 +13824,10 @@ fn lowerAstErrors(astgen: *AstGen) !void {
var notes: std.ArrayListUnmanaged(u32) = .{};
defer notes.deinit(gpa);
if (token_tags[parse_err.token + @intFromBool(parse_err.token_is_prev)] == .invalid) {
const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
const bad_off: u32 = @intCast(tree.tokenSlice(parse_err.token + @intFromBool(parse_err.token_is_prev)).len);
const byte_abs = token_starts[parse_err.token + @intFromBool(parse_err.token_is_prev)] + bad_off;
const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
if (token_tags[tok] == .invalid) {
const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
const byte_abs = token_starts[tok] + bad_off;
try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
}));
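A worked example of the offset arithmetic above (illustrative, using the "invalid byte in string" case added near the end of this commit): for the source _ = "\x01Q"; the invalid token starts at the opening quote (byte offset 4) and its slice is just the quote (length 1), so bad_off = 1 and byte_abs = 4 + 1 = 5, the offset of \x01. That is why the new test expects :1:5 for the token and :1:6 for the invalid-byte note.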

@@ -337,7 +337,6 @@ pub const Token = struct {
pub const Tokenizer = struct {
buffer: [:0]const u8,
index: usize,
pending_invalid_token: ?Token,
/// For debugging purposes
pub fn dump(self: *Tokenizer, token: *const Token) void {
@@ -350,7 +349,6 @@ pub const Tokenizer = struct {
return Tokenizer{
.buffer = buffer,
.index = src_start,
.pending_invalid_token = null,
};
}
@@ -366,8 +364,6 @@ pub const Tokenizer = struct {
char_literal_hex_escape,
char_literal_unicode_escape_saw_u,
char_literal_unicode_escape,
char_literal_unicode_invalid,
char_literal_unicode,
char_literal_end,
backslash,
equal,
@@ -406,43 +402,7 @@ pub const Tokenizer = struct {
saw_at_sign,
};
/// This is a workaround to the fact that the tokenizer can queue up
/// 'pending_invalid_token's when parsing literals, which means that we need
/// to scan from the start of the current line to find a matching tag - just
/// in case it was an invalid character generated during literal
/// tokenization. Ideally this processing of this would be pushed to the AST
/// parser or another later stage, both to give more useful error messages
/// with that extra context and in order to be able to remove this
/// workaround.
pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
if (tag == .invalid) {
const target_index = self.index;
var starting_index = target_index;
while (starting_index > 0) {
if (self.buffer[starting_index] == '\n') {
break;
}
starting_index -= 1;
}
self.index = starting_index;
while (self.index <= target_index or self.pending_invalid_token != null) {
const result = self.next();
if (result.loc.start == target_index and result.tag == tag) {
return result;
}
}
unreachable;
} else {
return self.next();
}
}
pub fn next(self: *Tokenizer) Token {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
return token;
}
var state: State = .start;
var result = Token{
.tag = .eof,
@@ -452,7 +412,6 @@ pub const Tokenizer = struct {
},
};
var seen_escape_digits: usize = undefined;
var remaining_code_units: usize = undefined;
while (true) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@@ -460,9 +419,8 @@ pub const Tokenizer = struct {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.start = self.index;
self.index += 1;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
@@ -589,7 +547,7 @@ pub const Tokenizer = struct {
else => {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
self.index += std.unicode.utf8ByteSequenceLength(c) catch 1;
return result;
},
},
@@ -762,6 +720,14 @@ pub const Tokenizer = struct {
},
},
.string_literal => switch (c) {
0, '\n' => {
result.tag = .invalid;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'\\' => {
state = .string_literal_backslash;
},
@@ -769,68 +735,75 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
0 => {
if (self.index == self.buffer.len) {
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
break;
} else {
self.checkLiteralCharacter();
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
'\n' => {
result.tag = .invalid;
break;
},
else => self.checkLiteralCharacter(),
},
.string_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
break;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
else => {
state = .string_literal;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal => switch (c) {
0 => {
0, '\n', '\'' => {
result.tag = .invalid;
break;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'\\' => {
state = .char_literal_backslash;
},
'\'', 0x80...0xbf, 0xf8...0xff => {
result.tag = .invalid;
break;
},
0xc0...0xdf => { // 110xxxxx
remaining_code_units = 1;
state = .char_literal_unicode;
},
0xe0...0xef => { // 1110xxxx
remaining_code_units = 2;
state = .char_literal_unicode;
},
0xf0...0xf7 => { // 11110xxx
remaining_code_units = 3;
state = .char_literal_unicode;
},
'\n' => {
result.tag = .invalid;
break;
},
else => {
state = .char_literal_end;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
break;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'x' => {
state = .char_literal_hex_escape;
@@ -841,6 +814,15 @@ pub const Tokenizer = struct {
},
else => {
state = .char_literal_end;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
@@ -858,42 +840,26 @@ pub const Tokenizer = struct {
},
.char_literal_unicode_escape_saw_u => switch (c) {
0 => {
result.tag = .invalid;
break;
},
'{' => {
state = .char_literal_unicode_escape;
},
else => {
result.tag = .invalid;
state = .char_literal_unicode_invalid;
break;
},
},
.char_literal_unicode_escape => switch (c) {
0 => {
result.tag = .invalid;
break;
},
'0'...'9', 'a'...'f', 'A'...'F' => {},
'}' => {
state = .char_literal_end; // too many/few digits handled later
},
else => {
result.tag = .invalid;
state = .char_literal_unicode_invalid;
break;
},
},
.char_literal_unicode_invalid => switch (c) {
// Keep consuming characters until an obvious stopping point.
// This consolidates e.g. `u{0ab1Q}` into a single invalid token
// instead of creating the tokens `u{0ab1`, `Q`, `}`
'0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
else => break,
},
.char_literal_end => switch (c) {
'\'' => {
result.tag = .char_literal;
@@ -906,27 +872,31 @@ pub const Tokenizer = struct {
},
},
.char_literal_unicode => switch (c) {
0x80...0xbf => {
remaining_code_units -= 1;
if (remaining_code_units == 0) {
state = .char_literal_end;
.multiline_string_literal_line => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
},
else => {
result.tag = .invalid;
break;
},
},
.multiline_string_literal_line => switch (c) {
0 => break,
'\n' => {
self.index += 1;
break;
},
'\t' => {},
else => self.checkLiteralCharacter(),
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.bang => switch (c) {
@@ -1144,7 +1114,9 @@ pub const Tokenizer = struct {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
},
@@ -1159,17 +1131,37 @@ pub const Tokenizer = struct {
state = .start;
result.loc.start = self.index + 1;
},
'\t' => state = .line_comment,
'\t' => {
state = .line_comment;
},
else => {
state = .line_comment;
self.checkLiteralCharacter();
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.doc_comment_start => switch (c) {
'/' => {
state = .line_comment;
},
0, '\n' => {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
result.tag = .doc_comment;
break;
},
'\n' => {
result.tag = .doc_comment;
break;
},
@@ -1180,14 +1172,24 @@ pub const Tokenizer = struct {
else => {
state = .doc_comment;
result.tag = .doc_comment;
self.checkLiteralCharacter();
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.line_comment => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
},
@@ -1196,12 +1198,30 @@ pub const Tokenizer = struct {
result.loc.start = self.index + 1;
},
'\t' => {},
else => self.checkLiteralCharacter(),
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.doc_comment => switch (c) {
0, '\n' => break,
'\t' => {},
else => self.checkLiteralCharacter(),
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.int => switch (c) {
'.' => state = .int_period,
@@ -1244,10 +1264,6 @@ pub const Tokenizer = struct {
}
if (result.tag == .eof) {
if (self.pending_invalid_token) |token| {
self.pending_invalid_token = null;
return token;
}
result.loc.start = self.index;
}
@@ -1255,27 +1271,14 @@ pub const Tokenizer = struct {
return result;
}
fn checkLiteralCharacter(self: *Tokenizer) void {
if (self.pending_invalid_token != null) return;
const invalid_length = self.getInvalidCharacterLength();
if (invalid_length == 0) return;
self.pending_invalid_token = .{
.tag = .invalid,
.loc = .{
.start = self.index,
.end = self.index + invalid_length,
},
};
}
fn getInvalidCharacterLength(self: *Tokenizer) u3 {
fn invalidCharacterLength(self: *Tokenizer) ?u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isAscii(c0)) {
if (c0 == '\r') {
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
return 0;
return null;
} else {
return 1;
}
@@ -1285,7 +1288,7 @@ pub const Tokenizer = struct {
return 1;
}
// looks fine to me.
return 0;
return null;
} else {
// check utf8-encoded character.
const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
@@ -1308,8 +1311,7 @@ pub const Tokenizer = struct {
},
else => unreachable,
}
self.index += length - 1;
return 0;
return null;
}
}
};
@@ -1394,27 +1396,37 @@ test "code point literal with unicode escapes" {
// Invalid unicode escapes
try testTokenize(
\\'\u'
, &.{.invalid});
, &.{ .invalid, .invalid });
try testTokenize(
\\'\u{{'
, &.{ .invalid, .invalid });
, &.{ .invalid, .l_brace, .invalid });
try testTokenize(
\\'\u{}'
, &.{.char_literal});
try testTokenize(
\\'\u{s}'
, &.{ .invalid, .invalid });
, &.{
.invalid,
.identifier,
.r_brace,
.invalid,
});
try testTokenize(
\\'\u{2z}'
, &.{ .invalid, .invalid });
, &.{
.invalid,
.identifier,
.r_brace,
.invalid,
});
try testTokenize(
\\'\u{4a'
, &.{.invalid});
, &.{ .invalid, .invalid }); // 4a is valid
// Test old-style unicode literals
try testTokenize(
\\'\u0333'
, &.{ .invalid, .invalid });
, &.{ .invalid, .number_literal, .invalid });
try testTokenize(
\\'\U0333'
, &.{ .invalid, .number_literal, .invalid });
@@ -1453,13 +1465,14 @@ test "invalid token characters" {
try testTokenize("`", &.{.invalid});
try testTokenize("'c", &.{.invalid});
try testTokenize("'", &.{.invalid});
try testTokenize("''", &.{ .invalid, .invalid });
try testTokenize("''", &.{.invalid});
try testTokenize("'\n'", &.{ .invalid, .invalid });
}
test "invalid literal/comment characters" {
try testTokenize("\"\x00\"", &.{
.string_literal,
.invalid,
.invalid, // Incomplete string literal starting after invalid
});
try testTokenize("//\x00", &.{
.invalid,
@@ -1910,10 +1923,10 @@ test "saturating operators" {
test "null byte before eof" {
try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal });
try testTokenize("//\x00", &.{.invalid});
try testTokenize("\\\\\x00", &.{ .multiline_string_literal_line, .invalid });
try testTokenize("\\\\\x00", &.{.invalid});
try testTokenize("\x00", &.{.invalid});
try testTokenize("// NUL\x00\n", &.{.invalid});
try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
try testTokenize("///\x00\n", &.{.invalid});
try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
}
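The expectations above run through the tokenizer's test helper. A minimal sketch of such a helper (an assumption about its exact shape; it relies on the surrounding tokenizer.zig scope for std, Tokenizer, and Token): it checks the produced tags in order, then requires a clean eof token at the end of the buffer, which is why trailing .eof tags are never listed in the cases above.

    fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
        var tokenizer = Tokenizer.init(source);
        for (expected_token_tags) |expected_token_tag| {
            const token = tokenizer.next();
            try std.testing.expectEqual(expected_token_tag, token.tag);
        }
        // The last token should always be eof, even when the expected list is empty.
        const last_token = tokenizer.next();
        try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
        try std.testing.expectEqual(source.len, last_token.loc.start);
        try std.testing.expectEqual(source.len, last_token.loc.end);
    }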

@@ -0,0 +1,11 @@
export fn entry() void {
const a = '\u{12z34}';
}
// error
// backend=stage2
// target=native
//
// :2:15: error: expected expression, found 'invalid bytes'
// :2:21: note: invalid byte: 'z'

@@ -42,8 +42,8 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
":1:19: error: expected ';' after declaration",
":1:20: note: invalid byte: '\\r'",
":1:13: error: expected expression, found 'invalid bytes'",
":1:19: note: invalid byte: '\\r'",
});
}
@@ -217,4 +217,40 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
\\pub fn anytypeFunction(_: anytype) void {}
);
}
{
const case = ctx.obj("invalid byte in string", b.graph.host);
case.addError("_ = \"\x01Q\";", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:6: note: invalid byte: '\\x01'",
});
}
{
const case = ctx.obj("invalid byte in comment", b.graph.host);
case.addError("//\x01Q", &[_][]const u8{
":1:1: error: expected type expression, found 'invalid bytes'",
":1:3: note: invalid byte: '\\x01'",
});
}
{
const case = ctx.obj("control character in character literal", b.graph.host);
case.addError("const c = '\x01';", &[_][]const u8{
":1:11: error: expected expression, found 'invalid bytes'",
":1:12: note: invalid byte: '\\x01'",
});
}
{
const case = ctx.obj("invalid byte at start of token", b.graph.host);
case.addError("x = \x00Q", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:5: note: invalid byte: '\\x00'",
});
}
}