Correctly handle carriage return characters according to the spec (#12661)

* Scan from line start when finding tag in tokenizer

This resolves a crash that could occur on bytes such as carriage
returns, which are valid characters in some contexts but invalid when
parsed from within literals.

This may resolve other edge cases as well, since the calling code for
this function didn't account for any 'pending_invalid_token' that the
tokenizer can queue up while tokenizing from within another state.
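
As a hypothetical illustration (a sketch against std.zig.Tokenizer as of
this commit, not code from the commit itself): an invalid byte found while
skipping a line comment is queued as a pending invalid token and only
returned by a later call to next(), so it can surface after tokens that
start later in the source. Re-tokenizing from the invalid token's own
start offset therefore cannot reproduce it, which is why
findTagAtCurrentIndex rewinds to the start of the line.

```zig
const std = @import("std");

test "invalid byte inside a comment surfaces as a queued token" {
    // The CR is followed by 'b', not '\n', so it is an invalid byte; the
    // comment itself produces no token, so the .invalid token is queued
    // while the tokenizer scans ahead and is returned by a later next().
    const source: [:0]const u8 = "// bad\rbyte\nconst foo = 1;";
    var tokenizer = std.zig.Tokenizer.init(source);
    var saw_invalid = false;
    var token = tokenizer.next();
    while (token.tag != .eof) : (token = tokenizer.next()) {
        if (token.tag == .invalid) saw_invalid = true;
    }
    try std.testing.expect(saw_invalid);
}
```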

* Fix carriage return crash in multiline string

Follow the guidance of #38:

> However CR directly before NL is interpreted as only a newline and not part of the multiline string. zig fmt will delete the CR.

zig fmt already had code for deleting carriage returns, but it would
still crash; now it no longer does. Carriage returns encountered before
line feeds are now also stripped appropriately when a program is
compiled.
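
In effect, each multiline-string line token has a one- or two-byte line
terminator stripped. A simplified sketch of that rule (the helper name and
tests are illustrative, not the commit's code; the real logic is the
AstGen.strLitNodeAsString change shown further down):

```zig
const std = @import("std");

// A line token looks like "\\contents" plus its terminating newline; drop
// the leading "\\" marker and the terminator, treating a CR directly
// before the LF as part of the line ending, not the string contents.
fn multilineLineBytes(slice: []const u8) []const u8 {
    const ending_len: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
    return slice[2 .. slice.len - ending_len];
}

test "CRLF terminator is not part of the string contents" {
    try std.testing.expectEqualStrings("hello", multilineLineBytes("\\\\hello\n"));
    try std.testing.expectEqualStrings("hello", multilineLineBytes("\\\\hello\r\n"));
}
```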

* Only accept carriage returns before line feeds

The previous commit was much less strict about this. This change more
closely matches the intended spec: CR characters are allowed only as
part of a CRLF pair, and nowhere else.
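
A standalone restatement of the stricter rule (the helper below is
hypothetical; the compiler's version is the getInvalidCharacterLength
change shown further down):

```zig
const std = @import("std");

// A CR byte is legal only as the first half of a CRLF pair.
fn isLegalCr(buffer: []const u8, index: usize) bool {
    std.debug.assert(buffer[index] == '\r');
    return index + 1 < buffer.len and buffer[index + 1] == '\n';
}

test "CR legality" {
    try std.testing.expect(isLegalCr("hi\r\n", 2));
    try std.testing.expect(!isLegalCr("hi\rx", 2));
    try std.testing.expect(!isLegalCr("hi\r", 2)); // CR at end of buffer
}
```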

* Fix CR being rejected when used as whitespace

Missed this comment from ziglang/zig-spec#83:

> CR used as whitespace, whether directly preceding NL or stray, is still unambiguously whitespace. It is accepted by the grammar and replaced by the canonical whitespace by zig fmt.
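
A minimal check of the accepted case (assuming the post-commit
std.zig.Tokenizer behavior; this mirrors the behavior test added below):

```zig
const std = @import("std");

test "stray CR between tokens is plain whitespace" {
    const source: [:0]const u8 = "pub \r fn main() void {}";
    var tokenizer = std.zig.Tokenizer.init(source);
    var token = tokenizer.next();
    while (token.tag != .eof) : (token = tokenizer.next()) {
        try std.testing.expect(token.tag != .invalid);
    }
}
```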

* Add tests for carriage return handling
Tom Read Cutting 2023-02-19 12:14:03 +00:00 committed by GitHub
parent 281d4c0ff6
commit 346ec15c50
6 changed files with 71 additions and 9 deletions

@@ -11551,7 +11551,8 @@ fn readU32Be() u32 {}
 </p>
 <p>
 Each LF may be immediately preceded by a single CR (byte value 0x0d, code point U+000d, {#syntax#}'\r'{#endsyntax#})
-to form a Windows style line ending, but this is discouraged.
+to form a Windows style line ending, but this is discouraged. Note that in multiline strings, CRLF sequences will
+be encoded as LF when compiled into a Zig program.
 A CR in any other context is not allowed.
 </p>
 <p>

@@ -171,7 +171,7 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
         .index = token_starts[token_index],
         .pending_invalid_token = null,
     };
-    const token = tokenizer.next();
+    const token = tokenizer.findTagAtCurrentIndex(token_tag);
     assert(token.tag == token_tag);
     return tree.source[token.loc.start..token.loc.end];
 }

@@ -406,6 +406,38 @@ pub const Tokenizer = struct {
         saw_at_sign,
     };
 
+    /// This is a workaround to the fact that the tokenizer can queue up
+    /// 'pending_invalid_token's when parsing literals, which means that we need
+    /// to scan from the start of the current line to find a matching tag - just
+    /// in case it was an invalid character generated during literal
+    /// tokenization. Ideally this processing would be pushed to the AST
+    /// parser or another later stage, both to give more useful error messages
+    /// with that extra context and in order to be able to remove this
+    /// workaround.
+    pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
+        if (tag == .invalid) {
+            const target_index = self.index;
+            var starting_index = target_index;
+            while (starting_index > 0) {
+                if (self.buffer[starting_index] == '\n') {
+                    break;
+                }
+                starting_index -= 1;
+            }
+
+            self.index = starting_index;
+            while (self.index <= target_index or self.pending_invalid_token != null) {
+                const result = self.next();
+                if (result.loc.start == target_index and result.tag == tag) {
+                    return result;
+                }
+            }
+            unreachable;
+        } else {
+            return self.next();
+        }
+    }
+
     pub fn next(self: *Tokenizer) Token {
         if (self.pending_invalid_token) |token| {
             self.pending_invalid_token = null;
@@ -1127,7 +1159,7 @@
                         state = .start;
                         result.loc.start = self.index + 1;
                     },
-                    '\t', '\r' => state = .line_comment,
+                    '\t' => state = .line_comment,
                     else => {
                         state = .line_comment;
                         self.checkLiteralCharacter();
@@ -1141,7 +1173,7 @@
                         result.tag = .doc_comment;
                         break;
                     },
-                    '\t', '\r' => {
+                    '\t' => {
                         state = .doc_comment;
                         result.tag = .doc_comment;
                     },
@@ -1163,12 +1195,12 @@
                         state = .start;
                         result.loc.start = self.index + 1;
                     },
-                    '\t', '\r' => {},
+                    '\t' => {},
                     else => self.checkLiteralCharacter(),
                 },
                 .doc_comment => switch (c) {
                     0, '\n' => break,
-                    '\t', '\r' => {},
+                    '\t' => {},
                     else => self.checkLiteralCharacter(),
                 },
                 .int => switch (c) {
@@ -1239,7 +1271,15 @@
     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
         const c0 = self.buffer[self.index];
         if (std.ascii.isASCII(c0)) {
-            if (std.ascii.isControl(c0)) {
+            if (c0 == '\r') {
+                if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
+                    // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
+                    // they constitute an illegal byte!
+                    return 0;
+                } else {
+                    return 1;
+                }
+            } else if (std.ascii.isControl(c0)) {
                 // ascii control codes are never allowed
                 // (note that \n was checked before we got here)
                 return 1;

@@ -10491,14 +10491,16 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const line_bytes = slice[2 .. slice.len - 1];
+        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
         try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const line_bytes = slice[2 .. slice.len - 1];
+        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
         try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
         string_bytes.appendSliceAssumeCapacity(line_bytes);

@@ -535,4 +535,13 @@ pub fn addCases(cases: *tests.CompareOutputContext) void {
         \\debug: free - len: 5
         \\
     );
+
+    cases.add("valid carriage return example", "const io = @import(\"std\").io;\r\n" ++ // Testing CRLF line endings are valid
+        "\r\n" ++
+        "pub \r fn main() void {\r\n" ++ // Testing isolated carriage return as whitespace is valid
+        " const stdout = io.getStdOut().writer();\r\n" ++
+        " stdout.print(\\\\A Multiline\r\n" ++ // testing CRLF at end of multiline string line is valid and normalises to \n in the output
+        " \\\\String\r\n" ++
+        " , .{}) catch unreachable;\r\n" ++
+        "}\r\n", "A Multiline\nString");
 }

@@ -174,6 +174,16 @@ pub fn addCases(ctx: *TestContext) !void {
         });
     }
 
+    {
+        const case = ctx.obj("isolated carriage return in multiline string literal", .{});
+        case.backend = .stage2;
+
+        case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
+            ":1:19: error: expected ';' after declaration",
+            ":1:20: note: invalid byte: '\\r'",
+        });
+    }
+
     {
         const case = ctx.obj("missing semicolon at EOF", .{});
         case.addError(