diff --git a/doc/langref.html.in b/doc/langref.html.in
index a74d06ccbf..8ef31558d5 100644
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@@ -11551,7 +11551,8 @@ fn readU32Be() u32 {}
 
       Each LF may be immediately preceded by a single CR (byte value 0x0d, code point U+000d, {#syntax#}'\r'{#endsyntax#})
-      to form a Windows style line ending, but this is discouraged.
+      to form a Windows style line ending, but this is discouraged. Note that in multiline strings, CRLF sequences will
+      be encoded as LF when compiled into a Zig program. A CR in any other context is not allowed.
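For illustration, the normalization described above means the program below passes its assertion whether the source file is saved with LF or CRLF line endings (a minimal sketch, not part of the patch):

    const std = @import("std");

    // With CRLF endings on disk, the lines of the multiline string are
    // still joined by a plain '\n' after compilation, per the langref
    // wording above.
    const text =
        \\first
        \\second
    ;

    pub fn main() void {
        std.debug.assert(std.mem.eql(u8, text, "first\nsecond"));
    }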

diff --git a/lib/std/zig/Ast.zig b/lib/std/zig/Ast.zig
index 94cdcff4e7..ac52e8e942 100644
--- a/lib/std/zig/Ast.zig
+++ b/lib/std/zig/Ast.zig
@@ -171,7 +171,7 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
         .index = token_starts[token_index],
         .pending_invalid_token = null,
     };
-    const token = tokenizer.next();
+    const token = tokenizer.findTagAtCurrentIndex(token_tag);
     assert(token.tag == token_tag);
     return tree.source[token.loc.start..token.loc.end];
 }
diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig
index 0e0742eab3..0d6a6d4fd8 100644
--- a/lib/std/zig/tokenizer.zig
+++ b/lib/std/zig/tokenizer.zig
@@ -406,6 +406,38 @@ pub const Tokenizer = struct {
         saw_at_sign,
     };
 
+    /// This is a workaround for the fact that the tokenizer can queue up
+    /// 'pending_invalid_token's when parsing literals, which means we need
+    /// to scan from the start of the current line to find a matching tag -
+    /// just in case it was an invalid character generated during literal
+    /// tokenization. Ideally this processing would be pushed to the AST
+    /// parser or another later stage, both to give more useful error
+    /// messages with that extra context and to be able to remove this
+    /// workaround.
+    pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token {
+        if (tag == .invalid) {
+            const target_index = self.index;
+            var starting_index = target_index;
+            while (starting_index > 0) {
+                if (self.buffer[starting_index] == '\n') {
+                    break;
+                }
+                starting_index -= 1;
+            }
+
+            self.index = starting_index;
+            while (self.index <= target_index or self.pending_invalid_token != null) {
+                const result = self.next();
+                if (result.loc.start == target_index and result.tag == tag) {
+                    return result;
+                }
+            }
+            unreachable;
+        } else {
+            return self.next();
+        }
+    }
+
     pub fn next(self: *Tokenizer) Token {
         if (self.pending_invalid_token) |token| {
             self.pending_invalid_token = null;
@@ -1127,7 +1159,7 @@ pub const Tokenizer = struct {
                     state = .start;
                     result.loc.start = self.index + 1;
                 },
-                '\t', '\r' => state = .line_comment,
+                '\t' => state = .line_comment,
                 else => {
                     state = .line_comment;
                     self.checkLiteralCharacter();
@@ -1141,7 +1173,7 @@ pub const Tokenizer = struct {
                         result.tag = .doc_comment;
                         break;
                     },
-                    '\t', '\r' => {
+                    '\t' => {
                         state = .doc_comment;
                         result.tag = .doc_comment;
                     },
@@ -1163,12 +1195,12 @@ pub const Tokenizer = struct {
                     state = .start;
                     result.loc.start = self.index + 1;
                 },
-                '\t', '\r' => {},
+                '\t' => {},
                 else => self.checkLiteralCharacter(),
             },
             .doc_comment => switch (c) {
                 0, '\n' => break,
-                '\t', '\r' => {},
+                '\t' => {},
                 else => self.checkLiteralCharacter(),
             },
             .int => switch (c) {
@@ -1239,7 +1271,15 @@ pub const Tokenizer = struct {
     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
         const c0 = self.buffer[self.index];
        if (std.ascii.isASCII(c0)) {
-            if (std.ascii.isControl(c0)) {
+            if (c0 == '\r') {
+                if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
+                    // Carriage returns are *only* allowed just before a linefeed
+                    // as part of a CRLF pair; otherwise they constitute an illegal byte.
+                    return 0;
+                } else {
+                    return 1;
+                }
+            } else if (std.ascii.isControl(c0)) {
                 // ascii control codes are never allowed
                 // (note that \n was checked before we got here)
                 return 1;
diff --git a/src/AstGen.zig b/src/AstGen.zig
index 40eef32d4e..a7ce79511d 100644
--- a/src/AstGen.zig
+++ b/src/AstGen.zig
@@ -10491,14 +10491,16 @@ fn strLitNodeAsString(astgen: *AstGen, node: Ast.Node.Index) !IndexSlice {
     var tok_i = start;
     {
         const slice = tree.tokenSlice(tok_i);
-        const line_bytes = slice[2 .. slice.len - 1];
+        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
         try string_bytes.appendSlice(gpa, line_bytes);
         tok_i += 1;
     }
     // Following lines: each line prepends a newline.
     while (tok_i <= end) : (tok_i += 1) {
         const slice = tree.tokenSlice(tok_i);
-        const line_bytes = slice[2 .. slice.len - 1];
+        const carriage_return_ending: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
+        const line_bytes = slice[2 .. slice.len - carriage_return_ending];
         try string_bytes.ensureUnusedCapacity(gpa, line_bytes.len + 1);
         string_bytes.appendAssumeCapacity('\n');
         string_bytes.appendSliceAssumeCapacity(line_bytes);
diff --git a/test/compare_output.zig b/test/compare_output.zig
index b3a6144729..31617f6427 100644
--- a/test/compare_output.zig
+++ b/test/compare_output.zig
@@ -535,4 +535,13 @@ pub fn addCases(cases: *tests.CompareOutputContext) void {
         \\debug: free - len: 5
         \\
     );
+
+    cases.add("valid carriage return example", "const io = @import(\"std\").io;\r\n" ++ // Testing CRLF line endings are valid
+        "\r\n" ++
+        "pub \r fn main() void {\r\n" ++ // Testing isolated carriage return as whitespace is valid
+        "    const stdout = io.getStdOut().writer();\r\n" ++
+        "    stdout.print(\\\\A Multiline\r\n" ++ // testing CRLF at end of multiline string line is valid and normalises to \n in the output
+        "        \\\\String\r\n" ++
+        "    , .{}) catch unreachable;\r\n" ++
+        "}\r\n", "A Multiline\nString");
 }
diff --git a/test/compile_errors.zig b/test/compile_errors.zig
index 801d0464ea..21c8822eb3 100644
--- a/test/compile_errors.zig
+++ b/test/compile_errors.zig
@@ -174,6 +174,16 @@ pub fn addCases(ctx: *TestContext) !void {
         });
     }
 
+    {
+        const case = ctx.obj("isolated carriage return in multiline string literal", .{});
+        case.backend = .stage2;
+
+        case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
+            ":1:19: error: expected ';' after declaration",
+            ":1:20: note: invalid byte: '\\r'",
+        });
+    }
+
     {
         const case = ctx.obj("missing semicolon at EOF", .{});
         case.addError(
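
The rescan in findTagAtCurrentIndex exists because checkLiteralCharacter queues the invalid token while next() is still consuming the surrounding comment or literal, so the invalid token's start can precede the tokenizer's current index. A rough test sketch of the underlying behaviour against the patched tokenizer (illustrative only, not part of the patch):

    const std = @import("std");

    test "lone carriage return in a line comment is an invalid byte" {
        // '\r' is only legal immediately before '\n'; a bare CR inside a
        // comment becomes a pending invalid token that surfaces here.
        const source: [:0]const u8 = "// stray \r carriage return\n";
        var tokenizer = std.zig.Tokenizer.init(source);
        const token = tokenizer.next();
        try std.testing.expect(token.tag == .invalid);
    }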
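The AstGen slicing change can also be checked in isolation. Below, multilineLineBytes is a hypothetical stand-in for the patched logic in strLitNodeAsString, given that tokenSlice returns each \\ line including its line ending:

    const std = @import("std");

    // Skip the leading "\\", then drop "\r\n" (2 bytes) or "\n" (1 byte),
    // mirroring the carriage_return_ending computation in the patch.
    fn multilineLineBytes(slice: []const u8) []const u8 {
        const ending_len: usize = if (slice[slice.len - 2] == '\r') 2 else 1;
        return slice[2 .. slice.len - ending_len];
    }

    test "CRLF and LF multiline lines trim to the same content" {
        try std.testing.expectEqualStrings("abc", multilineLineBytes("\\\\abc\r\n"));
        try std.testing.expectEqualStrings("abc", multilineLineBytes("\\\\abc\n"));
    }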