diff --git a/lib/std/zig/Ast.zig b/lib/std/zig/Ast.zig index 20bdba8cf7..f50ec8305a 100644 --- a/lib/std/zig/Ast.zig +++ b/lib/std/zig/Ast.zig @@ -188,9 +188,8 @@ pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 { var tokenizer: std.zig.Tokenizer = .{ .buffer = tree.source, .index = token_starts[token_index], - .pending_invalid_token = null, }; - const token = tokenizer.findTagAtCurrentIndex(token_tag); + const token = tokenizer.next(); assert(token.tag == token_tag); return tree.source[token.loc.start..token.loc.end]; } diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig index 92be592a43..aa61714434 100644 --- a/lib/std/zig/AstGen.zig +++ b/lib/std/zig/AstGen.zig @@ -13824,10 +13824,10 @@ fn lowerAstErrors(astgen: *AstGen) !void { var notes: std.ArrayListUnmanaged(u32) = .{}; defer notes.deinit(gpa); - if (token_tags[parse_err.token + @intFromBool(parse_err.token_is_prev)] == .invalid) { - const tok = parse_err.token + @intFromBool(parse_err.token_is_prev); - const bad_off: u32 = @intCast(tree.tokenSlice(parse_err.token + @intFromBool(parse_err.token_is_prev)).len); - const byte_abs = token_starts[parse_err.token + @intFromBool(parse_err.token_is_prev)] + bad_off; + const tok = parse_err.token + @intFromBool(parse_err.token_is_prev); + if (token_tags[tok] == .invalid) { + const bad_off: u32 = @intCast(tree.tokenSlice(tok).len); + const byte_abs = token_starts[tok] + bad_off; try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{ std.zig.fmtEscapes(tree.source[byte_abs..][0..1]), })); diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index 6897980fdd..36cbf9a856 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -337,7 +337,6 @@ pub const Token = struct { pub const Tokenizer = struct { buffer: [:0]const u8, index: usize, - pending_invalid_token: ?Token, /// For debugging purposes pub fn dump(self: *Tokenizer, token: *const Token) void { @@ -350,7 +349,6 @@ pub const Tokenizer = struct { return Tokenizer{ .buffer = buffer, .index = src_start, - .pending_invalid_token = null, }; } @@ -366,8 +364,6 @@ pub const Tokenizer = struct { char_literal_hex_escape, char_literal_unicode_escape_saw_u, char_literal_unicode_escape, - char_literal_unicode_invalid, - char_literal_unicode, char_literal_end, backslash, equal, @@ -406,43 +402,7 @@ pub const Tokenizer = struct { saw_at_sign, }; - /// This is a workaround to the fact that the tokenizer can queue up - /// 'pending_invalid_token's when parsing literals, which means that we need - /// to scan from the start of the current line to find a matching tag - just - /// in case it was an invalid character generated during literal - /// tokenization. Ideally this processing of this would be pushed to the AST - /// parser or another later stage, both to give more useful error messages - /// with that extra context and in order to be able to remove this - /// workaround. 
- pub fn findTagAtCurrentIndex(self: *Tokenizer, tag: Token.Tag) Token { - if (tag == .invalid) { - const target_index = self.index; - var starting_index = target_index; - while (starting_index > 0) { - if (self.buffer[starting_index] == '\n') { - break; - } - starting_index -= 1; - } - - self.index = starting_index; - while (self.index <= target_index or self.pending_invalid_token != null) { - const result = self.next(); - if (result.loc.start == target_index and result.tag == tag) { - return result; - } - } - unreachable; - } else { - return self.next(); - } - } - pub fn next(self: *Tokenizer) Token { - if (self.pending_invalid_token) |token| { - self.pending_invalid_token = null; - return token; - } var state: State = .start; var result = Token{ .tag = .eof, @@ -452,7 +412,6 @@ pub const Tokenizer = struct { }, }; var seen_escape_digits: usize = undefined; - var remaining_code_units: usize = undefined; while (true) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { @@ -460,9 +419,8 @@ pub const Tokenizer = struct { 0 => { if (self.index != self.buffer.len) { result.tag = .invalid; - result.loc.start = self.index; - self.index += 1; result.loc.end = self.index; + self.index += 1; return result; } break; @@ -589,7 +547,7 @@ pub const Tokenizer = struct { else => { result.tag = .invalid; result.loc.end = self.index; - self.index += 1; + self.index += std.unicode.utf8ByteSequenceLength(c) catch 1; return result; }, }, @@ -762,6 +720,14 @@ pub const Tokenizer = struct { }, }, .string_literal => switch (c) { + 0, '\n' => { + result.tag = .invalid; + result.loc.end = self.index; + if (self.index != self.buffer.len) { + self.index += 1; + } + return result; + }, '\\' => { state = .string_literal_backslash; }, @@ -769,68 +735,75 @@ pub const Tokenizer = struct { self.index += 1; break; }, - 0 => { - if (self.index == self.buffer.len) { + else => { + if (self.invalidCharacterLength()) |len| { result.tag = .invalid; - break; - } else { - self.checkLiteralCharacter(); + result.loc.end = self.index; + self.index += len; + return result; } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, - '\n' => { - result.tag = .invalid; - break; - }, - else => self.checkLiteralCharacter(), }, .string_literal_backslash => switch (c) { 0, '\n' => { result.tag = .invalid; - break; + result.loc.end = self.index; + if (self.index != self.buffer.len) { + self.index += 1; + } + return result; }, else => { state = .string_literal; + + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .char_literal => switch (c) { - 0 => { + 0, '\n', '\'' => { result.tag = .invalid; - break; + result.loc.end = self.index; + if (self.index != self.buffer.len) { + self.index += 1; + } + return result; }, '\\' => { state = .char_literal_backslash; }, - '\'', 0x80...0xbf, 0xf8...0xff => { - result.tag = .invalid; - break; - }, - 0xc0...0xdf => { // 110xxxxx - remaining_code_units = 1; - state = .char_literal_unicode; - }, - 0xe0...0xef => { // 1110xxxx - remaining_code_units = 2; - state = .char_literal_unicode; - }, - 0xf0...0xf7 => { // 11110xxx - remaining_code_units = 3; - state = .char_literal_unicode; - }, - '\n' => { - result.tag = .invalid; - break; - }, else => { state = .char_literal_end; + + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + 
self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .char_literal_backslash => switch (c) { 0, '\n' => { result.tag = .invalid; - break; + result.loc.end = self.index; + if (self.index != self.buffer.len) { + self.index += 1; + } + return result; }, 'x' => { state = .char_literal_hex_escape; @@ -841,6 +814,15 @@ pub const Tokenizer = struct { }, else => { state = .char_literal_end; + + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, @@ -858,42 +840,26 @@ pub const Tokenizer = struct { }, .char_literal_unicode_escape_saw_u => switch (c) { - 0 => { - result.tag = .invalid; - break; - }, '{' => { state = .char_literal_unicode_escape; }, else => { result.tag = .invalid; - state = .char_literal_unicode_invalid; + break; }, }, .char_literal_unicode_escape => switch (c) { - 0 => { - result.tag = .invalid; - break; - }, '0'...'9', 'a'...'f', 'A'...'F' => {}, '}' => { state = .char_literal_end; // too many/few digits handled later }, else => { result.tag = .invalid; - state = .char_literal_unicode_invalid; + break; }, }, - .char_literal_unicode_invalid => switch (c) { - // Keep consuming characters until an obvious stopping point. - // This consolidates e.g. `u{0ab1Q}` into a single invalid token - // instead of creating the tokens `u{0ab1`, `Q`, `}` - '0'...'9', 'a'...'z', 'A'...'Z', '}' => {}, - else => break, - }, - .char_literal_end => switch (c) { '\'' => { result.tag = .char_literal; @@ -906,27 +872,31 @@ pub const Tokenizer = struct { }, }, - .char_literal_unicode => switch (c) { - 0x80...0xbf => { - remaining_code_units -= 1; - if (remaining_code_units == 0) { - state = .char_literal_end; + .multiline_string_literal_line => switch (c) { + 0 => { + if (self.index != self.buffer.len) { + result.tag = .invalid; + result.loc.end = self.index; + self.index += 1; + return result; } - }, - else => { - result.tag = .invalid; break; }, - }, - - .multiline_string_literal_line => switch (c) { - 0 => break, '\n' => { self.index += 1; break; }, '\t' => {}, - else => self.checkLiteralCharacter(), + else => { + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + }, }, .bang => switch (c) { @@ -1144,7 +1114,9 @@ pub const Tokenizer = struct { 0 => { if (self.index != self.buffer.len) { result.tag = .invalid; + result.loc.end = self.index; self.index += 1; + return result; } break; }, @@ -1159,17 +1131,37 @@ pub const Tokenizer = struct { state = .start; result.loc.start = self.index + 1; }, - '\t' => state = .line_comment, + '\t' => { + state = .line_comment; + }, else => { state = .line_comment; - self.checkLiteralCharacter(); + + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .doc_comment_start => switch (c) { '/' => { state = .line_comment; }, - 0, '\n' => { + 0 => { + if (self.index != self.buffer.len) { + result.tag = .invalid; + result.loc.end = self.index; + self.index += 1; + return result; + } + result.tag = .doc_comment; + break; + }, + '\n' => { result.tag = .doc_comment; break; }, @@ -1180,14 
+1172,24 @@ pub const Tokenizer = struct { else => { state = .doc_comment; result.tag = .doc_comment; - self.checkLiteralCharacter(); + + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; }, }, .line_comment => switch (c) { 0 => { if (self.index != self.buffer.len) { result.tag = .invalid; + result.loc.end = self.index; self.index += 1; + return result; } break; }, @@ -1196,12 +1198,30 @@ pub const Tokenizer = struct { result.loc.start = self.index + 1; }, '\t' => {}, - else => self.checkLiteralCharacter(), + else => { + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + }, }, .doc_comment => switch (c) { 0, '\n' => break, '\t' => {}, - else => self.checkLiteralCharacter(), + else => { + if (self.invalidCharacterLength()) |len| { + result.tag = .invalid; + result.loc.end = self.index; + self.index += len; + return result; + } + + self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1; + }, }, .int => switch (c) { '.' => state = .int_period, @@ -1244,10 +1264,6 @@ pub const Tokenizer = struct { } if (result.tag == .eof) { - if (self.pending_invalid_token) |token| { - self.pending_invalid_token = null; - return token; - } result.loc.start = self.index; } @@ -1255,27 +1271,14 @@ pub const Tokenizer = struct { return result; } - fn checkLiteralCharacter(self: *Tokenizer) void { - if (self.pending_invalid_token != null) return; - const invalid_length = self.getInvalidCharacterLength(); - if (invalid_length == 0) return; - self.pending_invalid_token = .{ - .tag = .invalid, - .loc = .{ - .start = self.index, - .end = self.index + invalid_length, - }, - }; - } - - fn getInvalidCharacterLength(self: *Tokenizer) u3 { + fn invalidCharacterLength(self: *Tokenizer) ?u3 { const c0 = self.buffer[self.index]; if (std.ascii.isAscii(c0)) { if (c0 == '\r') { if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') { // Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise // they constitute an illegal byte! - return 0; + return null; } else { return 1; } @@ -1285,7 +1288,7 @@ pub const Tokenizer = struct { return 1; } // looks fine to me. - return 0; + return null; } else { // check utf8-encoded character. 
const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1; @@ -1308,8 +1311,7 @@ pub const Tokenizer = struct { }, else => unreachable, } - self.index += length - 1; - return 0; + return null; } } }; @@ -1394,27 +1396,37 @@ test "code point literal with unicode escapes" { // Invalid unicode escapes try testTokenize( \\'\u' - , &.{.invalid}); + , &.{ .invalid, .invalid }); try testTokenize( \\'\u{{' - , &.{ .invalid, .invalid }); + , &.{ .invalid, .l_brace, .invalid }); try testTokenize( \\'\u{}' , &.{.char_literal}); try testTokenize( \\'\u{s}' - , &.{ .invalid, .invalid }); + , &.{ + .invalid, + .identifier, + .r_brace, + .invalid, + }); try testTokenize( \\'\u{2z}' - , &.{ .invalid, .invalid }); + , &.{ + .invalid, + .identifier, + .r_brace, + .invalid, + }); try testTokenize( \\'\u{4a' - , &.{.invalid}); + , &.{ .invalid, .invalid }); // 4a is valid // Test old-style unicode literals try testTokenize( \\'\u0333' - , &.{ .invalid, .invalid }); + , &.{ .invalid, .number_literal, .invalid }); try testTokenize( \\'\U0333' , &.{ .invalid, .number_literal, .invalid }); @@ -1453,13 +1465,14 @@ test "invalid token characters" { try testTokenize("`", &.{.invalid}); try testTokenize("'c", &.{.invalid}); try testTokenize("'", &.{.invalid}); - try testTokenize("''", &.{ .invalid, .invalid }); + try testTokenize("''", &.{.invalid}); + try testTokenize("'\n'", &.{ .invalid, .invalid }); } test "invalid literal/comment characters" { try testTokenize("\"\x00\"", &.{ - .string_literal, .invalid, + .invalid, // Incomplete string literal starting after invalid }); try testTokenize("//\x00", &.{ .invalid, @@ -1910,10 +1923,10 @@ test "saturating operators" { test "null byte before eof" { try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal }); try testTokenize("//\x00", &.{.invalid}); - try testTokenize("\\\\\x00", &.{ .multiline_string_literal_line, .invalid }); + try testTokenize("\\\\\x00", &.{.invalid}); try testTokenize("\x00", &.{.invalid}); try testTokenize("// NUL\x00\n", &.{.invalid}); - try testTokenize("///\x00\n", &.{ .doc_comment, .invalid }); + try testTokenize("///\x00\n", &.{.invalid}); try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid }); } diff --git a/test/cases/compile_errors/invalid_unicode_escape.zig b/test/cases/compile_errors/invalid_unicode_escape.zig new file mode 100644 index 0000000000..1555f2be80 --- /dev/null +++ b/test/cases/compile_errors/invalid_unicode_escape.zig @@ -0,0 +1,11 @@ +export fn entry() void { + const a = '\u{12z34}'; +} + +// error +// backend=stage2 +// target=native +// +// :2:15: error: expected expression, found 'invalid bytes' +// :2:21: note: invalid byte: 'z' + diff --git a/test/compile_errors.zig b/test/compile_errors.zig index 0dc191260f..5c5a574caf 100644 --- a/test/compile_errors.zig +++ b/test/compile_errors.zig @@ -42,8 +42,8 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host); case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{ - ":1:19: error: expected ';' after declaration", - ":1:20: note: invalid byte: '\\r'", + ":1:13: error: expected expression, found 'invalid bytes'", + ":1:19: note: invalid byte: '\\r'", }); } @@ -217,4 +217,40 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void { \\pub fn anytypeFunction(_: anytype) void {} ); } + + { + const case = ctx.obj("invalid byte in string", b.graph.host); + + case.addError("_ = \"\x01Q\";", &[_][]const u8{ + ":1:5: error: 
expected expression, found 'invalid bytes'", + ":1:6: note: invalid byte: '\\x01'", + }); + } + + { + const case = ctx.obj("invalid byte in comment", b.graph.host); + + case.addError("//\x01Q", &[_][]const u8{ + ":1:1: error: expected type expression, found 'invalid bytes'", + ":1:3: note: invalid byte: '\\x01'", + }); + } + + { + const case = ctx.obj("control character in character literal", b.graph.host); + + case.addError("const c = '\x01';", &[_][]const u8{ + ":1:11: error: expected expression, found 'invalid bytes'", + ":1:12: note: invalid byte: '\\x01'", + }); + } + + { + const case = ctx.obj("invalid byte at start of token", b.graph.host); + + case.addError("x = \x00Q", &[_][]const u8{ + ":1:5: error: expected expression, found 'invalid bytes'", + ":1:5: note: invalid byte: '\\x00'", + }); + } }
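
---

Not part of the patch above: a minimal standalone sketch of the post-patch tokenizer behavior. The diff removes the `pending_invalid_token` queue, so an invalid byte now terminates the current token immediately and is reported as a single inline `.invalid` token, which is why `Ast.tokenSlice` can simply call `next()` instead of re-scanning with the deleted `findTagAtCurrentIndex` workaround. The sketch mirrors the unchanged `"123 \x00 456"` case from the `"null byte before eof"` test; the test name and local variable names here are illustrative, not part of the patch.

```zig
const std = @import("std");

test "invalid byte emits one inline token" {
    // A NUL byte splits the stream into number_literal, invalid,
    // number_literal — no pending-token queue is involved anymore.
    const source: [:0]const u8 = "123 \x00 456";
    var tokenizer = std.zig.Tokenizer.init(source);

    const expected = [_]std.zig.Token.Tag{ .number_literal, .invalid, .number_literal };
    for (expected) |tag| {
        const token = tokenizer.next();
        try std.testing.expectEqual(tag, token.tag);
    }

    // After the last real token the tokenizer settles on .eof at source.len.
    const eof = tokenizer.next();
    try std.testing.expectEqual(std.zig.Token.Tag.eof, eof.tag);
    try std.testing.expectEqual(source.len, eof.loc.start);
}
```

Because every early-return path in the new code sets `result.loc.end = self.index` before advancing past the offending byte, an `.invalid` token ends exactly at that byte. That is what lets the simplified hunk in `AstGen.zig` compute the absolute offset of the bad byte as `token_starts[tok] + tree.tokenSlice(tok).len` when emitting the `invalid byte: '...'` note.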