diff --git a/doc/langref.html.in b/doc/langref.html.in index 2d853e8611..ae5744c52b 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -566,7 +566,7 @@ test "string literals" { assert(normal_bytes.len == 5); assert(normal_bytes[1] == 'e'); assert('e' == '\x65'); - assert('\U01f4a9' == 128169); + assert('\u{1f4a9}' == 128169); assert(mem.eql(u8, "hello", "h\x65llo")); // A C string literal is a null terminated pointer. @@ -616,12 +616,8 @@ test "string literals" { hexadecimal 8-bit character code (2 digits) - \uNNNN - hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits) - - - \UNNNNNN - hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits) + \u{NNNNNN} + hexadecimal Unicode character code UTF-8 encoded (1 or more digits) @@ -10011,8 +10007,7 @@ eof <- !. hex <- [0-9a-fA-F] char_escape <- "\\x" hex hex - / "\\u" hex hex hex hex - / "\\U" hex hex hex hex hex hex + / "\\u{" hex+ "}" / "\\" [nr\\t'"] char_char <- char_escape diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index a0acde52e9..4358146f24 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -190,6 +190,7 @@ enum TokenizeState { TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5" TokenizeStateString, TokenizeStateStringEscape, + TokenizeStateStringEscapeUnicodeStart, TokenizeStateCharLiteral, TokenizeStateCharLiteralEnd, TokenizeStateSawStar, @@ -241,7 +242,6 @@ struct Tokenize { int32_t exp_add_amt; bool is_exp_negative; size_t char_code_index; - size_t char_code_end; bool unicode; uint32_t char_code; int exponent_in_bin_or_dec; @@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) { t.radix = 16; t.char_code = 0; t.char_code_index = 0; - t.char_code_end = 2; t.unicode = false; break; case 'u': - t.state = TokenizeStateCharCode; - t.radix = 16; - t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 4; - t.unicode = true; - break; - case 'U': - t.state = TokenizeStateCharCode; - t.radix = 16; - t.char_code = 0; - t.char_code_index = 0; - t.char_code_end = 6; - t.unicode = true; + t.state = TokenizeStateStringEscapeUnicodeStart; break; case 'n': handle_string_escape(&t, '\n'); @@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) { invalid_char_error(&t, c); } break; + case TokenizeStateStringEscapeUnicodeStart: + switch (c) { + case '{': + t.state = TokenizeStateCharCode; + t.radix = 16; + t.char_code = 0; + t.char_code_index = 0; + t.unicode = true; + break; + default: + invalid_char_error(&t, c); + } + break; case TokenizeStateCharCode: { + if (t.unicode && c == '}') { + if (t.char_code_index == 0) { + tokenize_error(&t, "empty unicode escape sequence"); + break; + } + if (t.char_code > 0x10ffff) { + tokenize_error(&t, "unicode value out of range: %x", t.char_code); + break; + } + if (t.cur_tok->id == TokenIdCharLiteral) { + t.cur_tok->data.char_lit.c = t.char_code; + t.state = TokenizeStateCharLiteralEnd; + } else if (t.char_code <= 0x7f) { + // 00000000 00000000 00000000 0xxxxxxx + handle_string_escape(&t, (uint8_t)t.char_code); + } else if (t.char_code <= 0x7ff) { + // 00000000 00000000 00000xxx xx000000 + handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); + } else if (t.char_code <= 0xffff) { + // 00000000 00000000 xxxx0000 00000000 + handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12))); + // 00000000 00000000 0000xxxx xx000000 + handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); + } else if (t.char_code <= 0x10ffff) { + // 00000000 000xxx00 00000000 00000000 + handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18))); + // 00000000 000000xx xxxx0000 00000000 + handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f))); + // 00000000 00000000 0000xxxx xx000000 + handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); + // 00000000 00000000 00000000 00xxxxxx + handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); + } else { + zig_unreachable(); + } + break; + } + uint32_t digit_value = get_digit_value(c); if (digit_value >= t.radix) { tokenize_error(&t, "invalid digit: '%c'", c); @@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) { t.char_code += digit_value; t.char_code_index += 1; - if (t.char_code_index >= t.char_code_end) { - if (t.unicode) { - if (t.char_code > 0x10ffff) { - tokenize_error(&t, "unicode value out of range: %x", t.char_code); - break; - } - if (t.cur_tok->id == TokenIdCharLiteral) { - t.cur_tok->data.char_lit.c = t.char_code; - t.state = TokenizeStateCharLiteralEnd; - } else if (t.char_code <= 0x7f) { - // 00000000 00000000 00000000 0xxxxxxx - handle_string_escape(&t, (uint8_t)t.char_code); - } else if (t.char_code <= 0x7ff) { - // 00000000 00000000 00000xxx xx000000 - handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); - // 00000000 00000000 00000000 00xxxxxx - handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } else if (t.char_code <= 0xffff) { - // 00000000 00000000 xxxx0000 00000000 - handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12))); - // 00000000 00000000 0000xxxx xx000000 - handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); - // 00000000 00000000 00000000 00xxxxxx - handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } else if (t.char_code <= 0x10ffff) { - // 00000000 000xxx00 00000000 00000000 - handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18))); - // 00000000 000000xx xxxx0000 00000000 - handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f))); - // 00000000 00000000 0000xxxx xx000000 - handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); - // 00000000 00000000 00000000 00xxxxxx - handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } - } else { - assert(t.char_code <= 255); - handle_string_escape(&t, (uint8_t)t.char_code); - } + if (!t.unicode && t.char_code_index >= 2) { + assert(t.char_code <= 255); + handle_string_escape(&t, (uint8_t)t.char_code); } } break; @@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) { tokenize_error(&t, "unterminated string"); break; case TokenizeStateStringEscape: + case TokenizeStateStringEscapeUnicodeStart: case TokenizeStateCharCode: if (t.cur_tok->id == TokenIdStringLiteral) { tokenize_error(&t, "unterminated string"); diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig index 0f5789dc34..f6f3363bf6 100644 --- a/std/zig/parser_test.zig +++ b/std/zig/parser_test.zig @@ -80,7 +80,7 @@ test "zig fmt: enum literal inside array literal" { test "zig fmt: character literal larger than u8" { try testCanonical( - \\const x = '\U01f4a9'; + \\const x = '\u{01f4a9}'; \\ ); } diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index fb4827da86..71765e2025 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -240,6 +240,9 @@ pub const Tokenizer = struct { CharLiteral, CharLiteralBackslash, CharLiteralHexEscape, + CharLiteralUnicodeEscapeSawU, + CharLiteralUnicodeEscape, + CharLiteralUnicodeInvalid, CharLiteralEnd, Backslash, Equal, @@ -296,7 +299,6 @@ pub const Tokenizer = struct { .end = undefined, }; var seen_escape_digits: usize = undefined; - var expected_escape_digits: usize = undefined; while (self.index < self.buffer.len) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { @@ -661,17 +663,9 @@ pub const Tokenizer = struct { 'x' => { state = State.CharLiteralHexEscape; seen_escape_digits = 0; - expected_escape_digits = 2; }, 'u' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 4; - }, - 'U' => { - state = State.CharLiteralHexEscape; - seen_escape_digits = 0; - expected_escape_digits = 6; + state = State.CharLiteralUnicodeEscapeSawU; }, else => { state = State.CharLiteralEnd; @@ -679,9 +673,9 @@ pub const Tokenizer = struct { }, State.CharLiteralHexEscape => switch (c) { - '0'...'9', 'a'...'z', 'A'...'F' => { + '0'...'9', 'a'...'f', 'A'...'F' => { seen_escape_digits += 1; - if (seen_escape_digits == expected_escape_digits) { + if (seen_escape_digits == 2) { state = State.CharLiteralEnd; } }, @@ -691,6 +685,43 @@ pub const Tokenizer = struct { }, }, + State.CharLiteralUnicodeEscapeSawU => switch (c) { + '{' => { + state = State.CharLiteralUnicodeEscape; + seen_escape_digits = 0; + }, + else => { + result.id = Token.Id.Invalid; + state = State.CharLiteralUnicodeInvalid; + }, + }, + + State.CharLiteralUnicodeEscape => switch (c) { + '0'...'9', 'a'...'f', 'A'...'F' => { + seen_escape_digits += 1; + }, + '}' => { + if (seen_escape_digits == 0) { + result.id = Token.Id.Invalid; + state = State.CharLiteralUnicodeInvalid; + } else { + state = State.CharLiteralEnd; + } + }, + else => { + result.id = Token.Id.Invalid; + state = State.CharLiteralUnicodeInvalid; + }, + }, + + State.CharLiteralUnicodeInvalid => switch (c) { + // Keep consuming characters until an obvious stopping point. + // This consolidates e.g. `u{0ab1Q}` into a single invalid token + // instead of creating the tokens `u{0ab1`, `Q`, `}` + '0'...'9', 'a'...'z', 'A'...'Z', '}' => {}, + else => break, + }, + State.CharLiteralEnd => switch (c) { '\'' => { result.id = Token.Id.CharLiteral; @@ -1052,6 +1083,9 @@ pub const Tokenizer = struct { State.CharLiteral, State.CharLiteralBackslash, State.CharLiteralHexEscape, + State.CharLiteralUnicodeEscapeSawU, + State.CharLiteralUnicodeEscape, + State.CharLiteralUnicodeInvalid, State.CharLiteralEnd, State.StringLiteralBackslash, State.LBracketStar, @@ -1205,7 +1239,60 @@ test "tokenizer - unknown length pointer and then c pointer" { test "tokenizer - char literal with hex escape" { testTokenize( \\'\x1b' - , [_]Token.Id{Token.Id.CharLiteral}); + , [_]Token.Id{.CharLiteral}); + testTokenize( + \\'\x1' + , [_]Token.Id{ .Invalid, .Invalid }); +} + +test "tokenizer - char literal with unicode escapes" { + // Valid unicode escapes + testTokenize( + \\'\u{3}' + , [_]Token.Id{.CharLiteral}); + testTokenize( + \\'\u{01}' + , [_]Token.Id{.CharLiteral}); + testTokenize( + \\'\u{2a}' + , [_]Token.Id{.CharLiteral}); + testTokenize( + \\'\u{3f9}' + , [_]Token.Id{.CharLiteral}); + testTokenize( + \\'\u{6E09aBc1523}' + , [_]Token.Id{.CharLiteral}); + testTokenize( + \\"\u{440}" + , [_]Token.Id{.StringLiteral}); + + // Invalid unicode escapes + testTokenize( + \\'\u' + , [_]Token.Id{.Invalid}); + testTokenize( + \\'\u{{' + , [_]Token.Id{ .Invalid, .Invalid }); + testTokenize( + \\'\u{}' + , [_]Token.Id{ .Invalid, .Invalid }); + testTokenize( + \\'\u{s}' + , [_]Token.Id{ .Invalid, .Invalid }); + testTokenize( + \\'\u{2z}' + , [_]Token.Id{ .Invalid, .Invalid }); + testTokenize( + \\'\u{4a' + , [_]Token.Id{.Invalid}); + + // Test old-style unicode literals + testTokenize( + \\'\u0333' + , [_]Token.Id{ .Invalid, .Invalid }); + testTokenize( + \\'\U0333' + , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); } test "tokenizer - float literal e exponent" { diff --git a/test/compile_errors.zig b/test/compile_errors.zig index df4e38583c..9967770931 100644 --- a/test/compile_errors.zig +++ b/test/compile_errors.zig @@ -5414,6 +5414,24 @@ pub fn addCases(cases: *tests.CompileErrorContext) void { "tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported", ); + cases.add( + "invalid legacy unicode escape", + \\export fn entry() void { + \\ const a = '\U1234'; + \\} + , + "tmp.zig:2:17: error: invalid character: 'U'", + ); + + cases.add( + "invalid empty unicode escape", + \\export fn entry() void { + \\ const a = '\u{}'; + \\} + , + "tmp.zig:2:19: error: empty unicode escape sequence", + ); + cases.add( "non-printable invalid character", "\xff\xfe" ++ diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig index d499df4cb7..ab58f2ed08 100644 --- a/test/stage1/behavior/misc.zig +++ b/test/stage1/behavior/misc.zig @@ -189,7 +189,7 @@ test "string escapes" { expect(mem.eql(u8, "\r", "\x0d")); expect(mem.eql(u8, "\t", "\x09")); expect(mem.eql(u8, "\\", "\x5c")); - expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69")); + expect(mem.eql(u8, "\u{1234}\u{069}\u{1}", "\xe1\x88\xb4\x69\x01")); } test "multiline string" { @@ -695,7 +695,7 @@ test "thread local variable" { } test "unicode escape in character literal" { - var a: u24 = '\U01f4a9'; + var a: u24 = '\u{01f4a9}'; expect(a == 128169); }