diff --git a/doc/langref.html.in b/doc/langref.html.in
index 2d853e8611..ae5744c52b 100644
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@@ -566,7 +566,7 @@ test "string literals" {
assert(normal_bytes.len == 5);
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
- assert('\U01f4a9' == 128169);
+ assert('\u{1f4a9}' == 128169);
assert(mem.eql(u8, "hello", "h\x65llo"));
// A C string literal is a null terminated pointer.
@@ -616,12 +616,8 @@ test "string literals" {
hexadecimal 8-bit character code (2 digits) |
- \uNNNN |
- hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits) |
-
-
- \UNNNNNN |
- hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits) |
+ \u{NNNNNN} |
+ hexadecimal Unicode character code UTF-8 encoded (1 or more digits) |
@@ -10011,8 +10007,7 @@ eof <- !.
hex <- [0-9a-fA-F]
char_escape
<- "\\x" hex hex
- / "\\u" hex hex hex hex
- / "\\U" hex hex hex hex hex hex
+ / "\\u{" hex+ "}"
/ "\\" [nr\\t'"]
char_char
<- char_escape
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index a0acde52e9..4358146f24 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -190,6 +190,7 @@ enum TokenizeState {
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
TokenizeStateString,
TokenizeStateStringEscape,
+ TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd,
TokenizeStateSawStar,
@@ -241,7 +242,6 @@ struct Tokenize {
int32_t exp_add_amt;
bool is_exp_negative;
size_t char_code_index;
- size_t char_code_end;
bool unicode;
uint32_t char_code;
int exponent_in_bin_or_dec;
@@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
t.radix = 16;
t.char_code = 0;
t.char_code_index = 0;
- t.char_code_end = 2;
t.unicode = false;
break;
case 'u':
- t.state = TokenizeStateCharCode;
- t.radix = 16;
- t.char_code = 0;
- t.char_code_index = 0;
- t.char_code_end = 4;
- t.unicode = true;
- break;
- case 'U':
- t.state = TokenizeStateCharCode;
- t.radix = 16;
- t.char_code = 0;
- t.char_code_index = 0;
- t.char_code_end = 6;
- t.unicode = true;
+ t.state = TokenizeStateStringEscapeUnicodeStart;
break;
case 'n':
handle_string_escape(&t, '\n');
@@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
+ case TokenizeStateStringEscapeUnicodeStart:
+ switch (c) {
+ case '{':
+ t.state = TokenizeStateCharCode;
+ t.radix = 16;
+ t.char_code = 0;
+ t.char_code_index = 0;
+ t.unicode = true;
+ break;
+ default:
+ invalid_char_error(&t, c);
+ }
+ break;
case TokenizeStateCharCode:
{
+ if (t.unicode && c == '}') {
+ if (t.char_code_index == 0) {
+ tokenize_error(&t, "empty unicode escape sequence");
+ break;
+ }
+ if (t.char_code > 0x10ffff) {
+ tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+ break;
+ }
+ if (t.cur_tok->id == TokenIdCharLiteral) {
+ t.cur_tok->data.char_lit.c = t.char_code;
+ t.state = TokenizeStateCharLiteralEnd;
+ } else if (t.char_code <= 0x7f) {
+ // 00000000 00000000 00000000 0xxxxxxx
+ handle_string_escape(&t, (uint8_t)t.char_code);
+ } else if (t.char_code <= 0x7ff) {
+ // 00000000 00000000 00000xxx xx000000
+ handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
+ // 00000000 00000000 00000000 00xxxxxx
+ handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+ } else if (t.char_code <= 0xffff) {
+ // 00000000 00000000 xxxx0000 00000000
+ handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
+ // 00000000 00000000 0000xxxx xx000000
+ handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+ // 00000000 00000000 00000000 00xxxxxx
+ handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+ } else if (t.char_code <= 0x10ffff) {
+ // 00000000 000xxx00 00000000 00000000
+ handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
+ // 00000000 000000xx xxxx0000 00000000
+ handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
+ // 00000000 00000000 0000xxxx xx000000
+ handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+ // 00000000 00000000 00000000 00xxxxxx
+ handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+ } else {
+ zig_unreachable();
+ }
+ break;
+ }
+
uint32_t digit_value = get_digit_value(c);
if (digit_value >= t.radix) {
tokenize_error(&t, "invalid digit: '%c'", c);
@@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) {
t.char_code += digit_value;
t.char_code_index += 1;
- if (t.char_code_index >= t.char_code_end) {
- if (t.unicode) {
- if (t.char_code > 0x10ffff) {
- tokenize_error(&t, "unicode value out of range: %x", t.char_code);
- break;
- }
- if (t.cur_tok->id == TokenIdCharLiteral) {
- t.cur_tok->data.char_lit.c = t.char_code;
- t.state = TokenizeStateCharLiteralEnd;
- } else if (t.char_code <= 0x7f) {
- // 00000000 00000000 00000000 0xxxxxxx
- handle_string_escape(&t, (uint8_t)t.char_code);
- } else if (t.char_code <= 0x7ff) {
- // 00000000 00000000 00000xxx xx000000
- handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
- // 00000000 00000000 00000000 00xxxxxx
- handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- } else if (t.char_code <= 0xffff) {
- // 00000000 00000000 xxxx0000 00000000
- handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
- // 00000000 00000000 0000xxxx xx000000
- handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
- // 00000000 00000000 00000000 00xxxxxx
- handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- } else if (t.char_code <= 0x10ffff) {
- // 00000000 000xxx00 00000000 00000000
- handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
- // 00000000 000000xx xxxx0000 00000000
- handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
- // 00000000 00000000 0000xxxx xx000000
- handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
- // 00000000 00000000 00000000 00xxxxxx
- handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
- }
- } else {
- assert(t.char_code <= 255);
- handle_string_escape(&t, (uint8_t)t.char_code);
- }
+ if (!t.unicode && t.char_code_index >= 2) {
+ assert(t.char_code <= 255);
+ handle_string_escape(&t, (uint8_t)t.char_code);
}
}
break;
@@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) {
tokenize_error(&t, "unterminated string");
break;
case TokenizeStateStringEscape:
+ case TokenizeStateStringEscapeUnicodeStart:
case TokenizeStateCharCode:
if (t.cur_tok->id == TokenIdStringLiteral) {
tokenize_error(&t, "unterminated string");
diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig
index 0f5789dc34..f6f3363bf6 100644
--- a/std/zig/parser_test.zig
+++ b/std/zig/parser_test.zig
@@ -80,7 +80,7 @@ test "zig fmt: enum literal inside array literal" {
test "zig fmt: character literal larger than u8" {
try testCanonical(
- \\const x = '\U01f4a9';
+ \\const x = '\u{01f4a9}';
\\
);
}
diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig
index fb4827da86..71765e2025 100644
--- a/std/zig/tokenizer.zig
+++ b/std/zig/tokenizer.zig
@@ -240,6 +240,9 @@ pub const Tokenizer = struct {
CharLiteral,
CharLiteralBackslash,
CharLiteralHexEscape,
+ CharLiteralUnicodeEscapeSawU,
+ CharLiteralUnicodeEscape,
+ CharLiteralUnicodeInvalid,
CharLiteralEnd,
Backslash,
Equal,
@@ -296,7 +299,6 @@ pub const Tokenizer = struct {
.end = undefined,
};
var seen_escape_digits: usize = undefined;
- var expected_escape_digits: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@@ -661,17 +663,9 @@ pub const Tokenizer = struct {
'x' => {
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
- expected_escape_digits = 2;
},
'u' => {
- state = State.CharLiteralHexEscape;
- seen_escape_digits = 0;
- expected_escape_digits = 4;
- },
- 'U' => {
- state = State.CharLiteralHexEscape;
- seen_escape_digits = 0;
- expected_escape_digits = 6;
+ state = State.CharLiteralUnicodeEscapeSawU;
},
else => {
state = State.CharLiteralEnd;
@@ -679,9 +673,9 @@ pub const Tokenizer = struct {
},
State.CharLiteralHexEscape => switch (c) {
- '0'...'9', 'a'...'z', 'A'...'F' => {
+ '0'...'9', 'a'...'f', 'A'...'F' => {
seen_escape_digits += 1;
- if (seen_escape_digits == expected_escape_digits) {
+ if (seen_escape_digits == 2) {
state = State.CharLiteralEnd;
}
},
@@ -691,6 +685,43 @@ pub const Tokenizer = struct {
},
},
+ State.CharLiteralUnicodeEscapeSawU => switch (c) {
+ '{' => {
+ state = State.CharLiteralUnicodeEscape;
+ seen_escape_digits = 0;
+ },
+ else => {
+ result.id = Token.Id.Invalid;
+ state = State.CharLiteralUnicodeInvalid;
+ },
+ },
+
+ State.CharLiteralUnicodeEscape => switch (c) {
+ '0'...'9', 'a'...'f', 'A'...'F' => {
+ seen_escape_digits += 1;
+ },
+ '}' => {
+ if (seen_escape_digits == 0) {
+ result.id = Token.Id.Invalid;
+ state = State.CharLiteralUnicodeInvalid;
+ } else {
+ state = State.CharLiteralEnd;
+ }
+ },
+ else => {
+ result.id = Token.Id.Invalid;
+ state = State.CharLiteralUnicodeInvalid;
+ },
+ },
+
+ State.CharLiteralUnicodeInvalid => switch (c) {
+ // Keep consuming characters until an obvious stopping point.
+ // This consolidates e.g. `'\u{0ab1Q}` into a single invalid token
+ // instead of creating the tokens `'\u{0ab1`, `Q`, `}`
+ '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
+ else => break,
+ },
+
State.CharLiteralEnd => switch (c) {
'\'' => {
result.id = Token.Id.CharLiteral;
@@ -1052,6 +1083,9 @@ pub const Tokenizer = struct {
State.CharLiteral,
State.CharLiteralBackslash,
State.CharLiteralHexEscape,
+ State.CharLiteralUnicodeEscapeSawU,
+ State.CharLiteralUnicodeEscape,
+ State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd,
State.StringLiteralBackslash,
State.LBracketStar,
@@ -1205,7 +1239,60 @@ test "tokenizer - unknown length pointer and then c pointer" {
test "tokenizer - char literal with hex escape" {
testTokenize(
\\'\x1b'
- , [_]Token.Id{Token.Id.CharLiteral});
+ , [_]Token.Id{.CharLiteral});
+ testTokenize(
+ \\'\x1'
+ , [_]Token.Id{ .Invalid, .Invalid });
+}
+
+test "tokenizer - char literal with unicode escapes" {
+ // Valid unicode escapes (this tokenizer checks only the syntax, not the code point range)
+ testTokenize(
+ \\'\u{3}'
+ , [_]Token.Id{.CharLiteral});
+ testTokenize(
+ \\'\u{01}'
+ , [_]Token.Id{.CharLiteral});
+ testTokenize(
+ \\'\u{2a}'
+ , [_]Token.Id{.CharLiteral});
+ testTokenize(
+ \\'\u{3f9}'
+ , [_]Token.Id{.CharLiteral});
+ testTokenize(
+ \\'\u{6E09aBc1523}'
+ , [_]Token.Id{.CharLiteral});
+ testTokenize(
+ \\"\u{440}"
+ , [_]Token.Id{.StringLiteral});
+
+ // Invalid unicode escapes
+ testTokenize(
+ \\'\u'
+ , [_]Token.Id{.Invalid});
+ testTokenize(
+ \\'\u{{'
+ , [_]Token.Id{ .Invalid, .Invalid });
+ testTokenize(
+ \\'\u{}'
+ , [_]Token.Id{ .Invalid, .Invalid });
+ testTokenize(
+ \\'\u{s}'
+ , [_]Token.Id{ .Invalid, .Invalid });
+ testTokenize(
+ \\'\u{2z}'
+ , [_]Token.Id{ .Invalid, .Invalid });
+ testTokenize(
+ \\'\u{4a'
+ , [_]Token.Id{.Invalid});
+
+ // Old-style unicode escapes (\uNNNN and \UNNNNNN) must no longer tokenize as char literals
+ testTokenize(
+ \\'\u0333'
+ , [_]Token.Id{ .Invalid, .Invalid });
+ testTokenize(
+ \\'\U0333'
+ , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}
test "tokenizer - float literal e exponent" {
diff --git a/test/compile_errors.zig b/test/compile_errors.zig
index df4e38583c..9967770931 100644
--- a/test/compile_errors.zig
+++ b/test/compile_errors.zig
@@ -5414,6 +5414,24 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
"tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported",
);
+ cases.add(
+ "invalid legacy unicode escape",
+ \\export fn entry() void {
+ \\ const a = '\U1234';
+ \\}
+ ,
+ "tmp.zig:2:17: error: invalid character: 'U'",
+ );
+
+ cases.add(
+ "invalid empty unicode escape",
+ \\export fn entry() void {
+ \\ const a = '\u{}';
+ \\}
+ ,
+ "tmp.zig:2:19: error: empty unicode escape sequence",
+ );
+
cases.add(
"non-printable invalid character",
"\xff\xfe" ++
diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig
index d499df4cb7..ab58f2ed08 100644
--- a/test/stage1/behavior/misc.zig
+++ b/test/stage1/behavior/misc.zig
@@ -189,7 +189,7 @@ test "string escapes" {
expect(mem.eql(u8, "\r", "\x0d"));
expect(mem.eql(u8, "\t", "\x09"));
expect(mem.eql(u8, "\\", "\x5c"));
- expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69"));
+ expect(mem.eql(u8, "\u{1234}\u{069}\u{1}", "\xe1\x88\xb4\x69\x01"));
}
test "multiline string" {
@@ -695,7 +695,7 @@ test "thread local variable" {
}
test "unicode escape in character literal" {
- var a: u24 = '\U01f4a9';
+ var a: u24 = '\u{01f4a9}';
expect(a == 128169);
}