Merge pull request #2823 from hryx/unicode-escape

Unicode escapes: support u{N...}
2026-02-21 16:54:52 +00:00 · 2019-07-06 13:14:43 -04:00 · 2019-07-06 13:14:43 -04:00 · 21c60922e3
commit 21c60922e3
parent 7f618184ad e35d49c4d0
6 changed files with 186 additions and 79 deletions
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@ -566,7 +566,7 @@ test "string literals" {
    assert(normal_bytes.len == 5);
    assert(normal_bytes[1] == 'e');
    assert('e' == '\x65');
-    assert('\U01f4a9' == 128169);
+    assert('\u{1f4a9}' == 128169);
    assert(mem.eql(u8, "hello", "h\x65llo"));

    // A C string literal is a null terminated pointer.
@ -616,12 +616,8 @@ test "string literals" {
          <td>hexadecimal 8-bit character code (2 digits)</td>
        </tr>
        <tr>
-            <td><code>\uNNNN</code></td>
-          <td>hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)</td>
-        </tr>
-        <tr>
-            <td><code>\UNNNNNN</code></td>
-          <td>hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)</td>
+            <td><code>\u{NNNNNN}</code></td>
+          <td>hexadecimal Unicode code point UTF-8 encoded (1 or more digits)</td>
        </tr>
      </table>
      </div>
@ -10011,8 +10007,7 @@ eof &lt;- !.
 hex &lt;- [0-9a-fA-F]
 char_escape
    &lt;- "\\x" hex hex
-     / "\\u" hex hex hex hex
-     / "\\U" hex hex hex hex hex hex
+     / "\\u{" hex+ "}"
     / "\\" [nr\\t'"]
 char_char
    &lt;- char_escape
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -190,6 +190,7 @@ enum TokenizeState {
    TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
    TokenizeStateString,
    TokenizeStateStringEscape,
+    TokenizeStateStringEscapeUnicodeStart,
    TokenizeStateCharLiteral,
    TokenizeStateCharLiteralEnd,
    TokenizeStateSawStar,
@ -241,7 +242,6 @@ struct Tokenize {
    int32_t exp_add_amt;
    bool is_exp_negative;
    size_t char_code_index;
-    size_t char_code_end;
    bool unicode;
    uint32_t char_code;
    int exponent_in_bin_or_dec;
@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
                        t.radix = 16;
                        t.char_code = 0;
                        t.char_code_index = 0;
-                        t.char_code_end = 2;
                        t.unicode = false;
                        break;
                    case 'u':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 4;
-                        t.unicode = true;
-                        break;
-                    case 'U':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 6;
-                        t.unicode = true;
+                        t.state = TokenizeStateStringEscapeUnicodeStart;
                        break;
                    case 'n':
                        handle_string_escape(&t, '\n');
@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
                        invalid_char_error(&t, c);
                }
                break;
+            case TokenizeStateStringEscapeUnicodeStart:
+                switch (c) {
+                    case '{':
+                        t.state = TokenizeStateCharCode;
+                        t.radix = 16;
+                        t.char_code = 0;
+                        t.char_code_index = 0;
+                        t.unicode = true;
+                        break;
+                    default:
+                        invalid_char_error(&t, c);
+                }
+                break;
            case TokenizeStateCharCode:
                {
+                    if (t.unicode && c == '}') {
+                        if (t.char_code_index == 0) {
+                            tokenize_error(&t, "empty unicode escape sequence");
+                            break;
+                        }
+                        if (t.char_code > 0x10ffff) {
+                            tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                            break;
+                        }
+                        if (t.cur_tok->id == TokenIdCharLiteral) {
+                            t.cur_tok->data.char_lit.c = t.char_code;
+                            t.state = TokenizeStateCharLiteralEnd;
+                        } else if (t.char_code <= 0x7f) {
+                            // 00000000 00000000 00000000 0xxxxxxx
+                            handle_string_escape(&t, (uint8_t)t.char_code);
+                        } else if (t.char_code <= 0x7ff) {
+                            // 00000000 00000000 00000xxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0xffff) {
+                            // 00000000 00000000 xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0x10ffff) {
+                            // 00000000 000xxx00 00000000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
+                            // 00000000 000000xx xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else {
+                            zig_unreachable();
+                        }
+                        break;
+                    }
+
                    uint32_t digit_value = get_digit_value(c);
                    if (digit_value >= t.radix) {
                        tokenize_error(&t, "invalid digit: '%c'", c);
@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) {
                    t.char_code += digit_value;
                    t.char_code_index += 1;

-                    if (t.char_code_index >= t.char_code_end) {
-                        if (t.unicode) {
-                            if (t.char_code > 0x10ffff) {
-                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
-                                break;
-                            }
-                            if (t.cur_tok->id == TokenIdCharLiteral) {
-                                t.cur_tok->data.char_lit.c = t.char_code;
-                                t.state = TokenizeStateCharLiteralEnd;
-                            } else if (t.char_code <= 0x7f) {
-                                // 00000000 00000000 00000000 0xxxxxxx
-                                handle_string_escape(&t, (uint8_t)t.char_code);
-                            } else if (t.char_code <= 0x7ff) {
-                                // 00000000 00000000 00000xxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0xffff) {
-                                // 00000000 00000000 xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0x10ffff) {
-                                // 00000000 000xxx00 00000000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
-                                // 00000000 000000xx xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            }
-                        } else {
-                            assert(t.char_code <= 255);
-                            handle_string_escape(&t, (uint8_t)t.char_code);
-                        }
+                    if (!t.unicode && t.char_code_index >= 2) {
+                        assert(t.char_code <= 255);
+                        handle_string_escape(&t, (uint8_t)t.char_code);
                    }
                }
                break;
@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) {
            tokenize_error(&t, "unterminated string");
            break;
        case TokenizeStateStringEscape:
+        case TokenizeStateStringEscapeUnicodeStart:
        case TokenizeStateCharCode:
            if (t.cur_tok->id == TokenIdStringLiteral) {
                tokenize_error(&t, "unterminated string");
--- a/std/zig/parser_test.zig
+++ b/std/zig/parser_test.zig
@ -80,7 +80,7 @@ test "zig fmt: enum literal inside array literal" {

 test "zig fmt: character literal larger than u8" {
    try testCanonical(
-        \\const x = '\U01f4a9';
+        \\const x = '\u{01f4a9}';
        \\
    );
 }
--- a/std/zig/tokenizer.zig
+++ b/std/zig/tokenizer.zig
@ -240,6 +240,9 @@ pub const Tokenizer = struct {
        CharLiteral,
        CharLiteralBackslash,
        CharLiteralHexEscape,
+        CharLiteralUnicodeEscapeSawU,
+        CharLiteralUnicodeEscape,
+        CharLiteralUnicodeInvalid,
        CharLiteralEnd,
        Backslash,
        Equal,
@ -296,7 +299,6 @@ pub const Tokenizer = struct {
            .end = undefined,
        };
        var seen_escape_digits: usize = undefined;
-        var expected_escape_digits: usize = undefined;
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
@ -661,17 +663,9 @@ pub const Tokenizer = struct {
                    'x' => {
                        state = State.CharLiteralHexEscape;
                        seen_escape_digits = 0;
-                        expected_escape_digits = 2;
                    },
                    'u' => {
-                        state = State.CharLiteralHexEscape;
-                        seen_escape_digits = 0;
-                        expected_escape_digits = 4;
-                    },
-                    'U' => {
-                        state = State.CharLiteralHexEscape;
-                        seen_escape_digits = 0;
-                        expected_escape_digits = 6;
+                        state = State.CharLiteralUnicodeEscapeSawU;
                    },
                    else => {
                        state = State.CharLiteralEnd;
@ -679,9 +673,9 @@ pub const Tokenizer = struct {
                },

                State.CharLiteralHexEscape => switch (c) {
-                    '0'...'9', 'a'...'z', 'A'...'F' => {
+                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
-                        if (seen_escape_digits == expected_escape_digits) {
+                        if (seen_escape_digits == 2) {
                            state = State.CharLiteralEnd;
                        }
                    },
@ -691,6 +685,43 @@ pub const Tokenizer = struct {
                    },
                },

+                State.CharLiteralUnicodeEscapeSawU => switch (c) {
+                    '{' => {
+                        state = State.CharLiteralUnicodeEscape;
+                        seen_escape_digits = 0;
+                    },
+                    else => {
+                        result.id = Token.Id.Invalid;
+                        state = State.CharLiteralUnicodeInvalid;
+                    },
+                },
+
+                State.CharLiteralUnicodeEscape => switch (c) {
+                    '0'...'9', 'a'...'f', 'A'...'F' => {
+                        seen_escape_digits += 1;
+                    },
+                    '}' => {
+                        if (seen_escape_digits == 0) {
+                            result.id = Token.Id.Invalid;
+                            state = State.CharLiteralUnicodeInvalid;
+                        } else {
+                            state = State.CharLiteralEnd;
+                        }
+                    },
+                    else => {
+                        result.id = Token.Id.Invalid;
+                        state = State.CharLiteralUnicodeInvalid;
+                    },
+                },
+
+                State.CharLiteralUnicodeInvalid => switch (c) {
+                    // Keep consuming characters until an obvious stopping point.
+                    // This consolidates e.g. `'\u{0ab1Q}` into a single invalid token
+                    // instead of creating the tokens `'\u{0ab1`, `Q`, `}`
+                    '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
+                    else => break,
+                },
+
                State.CharLiteralEnd => switch (c) {
                    '\'' => {
                        result.id = Token.Id.CharLiteral;
@ -1052,6 +1083,9 @@ pub const Tokenizer = struct {
                State.CharLiteral,
                State.CharLiteralBackslash,
                State.CharLiteralHexEscape,
+                State.CharLiteralUnicodeEscapeSawU,
+                State.CharLiteralUnicodeEscape,
+                State.CharLiteralUnicodeInvalid,
                State.CharLiteralEnd,
                State.StringLiteralBackslash,
                State.LBracketStar,
@ -1205,7 +1239,60 @@ test "tokenizer - unknown length pointer and then c pointer" {
 test "tokenizer - char literal with hex escape" {
    testTokenize(
        \\'\x1b'
-    , [_]Token.Id{Token.Id.CharLiteral});
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\x1'
+    , [_]Token.Id{ .Invalid, .Invalid });
+}
+
+test "tokenizer - char literal with unicode escapes" {
+    // Valid unicode escapes
+    testTokenize(
+        \\'\u{3}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{01}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{2a}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{3f9}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\'\u{6E09aBc1523}'
+    , [_]Token.Id{.CharLiteral});
+    testTokenize(
+        \\"\u{440}"
+    , [_]Token.Id{.StringLiteral});
+
+    // Invalid unicode escapes
+    testTokenize(
+        \\'\u'
+    , [_]Token.Id{.Invalid});
+    testTokenize(
+        \\'\u{{'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{}'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{s}'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{2z}'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\u{4a'
+    , [_]Token.Id{.Invalid});
+
+    // Test old-style unicode literals
+    testTokenize(
+        \\'\u0333'
+    , [_]Token.Id{ .Invalid, .Invalid });
+    testTokenize(
+        \\'\U0333'
+    , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
 }

 test "tokenizer - float literal e exponent" {
--- a/test/compile_errors.zig
+++ b/test/compile_errors.zig
@ -5414,6 +5414,24 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
        "tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported",
    );

+    cases.add(
+        "invalid legacy unicode escape",
+        \\export fn entry() void {
+        \\    const a = '\U1234';
+        \\}
+    ,
+        "tmp.zig:2:17: error: invalid character: 'U'",
+    );
+
+    cases.add(
+        "invalid empty unicode escape",
+        \\export fn entry() void {
+        \\    const a = '\u{}';
+        \\}
+    ,
+        "tmp.zig:2:19: error: empty unicode escape sequence",
+    );
+
    cases.add(
        "non-printable invalid character",
        "\xff\xfe" ++
--- a/test/stage1/behavior/misc.zig
+++ b/test/stage1/behavior/misc.zig
@ -189,7 +189,7 @@ test "string escapes" {
    expect(mem.eql(u8, "\r", "\x0d"));
    expect(mem.eql(u8, "\t", "\x09"));
    expect(mem.eql(u8, "\\", "\x5c"));
-    expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69"));
+    expect(mem.eql(u8, "\u{1234}\u{069}\u{1}", "\xe1\x88\xb4\x69\x01"));
 }

 test "multiline string" {
@ -695,7 +695,7 @@ test "thread local variable" {
 }

 test "unicode escape in character literal" {
-    var a: u24 = '\U01f4a9';
+    var a: u24 = '\u{01f4a9}';
    expect(a == 128169);
 }