From 6bfa8546bbdf6dd644a65876135893339b767bba Mon Sep 17 00:00:00 2001
From: hryx <codroid@gmail.com>
Date: Thu, 4 Jul 2019 22:40:19 -0700
Subject: [PATCH] Unicode escapes: stage1 tokenizer and behavior tests

---
 src/tokenizer.cpp             | 115 ++++++++++++++++++----------------
 test/compile_errors.zig       |  18 ++++++
 test/stage1/behavior/misc.zig |   4 +-
 3 files changed, 81 insertions(+), 56 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index a0acde52e9..4358146f24 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -190,6 +190,7 @@ enum TokenizeState {
     TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
     TokenizeStateString,
     TokenizeStateStringEscape,
+    TokenizeStateStringEscapeUnicodeStart,
     TokenizeStateCharLiteral,
     TokenizeStateCharLiteralEnd,
     TokenizeStateSawStar,
@@ -241,7 +242,6 @@ struct Tokenize {
     int32_t exp_add_amt;
     bool is_exp_negative;
     size_t char_code_index;
-    size_t char_code_end;
     bool unicode;
     uint32_t char_code;
     int exponent_in_bin_or_dec;
@@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
                         t.radix = 16;
                         t.char_code = 0;
                         t.char_code_index = 0;
-                        t.char_code_end = 2;
                         t.unicode = false;
                         break;
                     case 'u':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 4;
-                        t.unicode = true;
-                        break;
-                    case 'U':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 6;
-                        t.unicode = true;
+                        t.state = TokenizeStateStringEscapeUnicodeStart;
                         break;
                     case 'n':
                         handle_string_escape(&t, '\n');
@@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
                         invalid_char_error(&t, c);
                 }
                 break;
+            case TokenizeStateStringEscapeUnicodeStart: // reached after the 'u' escape character; only '{' may follow
+                switch (c) {
+                    case '{':
+                        t.state = TokenizeStateCharCode; // the hex digits and the closing '}' are consumed in TokenizeStateCharCode
+                        t.radix = 16;
+                        t.char_code = 0; // value accumulated from the digits so far
+                        t.char_code_index = 0; // count of digits seen; still zero at '}' is reported as an empty escape
+                        t.unicode = true; // selects the '}'-terminated path in TokenizeStateCharCode
+                        break;
+                    default:
+                        invalid_char_error(&t, c); // any character other than '{' is rejected here
+                }
+                break;
             case TokenizeStateCharCode:
                 {
+                    if (t.unicode && (c == '}' || t.char_code > 0x10ffff)) { // '}' ends the escape; a value already past U+10FFFF is rejected right away so extra digits cannot wrap the uint32_t char_code
+                        if (t.char_code_index == 0) {
+                            tokenize_error(&t, "empty unicode escape sequence");
+                            break;
+                        }
+                        if (t.char_code > 0x10ffff) { // reached at '}' or on the first character after the value went out of range
+                            tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                            break;
+                        }
+                        if (t.cur_tok->id == TokenIdCharLiteral) {
+                            t.cur_tok->data.char_lit.c = t.char_code;
+                            t.state = TokenizeStateCharLiteralEnd;
+                        } else if (t.char_code <= 0x7f) {
+                            // 00000000 00000000 00000000 0xxxxxxx
+                            handle_string_escape(&t, (uint8_t)t.char_code);
+                        } else if (t.char_code <= 0x7ff) {
+                            // 00000000 00000000 00000xxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0xffff) {
+                            // 00000000 00000000 xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0x10ffff) {
+                            // 00000000 000xxx00 00000000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
+                            // 00000000 000000xx xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else {
+                            zig_unreachable();
+                        }
+                        break;
+                    }
+
                     uint32_t digit_value = get_digit_value(c);
                     if (digit_value >= t.radix) {
                         tokenize_error(&t, "invalid digit: '%c'", c);
@@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) {
                     t.char_code += digit_value;
                     t.char_code_index += 1;
 
-                    if (t.char_code_index >= t.char_code_end) {
-                        if (t.unicode) {
-                            if (t.char_code > 0x10ffff) {
-                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
-                                break;
-                            }
-                            if (t.cur_tok->id == TokenIdCharLiteral) {
-                                t.cur_tok->data.char_lit.c = t.char_code;
-                                t.state = TokenizeStateCharLiteralEnd;
-                            } else if (t.char_code <= 0x7f) {
-                                // 00000000 00000000 00000000 0xxxxxxx
-                                handle_string_escape(&t, (uint8_t)t.char_code);
-                            } else if (t.char_code <= 0x7ff) {
-                                // 00000000 00000000 00000xxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0xffff) {
-                                // 00000000 00000000 xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0x10ffff) {
-                                // 00000000 000xxx00 00000000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
-                                // 00000000 000000xx xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            }
-                        } else {
-                            assert(t.char_code <= 255);
-                            handle_string_escape(&t, (uint8_t)t.char_code);
-                        }
+                    if (!t.unicode && t.char_code_index >= 2) { // non-unicode escapes are fixed-width: finish once two digits have been read
+                        assert(t.char_code <= 255);
+                        handle_string_escape(&t, (uint8_t)t.char_code);
                     }
                 }
                 break;
@@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) {
             tokenize_error(&t, "unterminated string");
             break;
         case TokenizeStateStringEscape:
+        case TokenizeStateStringEscapeUnicodeStart:
         case TokenizeStateCharCode:
             if (t.cur_tok->id == TokenIdStringLiteral) {
                 tokenize_error(&t, "unterminated string");
diff --git a/test/compile_errors.zig b/test/compile_errors.zig
index df4e38583c..9967770931 100644
--- a/test/compile_errors.zig
+++ b/test/compile_errors.zig
@@ -5414,6 +5414,24 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
         "tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported",
     );
 
+    cases.add(
+        "invalid legacy unicode escape",
+        \\export fn entry() void {
+        \\    const a = '\U1234';
+        \\}
+    ,
+        "tmp.zig:2:17: error: invalid character: 'U'",
+    );
+
+    cases.add(
+        "invalid empty unicode escape",
+        \\export fn entry() void {
+        \\    const a = '\u{}';
+        \\}
+    ,
+        "tmp.zig:2:19: error: empty unicode escape sequence",
+    );
+
     cases.add(
         "non-printable invalid character",
         "\xff\xfe" ++
diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig
index d499df4cb7..ab58f2ed08 100644
--- a/test/stage1/behavior/misc.zig
+++ b/test/stage1/behavior/misc.zig
@@ -189,7 +189,7 @@ test "string escapes" {
     expect(mem.eql(u8, "\r", "\x0d"));
     expect(mem.eql(u8, "\t", "\x09"));
     expect(mem.eql(u8, "\\", "\x5c"));
-    expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69"));
+    expect(mem.eql(u8, "\u{1234}\u{069}\u{1}", "\xe1\x88\xb4\x69\x01"));
 }
 
 test "multiline string" {
@@ -695,7 +695,7 @@ test "thread local variable" {
 }
 
 test "unicode escape in character literal" {
-    var a: u24 = '\U01f4a9';
+    var a: u24 = '\u{01f4a9}';
     expect(a == 128169);
 }