From 6bfa8546bbdf6dd644a65876135893339b767bba Mon Sep 17 00:00:00 2001
From: hryx
Date: Thu, 4 Jul 2019 22:40:19 -0700
Subject: [PATCH] Unicode escapes: stage1 tokenizer and behavior tests

---
Review notes (this section is ignored by `git am`):

 * Replaces the fixed-width \uNNNN and \UNNNNNN escapes with the
   variable-width \u{N...} form in the stage1 tokenizer.
 * Because \u{...} accepts any number of hex digits, the 32-bit char_code
   accumulator could wrap (e.g. '\u{100000041}' was accepted as 0x41).
   The range check now also runs after every digit of a unicode escape, so
   the accumulator never exceeds 0x10ffff before the next multiply.
 * Hunk counts and the diffstat were updated for the added lines; the blob
   hashes on the "index" lines still describe the original revision.

 src/tokenizer.cpp             | 121 ++++++++++++++++++----------------
 test/compile_errors.zig       |  27 ++++++++
 test/stage1/behavior/misc.zig |   4 +-
 3 files changed, 96 insertions(+), 56 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index a0acde52e9..4358146f24 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -190,6 +190,7 @@ enum TokenizeState {
     TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
     TokenizeStateString,
     TokenizeStateStringEscape,
+    TokenizeStateStringEscapeUnicodeStart,
     TokenizeStateCharLiteral,
     TokenizeStateCharLiteralEnd,
     TokenizeStateSawStar,
@@ -241,7 +242,6 @@ struct Tokenize {
     int32_t exp_add_amt;
     bool is_exp_negative;
     size_t char_code_index;
-    size_t char_code_end;
     bool unicode;
     uint32_t char_code;
     int exponent_in_bin_or_dec;
@@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
                         t.radix = 16;
                         t.char_code = 0;
                         t.char_code_index = 0;
-                        t.char_code_end = 2;
                         t.unicode = false;
                         break;
                     case 'u':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 4;
-                        t.unicode = true;
-                        break;
-                    case 'U':
-                        t.state = TokenizeStateCharCode;
-                        t.radix = 16;
-                        t.char_code = 0;
-                        t.char_code_index = 0;
-                        t.char_code_end = 6;
-                        t.unicode = true;
+                        t.state = TokenizeStateStringEscapeUnicodeStart;
                         break;
                     case 'n':
                         handle_string_escape(&t, '\n');
@@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
                         invalid_char_error(&t, c);
                 }
                 break;
+            case TokenizeStateStringEscapeUnicodeStart:
+                switch (c) {
+                    case '{':
+                        t.state = TokenizeStateCharCode;
+                        t.radix = 16;
+                        t.char_code = 0;
+                        t.char_code_index = 0;
+                        t.unicode = true;
+                        break;
+                    default:
+                        invalid_char_error(&t, c);
+                }
+                break;
             case TokenizeStateCharCode:
                 {
+                    if (t.unicode && c == '}') {
+                        if (t.char_code_index == 0) {
+                            tokenize_error(&t, "empty unicode escape sequence");
+                            break;
+                        }
+                        if (t.char_code > 0x10ffff) {
+                            tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                            break;
+                        }
+                        if (t.cur_tok->id == TokenIdCharLiteral) {
+                            t.cur_tok->data.char_lit.c = t.char_code;
+                            t.state = TokenizeStateCharLiteralEnd;
+                        } else if (t.char_code <= 0x7f) {
+                            // 00000000 00000000 00000000 0xxxxxxx
+                            handle_string_escape(&t, (uint8_t)t.char_code);
+                        } else if (t.char_code <= 0x7ff) {
+                            // 00000000 00000000 00000xxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0xffff) {
+                            // 00000000 00000000 xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else if (t.char_code <= 0x10ffff) {
+                            // 00000000 000xxx00 00000000 00000000
+                            handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
+                            // 00000000 000000xx xxxx0000 00000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
+                            // 00000000 00000000 0000xxxx xx000000
+                            handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
+                            // 00000000 00000000 00000000 00xxxxxx
+                            handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
+                        } else {
+                            zig_unreachable();
+                        }
+                        break;
+                    }
+
                     uint32_t digit_value = get_digit_value(c);
                     if (digit_value >= t.radix) {
                         tokenize_error(&t, "invalid digit: '%c'", c);
@@ -1123,44 +1164,15 @@ void tokenize(Buf *buf, Tokenization *out) {
                     t.char_code += digit_value;
                     t.char_code_index += 1;
 
-                    if (t.char_code_index >= t.char_code_end) {
-                        if (t.unicode) {
-                            if (t.char_code > 0x10ffff) {
-                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
-                                break;
-                            }
-                            if (t.cur_tok->id == TokenIdCharLiteral) {
-                                t.cur_tok->data.char_lit.c = t.char_code;
-                                t.state = TokenizeStateCharLiteralEnd;
-                            } else if (t.char_code <= 0x7f) {
-                                // 00000000 00000000 00000000 0xxxxxxx
-                                handle_string_escape(&t, (uint8_t)t.char_code);
-                            } else if (t.char_code <= 0x7ff) {
-                                // 00000000 00000000 00000xxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0xffff) {
-                                // 00000000 00000000 xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else if (t.char_code <= 0x10ffff) {
-                                // 00000000 000xxx00 00000000 00000000
-                                handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
-                                // 00000000 000000xx xxxx0000 00000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
-                                // 00000000 00000000 0000xxxx xx000000
-                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
-                                // 00000000 00000000 00000000 00xxxxxx
-                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            }
-                        } else {
-                            assert(t.char_code <= 255);
-                            handle_string_escape(&t, (uint8_t)t.char_code);
-                        }
+                    if (t.unicode && t.char_code > 0x10ffff) {
+                        // Reject as soon as the value leaves the Unicode range so that
+                        // further digits cannot wrap the 32-bit accumulator.
+                        tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                        break;
+                    }
+                    if (!t.unicode && t.char_code_index >= 2) {
+                        assert(t.char_code <= 255);
+                        handle_string_escape(&t, (uint8_t)t.char_code);
                     }
                 }
                 break;
@@ -1409,6 +1421,7 @@ void tokenize(Buf *buf, Tokenization *out) {
             tokenize_error(&t, "unterminated string");
             break;
         case TokenizeStateStringEscape:
+        case TokenizeStateStringEscapeUnicodeStart:
         case TokenizeStateCharCode:
             if (t.cur_tok->id == TokenIdStringLiteral) {
                 tokenize_error(&t, "unterminated string");
diff --git a/test/compile_errors.zig b/test/compile_errors.zig
index df4e38583c..9967770931 100644
--- a/test/compile_errors.zig
+++ b/test/compile_errors.zig
@@ -5414,6 +5414,33 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
         "tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported",
     );
 
+    cases.add(
+        "invalid legacy unicode escape",
+        \\export fn entry() void {
+        \\    const a = '\U1234';
+        \\}
+    ,
+        "tmp.zig:2:17: error: invalid character: 'U'",
+    );
+
+    cases.add(
+        "invalid empty unicode escape",
+        \\export fn entry() void {
+        \\    const a = '\u{}';
+        \\}
+    ,
+        "tmp.zig:2:19: error: empty unicode escape sequence",
+    );
+
+    cases.add(
+        "invalid unicode escape wider than 32 bits",
+        \\export fn entry() void {
+        \\    const a = '\u{100000041}';
+        \\}
+    ,
+        "tmp.zig:2:25: error: unicode value out of range: 1000000",
+    );
+
     cases.add(
         "non-printable invalid character",
         "\xff\xfe" ++
diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig
index d499df4cb7..ab58f2ed08 100644
--- a/test/stage1/behavior/misc.zig
+++ b/test/stage1/behavior/misc.zig
@@ -189,7 +189,7 @@ test "string escapes" {
     expect(mem.eql(u8, "\r", "\x0d"));
     expect(mem.eql(u8, "\t", "\x09"));
     expect(mem.eql(u8, "\\", "\x5c"));
-    expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69"));
+    expect(mem.eql(u8, "\u{1234}\u{069}\u{1}", "\xe1\x88\xb4\x69\x01"));
 }
 
 test "multiline string" {
@@ -695,7 +695,7 @@ test "thread local variable" {
 }
 
 test "unicode escape in character literal" {
-    var a: u24 = '\U01f4a9';
+    var a: u24 = '\u{01f4a9}';
     expect(a == 128169);
 }
 