mirror of
https://github.com/ziglang/zig.git
synced 2026-01-03 20:13:21 +00:00
Merge pull request #2823 from hryx/unicode-escape
Unicode escapes: support u{N...}
This commit is contained in:
commit
21c60922e3
@ -566,7 +566,7 @@ test "string literals" {
|
||||
assert(normal_bytes.len == 5);
|
||||
assert(normal_bytes[1] == 'e');
|
||||
assert('e' == '\x65');
|
||||
assert('\U01f4a9' == 128169);
|
||||
assert('\u{1f4a9}' == 128169);
|
||||
assert(mem.eql(u8, "hello", "h\x65llo"));
|
||||
|
||||
// A C string literal is a null terminated pointer.
|
||||
@ -616,12 +616,8 @@ test "string literals" {
|
||||
<td>hexadecimal 8-bit character code (2 digits)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>\uNNNN</code></td>
|
||||
<td>hexadecimal 16-bit Unicode character code UTF-8 encoded (4 digits)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>\UNNNNNN</code></td>
|
||||
<td>hexadecimal 24-bit Unicode character code UTF-8 encoded (6 digits)</td>
|
||||
<td><code>\u{NNNNNN}</code></td>
|
||||
<td>hexadecimal Unicode character code UTF-8 encoded (1 or more digits)</td>
|
||||
</tr>
|
||||
</table>
|
||||
</div>
|
||||
@ -10011,8 +10007,7 @@ eof <- !.
|
||||
hex <- [0-9a-fA-F]
|
||||
char_escape
|
||||
<- "\\x" hex hex
|
||||
/ "\\u" hex hex hex hex
|
||||
/ "\\U" hex hex hex hex hex hex
|
||||
/ "\\u{" hex+ "}"
|
||||
/ "\\" [nr\\t'"]
|
||||
char_char
|
||||
<- char_escape
|
||||
|
||||
@ -190,6 +190,7 @@ enum TokenizeState {
|
||||
TokenizeStateFloatExponentNumber, // "123.456e-", "123.456e5", "123.456e5e-5"
|
||||
TokenizeStateString,
|
||||
TokenizeStateStringEscape,
|
||||
TokenizeStateStringEscapeUnicodeStart,
|
||||
TokenizeStateCharLiteral,
|
||||
TokenizeStateCharLiteralEnd,
|
||||
TokenizeStateSawStar,
|
||||
@ -241,7 +242,6 @@ struct Tokenize {
|
||||
int32_t exp_add_amt;
|
||||
bool is_exp_negative;
|
||||
size_t char_code_index;
|
||||
size_t char_code_end;
|
||||
bool unicode;
|
||||
uint32_t char_code;
|
||||
int exponent_in_bin_or_dec;
|
||||
@ -1071,24 +1071,10 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
t.radix = 16;
|
||||
t.char_code = 0;
|
||||
t.char_code_index = 0;
|
||||
t.char_code_end = 2;
|
||||
t.unicode = false;
|
||||
break;
|
||||
case 'u':
|
||||
t.state = TokenizeStateCharCode;
|
||||
t.radix = 16;
|
||||
t.char_code = 0;
|
||||
t.char_code_index = 0;
|
||||
t.char_code_end = 4;
|
||||
t.unicode = true;
|
||||
break;
|
||||
case 'U':
|
||||
t.state = TokenizeStateCharCode;
|
||||
t.radix = 16;
|
||||
t.char_code = 0;
|
||||
t.char_code_index = 0;
|
||||
t.char_code_end = 6;
|
||||
t.unicode = true;
|
||||
t.state = TokenizeStateStringEscapeUnicodeStart;
|
||||
break;
|
||||
case 'n':
|
||||
handle_string_escape(&t, '\n');
|
||||
@ -1112,8 +1098,63 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
invalid_char_error(&t, c);
|
||||
}
|
||||
break;
|
||||
case TokenizeStateStringEscapeUnicodeStart:
|
||||
switch (c) {
|
||||
case '{':
|
||||
t.state = TokenizeStateCharCode;
|
||||
t.radix = 16;
|
||||
t.char_code = 0;
|
||||
t.char_code_index = 0;
|
||||
t.unicode = true;
|
||||
break;
|
||||
default:
|
||||
invalid_char_error(&t, c);
|
||||
}
|
||||
break;
|
||||
case TokenizeStateCharCode:
|
||||
{
|
||||
if (t.unicode && c == '}') {
|
||||
if (t.char_code_index == 0) {
|
||||
tokenize_error(&t, "empty unicode escape sequence");
|
||||
break;
|
||||
}
|
||||
if (t.char_code > 0x10ffff) {
|
||||
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
|
||||
break;
|
||||
}
|
||||
if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
t.cur_tok->data.char_lit.c = t.char_code;
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
} else if (t.char_code <= 0x7f) {
|
||||
// 00000000 00000000 00000000 0xxxxxxx
|
||||
handle_string_escape(&t, (uint8_t)t.char_code);
|
||||
} else if (t.char_code <= 0x7ff) {
|
||||
// 00000000 00000000 00000xxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
} else if (t.char_code <= 0xffff) {
|
||||
// 00000000 00000000 xxxx0000 00000000
|
||||
handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
|
||||
// 00000000 00000000 0000xxxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
} else if (t.char_code <= 0x10ffff) {
|
||||
// 00000000 000xxx00 00000000 00000000
|
||||
handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
|
||||
// 00000000 000000xx xxxx0000 00000000
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
|
||||
// 00000000 00000000 0000xxxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
} else {
|
||||
zig_unreachable();
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t digit_value = get_digit_value(c);
|
||||
if (digit_value >= t.radix) {
|
||||
tokenize_error(&t, "invalid digit: '%c'", c);
|
||||
@ -1123,44 +1164,9 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
t.char_code += digit_value;
|
||||
t.char_code_index += 1;
|
||||
|
||||
if (t.char_code_index >= t.char_code_end) {
|
||||
if (t.unicode) {
|
||||
if (t.char_code > 0x10ffff) {
|
||||
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
|
||||
break;
|
||||
}
|
||||
if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
t.cur_tok->data.char_lit.c = t.char_code;
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
} else if (t.char_code <= 0x7f) {
|
||||
// 00000000 00000000 00000000 0xxxxxxx
|
||||
handle_string_escape(&t, (uint8_t)t.char_code);
|
||||
} else if (t.char_code <= 0x7ff) {
|
||||
// 00000000 00000000 00000xxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
} else if (t.char_code <= 0xffff) {
|
||||
// 00000000 00000000 xxxx0000 00000000
|
||||
handle_string_escape(&t, (uint8_t)(0xe0 | (t.char_code >> 12)));
|
||||
// 00000000 00000000 0000xxxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
} else if (t.char_code <= 0x10ffff) {
|
||||
// 00000000 000xxx00 00000000 00000000
|
||||
handle_string_escape(&t, (uint8_t)(0xf0 | (t.char_code >> 18)));
|
||||
// 00000000 000000xx xxxx0000 00000000
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 12) & 0x3f)));
|
||||
// 00000000 00000000 0000xxxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
}
|
||||
} else {
|
||||
assert(t.char_code <= 255);
|
||||
handle_string_escape(&t, (uint8_t)t.char_code);
|
||||
}
|
||||
if (!t.unicode && t.char_code_index >= 2) {
|
||||
assert(t.char_code <= 255);
|
||||
handle_string_escape(&t, (uint8_t)t.char_code);
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -1409,6 +1415,7 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
tokenize_error(&t, "unterminated string");
|
||||
break;
|
||||
case TokenizeStateStringEscape:
|
||||
case TokenizeStateStringEscapeUnicodeStart:
|
||||
case TokenizeStateCharCode:
|
||||
if (t.cur_tok->id == TokenIdStringLiteral) {
|
||||
tokenize_error(&t, "unterminated string");
|
||||
|
||||
@ -80,7 +80,7 @@ test "zig fmt: enum literal inside array literal" {
|
||||
|
||||
test "zig fmt: character literal larger than u8" {
|
||||
try testCanonical(
|
||||
\\const x = '\U01f4a9';
|
||||
\\const x = '\u{01f4a9}';
|
||||
\\
|
||||
);
|
||||
}
|
||||
|
||||
@ -240,6 +240,9 @@ pub const Tokenizer = struct {
|
||||
CharLiteral,
|
||||
CharLiteralBackslash,
|
||||
CharLiteralHexEscape,
|
||||
CharLiteralUnicodeEscapeSawU,
|
||||
CharLiteralUnicodeEscape,
|
||||
CharLiteralUnicodeInvalid,
|
||||
CharLiteralEnd,
|
||||
Backslash,
|
||||
Equal,
|
||||
@ -296,7 +299,6 @@ pub const Tokenizer = struct {
|
||||
.end = undefined,
|
||||
};
|
||||
var seen_escape_digits: usize = undefined;
|
||||
var expected_escape_digits: usize = undefined;
|
||||
while (self.index < self.buffer.len) : (self.index += 1) {
|
||||
const c = self.buffer[self.index];
|
||||
switch (state) {
|
||||
@ -661,17 +663,9 @@ pub const Tokenizer = struct {
|
||||
'x' => {
|
||||
state = State.CharLiteralHexEscape;
|
||||
seen_escape_digits = 0;
|
||||
expected_escape_digits = 2;
|
||||
},
|
||||
'u' => {
|
||||
state = State.CharLiteralHexEscape;
|
||||
seen_escape_digits = 0;
|
||||
expected_escape_digits = 4;
|
||||
},
|
||||
'U' => {
|
||||
state = State.CharLiteralHexEscape;
|
||||
seen_escape_digits = 0;
|
||||
expected_escape_digits = 6;
|
||||
state = State.CharLiteralUnicodeEscapeSawU;
|
||||
},
|
||||
else => {
|
||||
state = State.CharLiteralEnd;
|
||||
@ -679,9 +673,9 @@ pub const Tokenizer = struct {
|
||||
},
|
||||
|
||||
State.CharLiteralHexEscape => switch (c) {
|
||||
'0'...'9', 'a'...'z', 'A'...'F' => {
|
||||
'0'...'9', 'a'...'f', 'A'...'F' => {
|
||||
seen_escape_digits += 1;
|
||||
if (seen_escape_digits == expected_escape_digits) {
|
||||
if (seen_escape_digits == 2) {
|
||||
state = State.CharLiteralEnd;
|
||||
}
|
||||
},
|
||||
@ -691,6 +685,43 @@ pub const Tokenizer = struct {
|
||||
},
|
||||
},
|
||||
|
||||
State.CharLiteralUnicodeEscapeSawU => switch (c) {
|
||||
'{' => {
|
||||
state = State.CharLiteralUnicodeEscape;
|
||||
seen_escape_digits = 0;
|
||||
},
|
||||
else => {
|
||||
result.id = Token.Id.Invalid;
|
||||
state = State.CharLiteralUnicodeInvalid;
|
||||
},
|
||||
},
|
||||
|
||||
State.CharLiteralUnicodeEscape => switch (c) {
|
||||
'0'...'9', 'a'...'f', 'A'...'F' => {
|
||||
seen_escape_digits += 1;
|
||||
},
|
||||
'}' => {
|
||||
if (seen_escape_digits == 0) {
|
||||
result.id = Token.Id.Invalid;
|
||||
state = State.CharLiteralUnicodeInvalid;
|
||||
} else {
|
||||
state = State.CharLiteralEnd;
|
||||
}
|
||||
},
|
||||
else => {
|
||||
result.id = Token.Id.Invalid;
|
||||
state = State.CharLiteralUnicodeInvalid;
|
||||
},
|
||||
},
|
||||
|
||||
State.CharLiteralUnicodeInvalid => switch (c) {
|
||||
// Keep consuming characters until an obvious stopping point.
|
||||
// This consolidates e.g. `u{0ab1Q}` into a single invalid token
|
||||
// instead of creating the tokens `u{0ab1`, `Q`, `}`
|
||||
'0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
|
||||
else => break,
|
||||
},
|
||||
|
||||
State.CharLiteralEnd => switch (c) {
|
||||
'\'' => {
|
||||
result.id = Token.Id.CharLiteral;
|
||||
@ -1052,6 +1083,9 @@ pub const Tokenizer = struct {
|
||||
State.CharLiteral,
|
||||
State.CharLiteralBackslash,
|
||||
State.CharLiteralHexEscape,
|
||||
State.CharLiteralUnicodeEscapeSawU,
|
||||
State.CharLiteralUnicodeEscape,
|
||||
State.CharLiteralUnicodeInvalid,
|
||||
State.CharLiteralEnd,
|
||||
State.StringLiteralBackslash,
|
||||
State.LBracketStar,
|
||||
@ -1205,7 +1239,60 @@ test "tokenizer - unknown length pointer and then c pointer" {
|
||||
test "tokenizer - char literal with hex escape" {
|
||||
testTokenize(
|
||||
\\'\x1b'
|
||||
, [_]Token.Id{Token.Id.CharLiteral});
|
||||
, [_]Token.Id{.CharLiteral});
|
||||
testTokenize(
|
||||
\\'\x1'
|
||||
, [_]Token.Id{ .Invalid, .Invalid });
|
||||
}
|
||||
|
||||
test "tokenizer - char literal with unicode escapes" {
|
||||
// Valid unicode escapes
|
||||
testTokenize(
|
||||
\\'\u{3}'
|
||||
, [_]Token.Id{.CharLiteral});
|
||||
testTokenize(
|
||||
\\'\u{01}'
|
||||
, [_]Token.Id{.CharLiteral});
|
||||
testTokenize(
|
||||
\\'\u{2a}'
|
||||
, [_]Token.Id{.CharLiteral});
|
||||
testTokenize(
|
||||
\\'\u{3f9}'
|
||||
, [_]Token.Id{.CharLiteral});
|
||||
testTokenize(
|
||||
\\'\u{6E09aBc1523}'
|
||||
, [_]Token.Id{.CharLiteral});
|
||||
testTokenize(
|
||||
\\"\u{440}"
|
||||
, [_]Token.Id{.StringLiteral});
|
||||
|
||||
// Invalid unicode escapes
|
||||
testTokenize(
|
||||
\\'\u'
|
||||
, [_]Token.Id{.Invalid});
|
||||
testTokenize(
|
||||
\\'\u{{'
|
||||
, [_]Token.Id{ .Invalid, .Invalid });
|
||||
testTokenize(
|
||||
\\'\u{}'
|
||||
, [_]Token.Id{ .Invalid, .Invalid });
|
||||
testTokenize(
|
||||
\\'\u{s}'
|
||||
, [_]Token.Id{ .Invalid, .Invalid });
|
||||
testTokenize(
|
||||
\\'\u{2z}'
|
||||
, [_]Token.Id{ .Invalid, .Invalid });
|
||||
testTokenize(
|
||||
\\'\u{4a'
|
||||
, [_]Token.Id{.Invalid});
|
||||
|
||||
// Test old-style unicode literals
|
||||
testTokenize(
|
||||
\\'\u0333'
|
||||
, [_]Token.Id{ .Invalid, .Invalid });
|
||||
testTokenize(
|
||||
\\'\U0333'
|
||||
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
|
||||
}
|
||||
|
||||
test "tokenizer - float literal e exponent" {
|
||||
|
||||
@ -5414,6 +5414,24 @@ pub fn addCases(cases: *tests.CompileErrorContext) void {
|
||||
"tmp.zig:1:17: error: invalid carriage return, only '\\n' line endings are supported",
|
||||
);
|
||||
|
||||
cases.add(
|
||||
"invalid legacy unicode escape",
|
||||
\\export fn entry() void {
|
||||
\\ const a = '\U1234';
|
||||
\\}
|
||||
,
|
||||
"tmp.zig:2:17: error: invalid character: 'U'",
|
||||
);
|
||||
|
||||
cases.add(
|
||||
"invalid empty unicode escape",
|
||||
\\export fn entry() void {
|
||||
\\ const a = '\u{}';
|
||||
\\}
|
||||
,
|
||||
"tmp.zig:2:19: error: empty unicode escape sequence",
|
||||
);
|
||||
|
||||
cases.add(
|
||||
"non-printable invalid character",
|
||||
"\xff\xfe" ++
|
||||
|
||||
@ -189,7 +189,7 @@ test "string escapes" {
|
||||
expect(mem.eql(u8, "\r", "\x0d"));
|
||||
expect(mem.eql(u8, "\t", "\x09"));
|
||||
expect(mem.eql(u8, "\\", "\x5c"));
|
||||
expect(mem.eql(u8, "\u1234\u0069", "\xe1\x88\xb4\x69"));
|
||||
expect(mem.eql(u8, "\u{1234}\u{069}\u{1}", "\xe1\x88\xb4\x69\x01"));
|
||||
}
|
||||
|
||||
test "multiline string" {
|
||||
@ -695,7 +695,7 @@ test "thread local variable" {
|
||||
}
|
||||
|
||||
test "unicode escape in character literal" {
|
||||
var a: u24 = '\U01f4a9';
|
||||
var a: u24 = '\u{01f4a9}';
|
||||
expect(a == 128169);
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user