unicode character literals

parent 571123465b
commit ae7392e504
@ -552,8 +552,7 @@ pub fn main() void {
      <p>
      Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
      {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
      and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
      character literals will be allowed to have a single UTF-8 encoded codepoint.
      and character literals.
      </p>
      {#code_begin|test#}
const assert = @import("std").debug.assert;
@ -567,6 +566,7 @@ test "string literals" {
    assert(normal_bytes[1] == 'e');
    assert('e' == '\x65');
    assert('\u{1f4a9}' == 128169);
    assert('💯' == 128175);
    assert(mem.eql(u8, "hello", "h\x65llo"));

    // A C string literal is a null terminated pointer.
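The new assert above relies on a character literal being nothing more than its Unicode codepoint value, with type comptime_int. A minimal standalone sketch of that behaviour (not part of this commit; names are illustrative):

const assert = @import("std").debug.assert;

test "character literals behave like integer literals" {
    // The codepoint value coerces into any integer type wide enough to hold it.
    const ascii: u8 = 'e';
    const emoji: u21 = '💯';
    assert(ascii == 0x65);
    assert(emoji == 128175);
}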
@ -371,6 +371,7 @@ pub const Tokenizer = struct {
        CharLiteralUnicodeEscapeSawU,
        CharLiteralUnicodeEscape,
        CharLiteralUnicodeInvalid,
        CharLiteralUnicode,
        CharLiteralEnd,
        Backslash,
        Equal,
@ -427,6 +428,7 @@ pub const Tokenizer = struct {
            .end = undefined,
        };
        var seen_escape_digits: usize = undefined;
        var remaining_code_units: usize = undefined;
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
@ -774,16 +776,23 @@ pub const Tokenizer = struct {
                    '\\' => {
                        state = State.CharLiteralBackslash;
                    },
                    '\'' => {
                    '\'', 0x80...0xbf, 0xf8...0xff => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                    0xc0...0xdf => { // 110xxxxx
                        remaining_code_units = 1;
                        state = State.CharLiteralUnicode;
                    },
                    0xe0...0xef => { // 1110xxxx
                        remaining_code_units = 2;
                        state = State.CharLiteralUnicode;
                    },
                    0xf0...0xf7 => { // 11110xxx
                        remaining_code_units = 3;
                        state = State.CharLiteralUnicode;
                    },
                    else => {
                        if (c < 0x20 or c == 0x7f) {
                            result.id = Token.Id.Invalid;
                            break;
                        }

                        state = State.CharLiteralEnd;
                    },
                },
@ -867,6 +876,19 @@ pub const Tokenizer = struct {
                    },
                },

                State.CharLiteralUnicode => switch (c) {
                    0x80...0xbf => {
                        remaining_code_units -= 1;
                        if (remaining_code_units == 0) {
                            state = State.CharLiteralEnd;
                        }
                    },
                    else => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
                },

                State.MultilineStringLiteralLine => switch (c) {
                    '\n' => {
                        self.index += 1;
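Taken together, the two states above implement the usual UTF-8 length rule: the lead byte announces how many 10xxxxxx continuation bytes follow, and CharLiteralUnicode counts them down via remaining_code_units. A standalone sketch of the same classification (hedged; the function name is illustrative, and std.unicode has a similar helper):

const assert = @import("std").debug.assert;

// Number of continuation bytes implied by a UTF-8 lead byte, using the same
// ranges as the tokenizer, or null if the byte cannot start a sequence.
fn continuationBytes(lead: u8) ?u3 {
    switch (lead) {
        0x00...0x7f => return 0, // plain ASCII
        0xc0...0xdf => return 1, // 110xxxxx
        0xe0...0xef => return 2, // 1110xxxx
        0xf0...0xf7 => return 3, // 11110xxx
        else => return null, // 10xxxxxx or 11111xxx: invalid lead byte
    }
}

test "UTF-8 lead byte classification" {
    assert(continuationBytes('a').? == 0);
    assert(continuationBytes(0xf0).? == 3); // first byte of '💩'
    assert(continuationBytes(0x80) == null); // lone continuation byte
}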
@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
            State.CharLiteralUnicodeEscape,
            State.CharLiteralUnicodeInvalid,
            State.CharLiteralEnd,
            State.CharLiteralUnicode,
            State.StringLiteralBackslash,
            State.LBracketStar,
            State.LBracketStarC,
@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
    , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}

test "tokenizer - char literal with unicode code point" {
    testTokenize(
        \\'💩'
    , [_]Token.Id{.CharLiteral});
}

test "tokenizer - float literal e exponent" {
    testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
        Token.Id.Identifier,
@ -193,6 +193,7 @@ enum TokenizeState {
    TokenizeStateStringEscapeUnicodeStart,
    TokenizeStateCharLiteral,
    TokenizeStateCharLiteralEnd,
    TokenizeStateCharLiteralUnicode,
    TokenizeStateSawStar,
    TokenizeStateSawStarPercent,
    TokenizeStateSawSlash,
@ -247,6 +248,7 @@ struct Tokenize {
    int exponent_in_bin_or_dec;
    BigInt specified_exponent;
    BigInt significand;
    size_t remaining_code_units;
};

ATTRIBUTE_PRINTF(2, 3)
@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
            }
            break;
        case TokenizeStateCharLiteral:
            switch (c) {
                case '\'':
                    tokenize_error(&t, "expected character");
                    break;
                case '\\':
                    t.state = TokenizeStateStringEscape;
                    break;
                default:
                    t.cur_tok->data.char_lit.c = c;
                    t.state = TokenizeStateCharLiteralEnd;
                    break;
            if (c == '\'') {
                tokenize_error(&t, "expected character");
            } else if (c == '\\') {
                t.state = TokenizeStateStringEscape;
            } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
                // 10xxxxxx
                // 11111xxx
                invalid_char_error(&t, c);
            } else if (c >= 0xc0 && c <= 0xdf) {
                // 110xxxxx
                t.cur_tok->data.char_lit.c = c & 0x1f;
                t.remaining_code_units = 1;
                t.state = TokenizeStateCharLiteralUnicode;
            } else if (c >= 0xe0 && c <= 0xef) {
                // 1110xxxx
                t.cur_tok->data.char_lit.c = c & 0x0f;
                t.remaining_code_units = 2;
                t.state = TokenizeStateCharLiteralUnicode;
            } else if (c >= 0xf0 && c <= 0xf7) {
                // 11110xxx
                t.cur_tok->data.char_lit.c = c & 0x07;
                t.remaining_code_units = 3;
                t.state = TokenizeStateCharLiteralUnicode;
            } else {
                t.cur_tok->data.char_lit.c = c;
                t.state = TokenizeStateCharLiteralEnd;
            }
            break;
        case TokenizeStateCharLiteralEnd:
@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
                invalid_char_error(&t, c);
            }
            break;
        case TokenizeStateCharLiteralUnicode:
            if (c <= 0x7f || c >= 0xc0) {
                invalid_char_error(&t, c);
            }
            t.cur_tok->data.char_lit.c <<= 6;
            t.cur_tok->data.char_lit.c += c & 0x3f;
            t.remaining_code_units--;
            if (t.remaining_code_units == 0) {
                t.state = TokenizeStateCharLiteralEnd;
            }
            break;
        case TokenizeStateZero:
            switch (c) {
                case 'b':
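The shift-and-mask accumulation above is plain UTF-8 decoding: the lead byte's masked low bits are extended by six bits per continuation byte. A hand-worked sketch for the bytes of '💩' (f0 9f 92 a9), written in Zig rather than the stage1 C++ but using the same arithmetic (not part of this commit):

const assert = @import("std").debug.assert;

test "reassemble the UTF-8 bytes of '💩'" {
    const bytes = [_]u8{ 0xf0, 0x9f, 0x92, 0xa9 };
    // An 11110xxx lead byte keeps its low 3 bits (& 0x07) ...
    var c: u32 = bytes[0] & 0x07;
    // ... and each 10xxxxxx continuation byte appends its low 6 bits (& 0x3f).
    for (bytes[1..]) |b| {
        c <<= 6;
        c |= b & 0x3f;
    }
    assert(c == 0x1f4a9); // U+1F4A9
    assert(c == 128169); // the decimal value used in the tests below
}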
@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
            break;
        case TokenizeStateCharLiteral:
        case TokenizeStateCharLiteralEnd:
        case TokenizeStateCharLiteralUnicode:
            tokenize_error(&t, "unterminated character literal");
            break;
        case TokenizeStateSymbol:
@ -699,6 +699,10 @@ test "unicode escape in character literal" {
    expect(a == 128169);
}

test "unicode character in character literal" {
    expect('💩' == 128169);
}

test "result location zero sized array inside struct field implicit cast to slice" {
    const E = struct {
        entries: []u32,
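Since the new literal form and the existing \u escape denote the same comptime_int codepoint, the two behavior tests above are checking the same value two ways; a brief sketch of that equivalence (not part of the commit):

const assert = @import("std").debug.assert;

test "escape and literal spell the same codepoint" {
    assert('\u{1f4a9}' == '💩');
    assert('\u{1f4af}' == '💯');
}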