mirror of
https://github.com/ziglang/zig.git
synced 2025-12-16 11:13:08 +00:00
unicode character literals
This commit is contained in:
parent
571123465b
commit
ae7392e504
@ -552,8 +552,7 @@ pub fn main() void {
|
|||||||
<p>
|
<p>
|
||||||
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
|
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
|
||||||
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
|
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
|
||||||
and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
|
and character literals.
|
||||||
character literals will be allowed to have a single UTF-8 encoded codepoint.
|
|
||||||
</p>
|
</p>
|
||||||
{#code_begin|test#}
|
{#code_begin|test#}
|
||||||
const assert = @import("std").debug.assert;
|
const assert = @import("std").debug.assert;
|
||||||
@ -567,6 +566,7 @@ test "string literals" {
|
|||||||
assert(normal_bytes[1] == 'e');
|
assert(normal_bytes[1] == 'e');
|
||||||
assert('e' == '\x65');
|
assert('e' == '\x65');
|
||||||
assert('\u{1f4a9}' == 128169);
|
assert('\u{1f4a9}' == 128169);
|
||||||
|
assert('💯' == 128175);
|
||||||
assert(mem.eql(u8, "hello", "h\x65llo"));
|
assert(mem.eql(u8, "hello", "h\x65llo"));
|
||||||
|
|
||||||
// A C string literal is a null terminated pointer.
|
// A C string literal is a null terminated pointer.
|
||||||
|
|||||||
@ -371,6 +371,7 @@ pub const Tokenizer = struct {
|
|||||||
CharLiteralUnicodeEscapeSawU,
|
CharLiteralUnicodeEscapeSawU,
|
||||||
CharLiteralUnicodeEscape,
|
CharLiteralUnicodeEscape,
|
||||||
CharLiteralUnicodeInvalid,
|
CharLiteralUnicodeInvalid,
|
||||||
|
CharLiteralUnicode,
|
||||||
CharLiteralEnd,
|
CharLiteralEnd,
|
||||||
Backslash,
|
Backslash,
|
||||||
Equal,
|
Equal,
|
||||||
@ -427,6 +428,7 @@ pub const Tokenizer = struct {
|
|||||||
.end = undefined,
|
.end = undefined,
|
||||||
};
|
};
|
||||||
var seen_escape_digits: usize = undefined;
|
var seen_escape_digits: usize = undefined;
|
||||||
|
var remaining_code_units: usize = undefined;
|
||||||
while (self.index < self.buffer.len) : (self.index += 1) {
|
while (self.index < self.buffer.len) : (self.index += 1) {
|
||||||
const c = self.buffer[self.index];
|
const c = self.buffer[self.index];
|
||||||
switch (state) {
|
switch (state) {
|
||||||
@ -774,16 +776,23 @@ pub const Tokenizer = struct {
|
|||||||
'\\' => {
|
'\\' => {
|
||||||
state = State.CharLiteralBackslash;
|
state = State.CharLiteralBackslash;
|
||||||
},
|
},
|
||||||
'\'' => {
|
'\'', 0x80...0xbf, 0xf8...0xff => {
|
||||||
result.id = Token.Id.Invalid;
|
result.id = Token.Id.Invalid;
|
||||||
break;
|
break;
|
||||||
},
|
},
|
||||||
|
0xc0...0xdf => { // 110xxxxx
|
||||||
|
remaining_code_units = 1;
|
||||||
|
state = State.CharLiteralUnicode;
|
||||||
|
},
|
||||||
|
0xe0...0xef => { // 1110xxxx
|
||||||
|
remaining_code_units = 2;
|
||||||
|
state = State.CharLiteralUnicode;
|
||||||
|
},
|
||||||
|
0xf0...0xf7 => { // 11110xxx
|
||||||
|
remaining_code_units = 3;
|
||||||
|
state = State.CharLiteralUnicode;
|
||||||
|
},
|
||||||
else => {
|
else => {
|
||||||
if (c < 0x20 or c == 0x7f) {
|
|
||||||
result.id = Token.Id.Invalid;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
state = State.CharLiteralEnd;
|
state = State.CharLiteralEnd;
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -867,6 +876,19 @@ pub const Tokenizer = struct {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
State.CharLiteralUnicode => switch (c) {
|
||||||
|
0x80...0xbf => {
|
||||||
|
remaining_code_units -= 1;
|
||||||
|
if (remaining_code_units == 0) {
|
||||||
|
state = State.CharLiteralEnd;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
else => {
|
||||||
|
result.id = Token.Id.Invalid;
|
||||||
|
break;
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
State.MultilineStringLiteralLine => switch (c) {
|
State.MultilineStringLiteralLine => switch (c) {
|
||||||
'\n' => {
|
'\n' => {
|
||||||
self.index += 1;
|
self.index += 1;
|
||||||
@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
|
|||||||
State.CharLiteralUnicodeEscape,
|
State.CharLiteralUnicodeEscape,
|
||||||
State.CharLiteralUnicodeInvalid,
|
State.CharLiteralUnicodeInvalid,
|
||||||
State.CharLiteralEnd,
|
State.CharLiteralEnd,
|
||||||
|
State.CharLiteralUnicode,
|
||||||
State.StringLiteralBackslash,
|
State.StringLiteralBackslash,
|
||||||
State.LBracketStar,
|
State.LBracketStar,
|
||||||
State.LBracketStarC,
|
State.LBracketStarC,
|
||||||
@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
|
|||||||
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
|
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that a character literal holding a raw (unescaped) multi-byte UTF-8
// code point is tokenized as one CharLiteral token rather than Invalid.
// The input U+1F4A9 is a four-byte UTF-8 sequence: one lead byte followed by
// three continuation bytes.
test "tokenizer - char literal with unicode code point" {
|
||||||
|
testTokenize(
|
||||||
|
\\'💩'
|
||||||
|
, [_]Token.Id{.CharLiteral});
|
||||||
|
}
|
||||||
|
|
||||||
test "tokenizer - float literal e exponent" {
|
test "tokenizer - float literal e exponent" {
|
||||||
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
|
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
|
||||||
Token.Id.Identifier,
|
Token.Id.Identifier,
|
||||||
|
|||||||
@ -193,6 +193,7 @@ enum TokenizeState {
|
|||||||
TokenizeStateStringEscapeUnicodeStart,
|
TokenizeStateStringEscapeUnicodeStart,
|
||||||
TokenizeStateCharLiteral,
|
TokenizeStateCharLiteral,
|
||||||
TokenizeStateCharLiteralEnd,
|
TokenizeStateCharLiteralEnd,
|
||||||
|
TokenizeStateCharLiteralUnicode,
|
||||||
TokenizeStateSawStar,
|
TokenizeStateSawStar,
|
||||||
TokenizeStateSawStarPercent,
|
TokenizeStateSawStarPercent,
|
||||||
TokenizeStateSawSlash,
|
TokenizeStateSawSlash,
|
||||||
@ -247,6 +248,7 @@ struct Tokenize {
|
|||||||
int exponent_in_bin_or_dec;
|
int exponent_in_bin_or_dec;
|
||||||
BigInt specified_exponent;
|
BigInt specified_exponent;
|
||||||
BigInt significand;
|
BigInt significand;
|
||||||
|
size_t remaining_code_units;
|
||||||
};
|
};
|
||||||
|
|
||||||
ATTRIBUTE_PRINTF(2, 3)
|
ATTRIBUTE_PRINTF(2, 3)
|
||||||
@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case TokenizeStateCharLiteral:
|
case TokenizeStateCharLiteral:
|
||||||
switch (c) {
|
if (c == '\'') {
|
||||||
case '\'':
|
|
||||||
tokenize_error(&t, "expected character");
|
tokenize_error(&t, "expected character");
|
||||||
break;
|
} else if (c == '\\') {
|
||||||
case '\\':
|
|
||||||
t.state = TokenizeStateStringEscape;
|
t.state = TokenizeStateStringEscape;
|
||||||
break;
|
} else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
|
||||||
default:
|
// 10xxxxxx
|
||||||
|
// 11111xxx
|
||||||
|
invalid_char_error(&t, c);
|
||||||
|
} else if (c >= 0xc0 && c <= 0xdf) {
|
||||||
|
// 110xxxxx
|
||||||
|
t.cur_tok->data.char_lit.c = c & 0x1f;
|
||||||
|
t.remaining_code_units = 1;
|
||||||
|
t.state = TokenizeStateCharLiteralUnicode;
|
||||||
|
} else if (c >= 0xe0 && c <= 0xef) {
|
||||||
|
// 1110xxxx
|
||||||
|
t.cur_tok->data.char_lit.c = c & 0x0f;
|
||||||
|
t.remaining_code_units = 2;
|
||||||
|
t.state = TokenizeStateCharLiteralUnicode;
|
||||||
|
} else if (c >= 0xf0 && c <= 0xf7) {
|
||||||
|
// 11110xxx
|
||||||
|
t.cur_tok->data.char_lit.c = c & 0x07;
|
||||||
|
t.remaining_code_units = 3;
|
||||||
|
t.state = TokenizeStateCharLiteralUnicode;
|
||||||
|
} else {
|
||||||
t.cur_tok->data.char_lit.c = c;
|
t.cur_tok->data.char_lit.c = c;
|
||||||
t.state = TokenizeStateCharLiteralEnd;
|
t.state = TokenizeStateCharLiteralEnd;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case TokenizeStateCharLiteralEnd:
|
case TokenizeStateCharLiteralEnd:
|
||||||
@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||||||
invalid_char_error(&t, c);
|
invalid_char_error(&t, c);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case TokenizeStateCharLiteralUnicode:
|
||||||
|
if (c <= 0x7f || c >= 0xc0) {
|
||||||
|
invalid_char_error(&t, c);
|
||||||
|
}
|
||||||
|
t.cur_tok->data.char_lit.c <<= 6;
|
||||||
|
t.cur_tok->data.char_lit.c += c & 0x3f;
|
||||||
|
t.remaining_code_units--;
|
||||||
|
if (t.remaining_code_units == 0) {
|
||||||
|
t.state = TokenizeStateCharLiteralEnd;
|
||||||
|
}
|
||||||
|
break;
|
||||||
case TokenizeStateZero:
|
case TokenizeStateZero:
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 'b':
|
case 'b':
|
||||||
@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
|
|||||||
break;
|
break;
|
||||||
case TokenizeStateCharLiteral:
|
case TokenizeStateCharLiteral:
|
||||||
case TokenizeStateCharLiteralEnd:
|
case TokenizeStateCharLiteralEnd:
|
||||||
|
case TokenizeStateCharLiteralUnicode:
|
||||||
tokenize_error(&t, "unterminated character literal");
|
tokenize_error(&t, "unterminated character literal");
|
||||||
break;
|
break;
|
||||||
case TokenizeStateSymbol:
|
case TokenizeStateSymbol:
|
||||||
|
|||||||
@ -699,6 +699,10 @@ test "unicode escape in character literal" {
|
|||||||
expect(a == 128169);
|
expect(a == 128169);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A character literal written with a raw UTF-8 encoded code point compares
// equal to that code point's integer value: U+1F4A9 is 0x1f4a9 == 128169.
test "unicode character in character literal" {
|
||||||
|
expect('💩' == 128169);
|
||||||
|
}
|
||||||
|
|
||||||
test "result location zero sized array inside struct field implicit cast to slice" {
|
test "result location zero sized array inside struct field implicit cast to slice" {
|
||||||
const E = struct {
|
const E = struct {
|
||||||
entries: []u32,
|
entries: []u32,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user