unicode character literals

This commit is contained in:
Nick Erdmann 2019-10-06 19:52:35 +02:00
parent 571123465b
commit ae7392e504
No known key found for this signature in database
GPG Key ID: C174038EAF6578B2
4 changed files with 81 additions and 19 deletions

View File

@ -552,8 +552,7 @@ pub fn main() void {
<p>
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
character literals will be allowed to have a single UTF-8 encoded codepoint.
and character literals.
</p>
{#code_begin|test#}
const assert = @import("std").debug.assert;
@ -567,6 +566,7 @@ test "string literals" {
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\u{1f4a9}' == 128169);
assert('💯' == 128175);
assert(mem.eql(u8, "hello", "h\x65llo"));
// A C string literal is a null terminated pointer.

View File

@ -371,6 +371,7 @@ pub const Tokenizer = struct {
CharLiteralUnicodeEscapeSawU,
CharLiteralUnicodeEscape,
CharLiteralUnicodeInvalid,
CharLiteralUnicode,
CharLiteralEnd,
Backslash,
Equal,
@ -427,6 +428,7 @@ pub const Tokenizer = struct {
.end = undefined,
};
var seen_escape_digits: usize = undefined;
var remaining_code_units: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@ -774,16 +776,23 @@ pub const Tokenizer = struct {
'\\' => {
state = State.CharLiteralBackslash;
},
'\'' => {
'\'', 0x80...0xbf, 0xf8...0xff => {
result.id = Token.Id.Invalid;
break;
},
0xc0...0xdf => { // 110xxxxx
remaining_code_units = 1;
state = State.CharLiteralUnicode;
},
0xe0...0xef => { // 1110xxxx
remaining_code_units = 2;
state = State.CharLiteralUnicode;
},
0xf0...0xf7 => { // 11110xxx
remaining_code_units = 3;
state = State.CharLiteralUnicode;
},
else => {
if (c < 0x20 or c == 0x7f) {
result.id = Token.Id.Invalid;
break;
}
state = State.CharLiteralEnd;
},
},
@ -867,6 +876,19 @@ pub const Tokenizer = struct {
},
},
State.CharLiteralUnicode => {
    // In the middle of a raw multi-byte UTF-8 character literal: each
    // following byte must be a continuation byte (10xxxxxx).
    const is_continuation = c >= 0x80 and c <= 0xbf;
    if (!is_continuation) {
        result.id = Token.Id.Invalid;
        break;
    }
    // One more continuation byte consumed; once none remain, the only
    // thing left to see is the closing quote.
    remaining_code_units -= 1;
    if (remaining_code_units == 0) {
        state = State.CharLiteralEnd;
    }
},
State.MultilineStringLiteralLine => switch (c) {
'\n' => {
self.index += 1;
@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
State.CharLiteralUnicodeEscape,
State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd,
State.CharLiteralUnicode,
State.StringLiteralBackslash,
State.LBracketStar,
State.LBracketStarC,
@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
}
test "tokenizer - char literal with unicode code point" {
    // A raw (unescaped) multi-byte UTF-8 code point between single quotes
    // is expected to come out as exactly one character-literal token.
    testTokenize("'💩'", [_]Token.Id{Token.Id.CharLiteral});
}
test "tokenizer - float literal e exponent" {
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
Token.Id.Identifier,

View File

@ -193,6 +193,7 @@ enum TokenizeState {
TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd,
TokenizeStateCharLiteralUnicode,
TokenizeStateSawStar,
TokenizeStateSawStarPercent,
TokenizeStateSawSlash,
@ -247,6 +248,7 @@ struct Tokenize {
int exponent_in_bin_or_dec;
BigInt specified_exponent;
BigInt significand;
size_t remaining_code_units;
};
ATTRIBUTE_PRINTF(2, 3)
@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
}
break;
case TokenizeStateCharLiteral:
switch (c) {
case '\'':
if (c == '\'') {
tokenize_error(&t, "expected character");
break;
case '\\':
} else if (c == '\\') {
t.state = TokenizeStateStringEscape;
break;
default:
} else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
// 10xxxxxx
// 11111xxx
invalid_char_error(&t, c);
} else if (c >= 0xc0 && c <= 0xdf) {
// 110xxxxx
t.cur_tok->data.char_lit.c = c & 0x1f;
t.remaining_code_units = 1;
t.state = TokenizeStateCharLiteralUnicode;
} else if (c >= 0xe0 && c <= 0xef) {
// 1110xxxx
t.cur_tok->data.char_lit.c = c & 0x0f;
t.remaining_code_units = 2;
t.state = TokenizeStateCharLiteralUnicode;
} else if (c >= 0xf0 && c <= 0xf7) {
// 11110xxx
t.cur_tok->data.char_lit.c = c & 0x07;
t.remaining_code_units = 3;
t.state = TokenizeStateCharLiteralUnicode;
} else {
t.cur_tok->data.char_lit.c = c;
t.state = TokenizeStateCharLiteralEnd;
break;
}
break;
case TokenizeStateCharLiteralEnd:
@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c);
}
break;
case TokenizeStateCharLiteralUnicode:
    // Consume one byte following the lead byte of a multi-byte UTF-8
    // character literal. Only continuation bytes (10xxxxxx, i.e.
    // 0x80..0xbf) are acceptable here.
    if (c <= 0x7f || c >= 0xc0) {
        invalid_char_error(&t, c);
        // Stop here: the invalid byte must not be folded into the code
        // point, and the state must not advance to CharLiteralEnd as if
        // the sequence had been well-formed.
        break;
    }
    // Append the 6 payload bits of the continuation byte.
    t.cur_tok->data.char_lit.c <<= 6;
    t.cur_tok->data.char_lit.c += c & 0x3f;
    t.remaining_code_units--;
    if (t.remaining_code_units == 0) {
        // Whole sequence decoded; the closing quote is expected next.
        t.state = TokenizeStateCharLiteralEnd;
    }
    break;
case TokenizeStateZero:
switch (c) {
case 'b':
@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break;
case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEnd:
case TokenizeStateCharLiteralUnicode:
tokenize_error(&t, "unterminated character literal");
break;
case TokenizeStateSymbol:

View File

@ -699,6 +699,10 @@ test "unicode escape in character literal" {
expect(a == 128169);
}
test "unicode character in character literal" {
    // U+1F4A9 written directly as UTF-8 in the source, rather than via a
    // \u{...} escape.
    const pile_of_poo = '💩';
    expect(pile_of_poo == 0x1f4a9);
}
test "result location zero sized array inside struct field implicit cast to slice" {
const E = struct {
entries: []u32,