unicode character literals

This commit is contained in:
Nick Erdmann 2019-10-06 19:52:35 +02:00
parent 571123465b
commit ae7392e504
No known key found for this signature in database
GPG Key ID: C174038EAF6578B2
4 changed files with 81 additions and 19 deletions

View File

@ -552,8 +552,7 @@ pub fn main() void {
<p> <p>
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented, and character literals.
character literals will be allowed to have a single UTF-8 encoded codepoint.
</p> </p>
{#code_begin|test#} {#code_begin|test#}
const assert = @import("std").debug.assert; const assert = @import("std").debug.assert;
@ -567,6 +566,7 @@ test "string literals" {
assert(normal_bytes[1] == 'e'); assert(normal_bytes[1] == 'e');
assert('e' == '\x65'); assert('e' == '\x65');
assert('\u{1f4a9}' == 128169); assert('\u{1f4a9}' == 128169);
assert('💯' == 128175);
assert(mem.eql(u8, "hello", "h\x65llo")); assert(mem.eql(u8, "hello", "h\x65llo"));
// A C string literal is a null terminated pointer. // A C string literal is a null terminated pointer.

View File

@ -371,6 +371,7 @@ pub const Tokenizer = struct {
CharLiteralUnicodeEscapeSawU, CharLiteralUnicodeEscapeSawU,
CharLiteralUnicodeEscape, CharLiteralUnicodeEscape,
CharLiteralUnicodeInvalid, CharLiteralUnicodeInvalid,
CharLiteralUnicode,
CharLiteralEnd, CharLiteralEnd,
Backslash, Backslash,
Equal, Equal,
@ -427,6 +428,7 @@ pub const Tokenizer = struct {
.end = undefined, .end = undefined,
}; };
var seen_escape_digits: usize = undefined; var seen_escape_digits: usize = undefined;
var remaining_code_units: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) { while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index]; const c = self.buffer[self.index];
switch (state) { switch (state) {
@ -774,16 +776,23 @@ pub const Tokenizer = struct {
'\\' => { '\\' => {
state = State.CharLiteralBackslash; state = State.CharLiteralBackslash;
}, },
'\'' => { '\'', 0x80...0xbf, 0xf8...0xff => {
result.id = Token.Id.Invalid; result.id = Token.Id.Invalid;
break; break;
}, },
0xc0...0xdf => { // 110xxxxx
remaining_code_units = 1;
state = State.CharLiteralUnicode;
},
0xe0...0xef => { // 1110xxxx
remaining_code_units = 2;
state = State.CharLiteralUnicode;
},
0xf0...0xf7 => { // 11110xxx
remaining_code_units = 3;
state = State.CharLiteralUnicode;
},
else => { else => {
if (c < 0x20 or c == 0x7f) {
result.id = Token.Id.Invalid;
break;
}
state = State.CharLiteralEnd; state = State.CharLiteralEnd;
}, },
}, },
@ -867,6 +876,19 @@ pub const Tokenizer = struct {
}, },
}, },
State.CharLiteralUnicode => {
    // Inside a multi-byte UTF-8 character literal: every byte after the
    // lead byte must be a continuation byte (10xxxxxx).
    const is_continuation_byte = c >= 0x80 and c <= 0xbf;
    if (!is_continuation_byte) {
        result.id = Token.Id.Invalid;
        break;
    }
    // One fewer continuation byte outstanding; the count was set when the
    // lead byte was seen.
    remaining_code_units -= 1;
    if (remaining_code_units == 0) {
        // The whole sequence has been consumed.
        state = State.CharLiteralEnd;
    }
},
State.MultilineStringLiteralLine => switch (c) { State.MultilineStringLiteralLine => switch (c) {
'\n' => { '\n' => {
self.index += 1; self.index += 1;
@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
State.CharLiteralUnicodeEscape, State.CharLiteralUnicodeEscape,
State.CharLiteralUnicodeInvalid, State.CharLiteralUnicodeInvalid,
State.CharLiteralEnd, State.CharLiteralEnd,
State.CharLiteralUnicode,
State.StringLiteralBackslash, State.StringLiteralBackslash,
State.LBracketStar, State.LBracketStar,
State.LBracketStarC, State.LBracketStarC,
@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
, [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
} }
// Exercises each multi-byte lead-byte class accepted inside a character
// literal: 2-byte (110xxxxx), 3-byte (1110xxxx) and 4-byte (11110xxx).
test "tokenizer - char literal with unicode code point" {
    // U+00E9, encoded as two bytes.
    testTokenize(
        \\'é'
    , [_]Token.Id{.CharLiteral});
    // U+20AC, encoded as three bytes.
    testTokenize(
        \\'€'
    , [_]Token.Id{.CharLiteral});
    // U+1F4A9, encoded as four bytes.
    testTokenize(
        \\'💩'
    , [_]Token.Id{.CharLiteral});
}
test "tokenizer - float literal e exponent" { test "tokenizer - float literal e exponent" {
testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{ testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
Token.Id.Identifier, Token.Id.Identifier,

View File

@ -193,6 +193,7 @@ enum TokenizeState {
TokenizeStateStringEscapeUnicodeStart, TokenizeStateStringEscapeUnicodeStart,
TokenizeStateCharLiteral, TokenizeStateCharLiteral,
TokenizeStateCharLiteralEnd, TokenizeStateCharLiteralEnd,
TokenizeStateCharLiteralUnicode,
TokenizeStateSawStar, TokenizeStateSawStar,
TokenizeStateSawStarPercent, TokenizeStateSawStarPercent,
TokenizeStateSawSlash, TokenizeStateSawSlash,
@ -247,6 +248,7 @@ struct Tokenize {
int exponent_in_bin_or_dec; int exponent_in_bin_or_dec;
BigInt specified_exponent; BigInt specified_exponent;
BigInt significand; BigInt significand;
size_t remaining_code_units;
}; };
ATTRIBUTE_PRINTF(2, 3) ATTRIBUTE_PRINTF(2, 3)
@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
} }
break; break;
case TokenizeStateCharLiteral: case TokenizeStateCharLiteral:
switch (c) { if (c == '\'') {
case '\'': tokenize_error(&t, "expected character");
tokenize_error(&t, "expected character"); } else if (c == '\\') {
break; t.state = TokenizeStateStringEscape;
case '\\': } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
t.state = TokenizeStateStringEscape; // 10xxxxxx
break; // 11111xxx
default: invalid_char_error(&t, c);
t.cur_tok->data.char_lit.c = c; } else if (c >= 0xc0 && c <= 0xdf) {
t.state = TokenizeStateCharLiteralEnd; // 110xxxxx
break; t.cur_tok->data.char_lit.c = c & 0x1f;
t.remaining_code_units = 1;
t.state = TokenizeStateCharLiteralUnicode;
} else if (c >= 0xe0 && c <= 0xef) {
// 1110xxxx
t.cur_tok->data.char_lit.c = c & 0x0f;
t.remaining_code_units = 2;
t.state = TokenizeStateCharLiteralUnicode;
} else if (c >= 0xf0 && c <= 0xf7) {
// 11110xxx
t.cur_tok->data.char_lit.c = c & 0x07;
t.remaining_code_units = 3;
t.state = TokenizeStateCharLiteralUnicode;
} else {
t.cur_tok->data.char_lit.c = c;
t.state = TokenizeStateCharLiteralEnd;
} }
break; break;
case TokenizeStateCharLiteralEnd: case TokenizeStateCharLiteralEnd:
@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
invalid_char_error(&t, c); invalid_char_error(&t, c);
} }
break; break;
case TokenizeStateCharLiteralUnicode:
if (c <= 0x7f || c >= 0xc0) {
invalid_char_error(&t, c);
}
t.cur_tok->data.char_lit.c <<= 6;
t.cur_tok->data.char_lit.c += c & 0x3f;
t.remaining_code_units--;
if (t.remaining_code_units == 0) {
t.state = TokenizeStateCharLiteralEnd;
}
break;
case TokenizeStateZero: case TokenizeStateZero:
switch (c) { switch (c) {
case 'b': case 'b':
@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
break; break;
case TokenizeStateCharLiteral: case TokenizeStateCharLiteral:
case TokenizeStateCharLiteralEnd: case TokenizeStateCharLiteralEnd:
case TokenizeStateCharLiteralUnicode:
tokenize_error(&t, "unterminated character literal"); tokenize_error(&t, "unterminated character literal");
break; break;
case TokenizeStateSymbol: case TokenizeStateSymbol:

View File

@ -699,6 +699,10 @@ test "unicode escape in character literal" {
expect(a == 128169); expect(a == 128169);
} }
// A character literal holding a raw UTF-8 encoded code point evaluates to
// that code point's integer value; one case per multi-byte encoding length.
test "unicode character in character literal" {
    // U+00E9, two-byte encoding.
    expect('é' == 0xe9);
    // U+20AC, three-byte encoding.
    expect('€' == 0x20ac);
    // U+1F4A9, four-byte encoding.
    expect('💩' == 128169);
}
test "result location zero sized array inside struct field implicit cast to slice" { test "result location zero sized array inside struct field implicit cast to slice" {
const E = struct { const E = struct {
entries: []u32, entries: []u32,