character literals: allow unicode escapes

Also make the documentation for character literals clearer.
closes #2089

see #2097
This commit is contained in:
Andrew Kelley 2019-03-23 17:35:21 -04:00
parent 55cb9ef138
commit 89953ec83d
No known key found for this signature in database
GPG Key ID: 7C5F548F728501A9
7 changed files with 57 additions and 31 deletions

View File

@ -501,7 +501,16 @@ pub fn main() void {
</div>
{#see_also|Optionals|undefined#}
{#header_close#}
{#header_open|String Literals#}
{#header_open|String Literals and Character Literals#}
<p>
String literals are UTF-8 encoded byte arrays.
</p>
<p>
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
a character literal will also be allowed to contain a single UTF-8 encoded codepoint
written directly, rather than as an escape sequence.
</p>
{#code_begin|test#}
const assert = @import("std").debug.assert;
const mem = @import("std").mem;
@ -513,6 +522,7 @@ test "string literals" {
assert(normal_bytes.len == 5);
assert(normal_bytes[1] == 'e');
assert('e' == '\x65');
assert('\U01f4a9' == 128169);
assert(mem.eql(u8, "hello", "h\x65llo"));
// A C string literal is a null terminated pointer.
@ -521,7 +531,7 @@ test "string literals" {
assert(null_terminated_bytes[5] == 0);
}
{#code_end#}
{#see_also|Arrays|Zig Test#}
{#see_also|Arrays|Zig Test|Source Encoding#}
{#header_open|Escape Sequences#}
<div class="table-wrapper">
<table>
@ -8530,7 +8540,7 @@ pub fn main() void {
);
}
{#code_end#}
{#see_also|String Literals#}
{#see_also|String Literals and Character Literals#}
{#header_close#}
{#header_open|Import from C Header File#}

View File

@ -845,7 +845,7 @@ struct AstNodeStringLiteral {
};
struct AstNodeCharLiteral {
uint8_t value;
uint32_t value;
};
struct AstNodeFloatLiteral {

View File

@ -1103,11 +1103,15 @@ void tokenize(Buf *buf, Tokenization *out) {
if (t.char_code_index >= t.char_code_end) {
if (t.unicode) {
if (t.char_code <= 0x7f) {
if (t.char_code > 0x10ffff) {
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
}
if (t.cur_tok->id == TokenIdCharLiteral) {
t.cur_tok->data.char_lit.c = t.char_code;
t.state = TokenizeStateCharLiteralEnd;
} else if (t.char_code <= 0x7f) {
// 00000000 00000000 00000000 0xxxxxxx
handle_string_escape(&t, (uint8_t)t.char_code);
} else if (t.cur_tok->id == TokenIdCharLiteral) {
tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code);
} else if (t.char_code <= 0x7ff) {
// 00000000 00000000 00000xxx xx000000
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
@ -1129,14 +1133,9 @@ void tokenize(Buf *buf, Tokenization *out) {
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
// 00000000 00000000 00000000 00xxxxxx
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
} else {
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
}
} else {
if (t.cur_tok->id == TokenIdCharLiteral && t.char_code > UINT8_MAX) {
tokenize_error(&t, "value too large for character literal: '%x'",
t.char_code);
}
assert(t.char_code <= 255);
handle_string_escape(&t, (uint8_t)t.char_code);
}
}

View File

@ -148,7 +148,7 @@ struct TokenStrLit {
};
struct TokenCharLit {
uint8_t c;
uint32_t c;
};
struct Token {

View File

@ -1,3 +1,10 @@
test "zig fmt: character literal larger than u8" {
    // A character literal whose \U escape (U+1F4A9) does not fit in a u8.
    // NOTE(review): testCanonical is defined elsewhere in this file; presumably
    // it checks that formatting this source reproduces it unchanged — confirm.
    const canonical_source =
        \\const x = '\U01f4a9';
        \\
    ;
    try testCanonical(canonical_source);
}
test "zig fmt: infix operator and then multiline string literal" {
try testCanonical(
\\const x = "" ++

View File

@ -236,8 +236,7 @@ pub const Tokenizer = struct {
MultilineStringLiteralLine,
CharLiteral,
CharLiteralBackslash,
CharLiteralEscape1,
CharLiteralEscape2,
CharLiteralHexEscape,
CharLiteralEnd,
Backslash,
Equal,
@ -293,6 +292,8 @@ pub const Tokenizer = struct {
.start = self.index,
.end = undefined,
};
var seen_escape_digits: usize = undefined;
var expected_escape_digits: usize = undefined;
while (self.index < self.buffer.len) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
@ -658,26 +659,31 @@ pub const Tokenizer = struct {
break;
},
'x' => {
state = State.CharLiteralEscape1;
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
expected_escape_digits = 2;
},
'u' => {
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
expected_escape_digits = 4;
},
'U' => {
state = State.CharLiteralHexEscape;
seen_escape_digits = 0;
expected_escape_digits = 6;
},
else => {
state = State.CharLiteralEnd;
},
},
State.CharLiteralEscape1 => switch (c) {
State.CharLiteralHexEscape => switch (c) {
'0'...'9', 'a'...'z', 'A'...'F' => {
state = State.CharLiteralEscape2;
},
else => {
result.id = Token.Id.Invalid;
break;
},
},
State.CharLiteralEscape2 => switch (c) {
'0'...'9', 'a'...'z', 'A'...'F' => {
state = State.CharLiteralEnd;
seen_escape_digits += 1;
if (seen_escape_digits == expected_escape_digits) {
state = State.CharLiteralEnd;
}
},
else => {
result.id = Token.Id.Invalid;
@ -1045,8 +1051,7 @@ pub const Tokenizer = struct {
State.Backslash,
State.CharLiteral,
State.CharLiteralBackslash,
State.CharLiteralEscape1,
State.CharLiteralEscape2,
State.CharLiteralHexEscape,
State.CharLiteralEnd,
State.StringLiteralBackslash,
State.LBracketStar,

View File

@ -699,3 +699,8 @@ test "thread local variable" {
S.t += 1;
expect(S.t == 1235);
}
test "unicode escape in character literal" {
    // U+1F4A9 does not fit in 8 bits, so the literal is stored in a u24.
    // Kept as `var` (not `const`), exactly as the original test declares it.
    var codepoint: u24 = '\U01f4a9';
    // 0x1f4a9 == 128169
    expect(codepoint == 0x1f4a9);
}