mirror of
https://github.com/ziglang/zig.git
synced 2026-02-12 20:37:54 +00:00
character literals: allow unicode escapes
Also make the documentation for character literals clearer. Closes #2089. See #2097.
This commit is contained in:
parent
55cb9ef138
commit
89953ec83d
@ -501,7 +501,16 @@ pub fn main() void {
|
||||
</div>
|
||||
{#see_also|Optionals|undefined#}
|
||||
{#header_close#}
|
||||
{#header_open|String Literals#}
|
||||
{#header_open|String Literals and Character Literals#}
|
||||
<p>
|
||||
String literals are UTF-8 encoded byte arrays.
|
||||
</p>
|
||||
<p>
|
||||
Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
|
||||
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
|
||||
and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
|
||||
      character literals will be allowed to contain a single UTF-8 encoded codepoint.
|
||||
</p>
|
||||
{#code_begin|test#}
|
||||
const assert = @import("std").debug.assert;
|
||||
const mem = @import("std").mem;
|
||||
@ -513,6 +522,7 @@ test "string literals" {
|
||||
assert(normal_bytes.len == 5);
|
||||
assert(normal_bytes[1] == 'e');
|
||||
assert('e' == '\x65');
|
||||
assert('\U01f4a9' == 128169);
|
||||
assert(mem.eql(u8, "hello", "h\x65llo"));
|
||||
|
||||
// A C string literal is a null terminated pointer.
|
||||
@ -521,7 +531,7 @@ test "string literals" {
|
||||
assert(null_terminated_bytes[5] == 0);
|
||||
}
|
||||
{#code_end#}
|
||||
{#see_also|Arrays|Zig Test#}
|
||||
{#see_also|Arrays|Zig Test|Source Encoding#}
|
||||
{#header_open|Escape Sequences#}
|
||||
<div class="table-wrapper">
|
||||
<table>
|
||||
@ -8530,7 +8540,7 @@ pub fn main() void {
|
||||
);
|
||||
}
|
||||
{#code_end#}
|
||||
{#see_also|String Literals#}
|
||||
{#see_also|String Literals and Character Literals#}
|
||||
{#header_close#}
|
||||
|
||||
{#header_open|Import from C Header File#}
|
||||
|
||||
@ -845,7 +845,7 @@ struct AstNodeStringLiteral {
|
||||
};
|
||||
|
||||
struct AstNodeCharLiteral {
|
||||
uint8_t value;
|
||||
uint32_t value;
|
||||
};
|
||||
|
||||
struct AstNodeFloatLiteral {
|
||||
|
||||
@ -1103,11 +1103,15 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
|
||||
if (t.char_code_index >= t.char_code_end) {
|
||||
if (t.unicode) {
|
||||
if (t.char_code <= 0x7f) {
|
||||
if (t.char_code > 0x10ffff) {
|
||||
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
|
||||
}
|
||||
if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
t.cur_tok->data.char_lit.c = t.char_code;
|
||||
t.state = TokenizeStateCharLiteralEnd;
|
||||
} else if (t.char_code <= 0x7f) {
|
||||
// 00000000 00000000 00000000 0xxxxxxx
|
||||
handle_string_escape(&t, (uint8_t)t.char_code);
|
||||
} else if (t.cur_tok->id == TokenIdCharLiteral) {
|
||||
tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code);
|
||||
} else if (t.char_code <= 0x7ff) {
|
||||
// 00000000 00000000 00000xxx xx000000
|
||||
handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
|
||||
@ -1129,14 +1133,9 @@ void tokenize(Buf *buf, Tokenization *out) {
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
|
||||
// 00000000 00000000 00000000 00xxxxxx
|
||||
handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
|
||||
} else {
|
||||
tokenize_error(&t, "unicode value out of range: %x", t.char_code);
|
||||
}
|
||||
} else {
|
||||
if (t.cur_tok->id == TokenIdCharLiteral && t.char_code > UINT8_MAX) {
|
||||
tokenize_error(&t, "value too large for character literal: '%x'",
|
||||
t.char_code);
|
||||
}
|
||||
assert(t.char_code <= 255);
|
||||
handle_string_escape(&t, (uint8_t)t.char_code);
|
||||
}
|
||||
}
|
||||
|
||||
@ -148,7 +148,7 @@ struct TokenStrLit {
|
||||
};
|
||||
|
||||
struct TokenCharLit {
|
||||
uint8_t c;
|
||||
uint32_t c;
|
||||
};
|
||||
|
||||
struct Token {
|
||||
|
||||
@ -1,3 +1,10 @@
|
||||
test "zig fmt: character literal larger than u8" {
|
||||
try testCanonical(
|
||||
\\const x = '\U01f4a9';
|
||||
\\
|
||||
);
|
||||
}
|
||||
|
||||
test "zig fmt: infix operator and then multiline string literal" {
|
||||
try testCanonical(
|
||||
\\const x = "" ++
|
||||
|
||||
@ -236,8 +236,7 @@ pub const Tokenizer = struct {
|
||||
MultilineStringLiteralLine,
|
||||
CharLiteral,
|
||||
CharLiteralBackslash,
|
||||
CharLiteralEscape1,
|
||||
CharLiteralEscape2,
|
||||
CharLiteralHexEscape,
|
||||
CharLiteralEnd,
|
||||
Backslash,
|
||||
Equal,
|
||||
@ -293,6 +292,8 @@ pub const Tokenizer = struct {
|
||||
.start = self.index,
|
||||
.end = undefined,
|
||||
};
|
||||
var seen_escape_digits: usize = undefined;
|
||||
var expected_escape_digits: usize = undefined;
|
||||
while (self.index < self.buffer.len) : (self.index += 1) {
|
||||
const c = self.buffer[self.index];
|
||||
switch (state) {
|
||||
@ -658,26 +659,31 @@ pub const Tokenizer = struct {
|
||||
break;
|
||||
},
|
||||
'x' => {
|
||||
state = State.CharLiteralEscape1;
|
||||
state = State.CharLiteralHexEscape;
|
||||
seen_escape_digits = 0;
|
||||
expected_escape_digits = 2;
|
||||
},
|
||||
'u' => {
|
||||
state = State.CharLiteralHexEscape;
|
||||
seen_escape_digits = 0;
|
||||
expected_escape_digits = 4;
|
||||
},
|
||||
'U' => {
|
||||
state = State.CharLiteralHexEscape;
|
||||
seen_escape_digits = 0;
|
||||
expected_escape_digits = 6;
|
||||
},
|
||||
else => {
|
||||
state = State.CharLiteralEnd;
|
||||
},
|
||||
},
|
||||
|
||||
State.CharLiteralEscape1 => switch (c) {
|
||||
State.CharLiteralHexEscape => switch (c) {
|
||||
'0'...'9', 'a'...'z', 'A'...'F' => {
|
||||
state = State.CharLiteralEscape2;
|
||||
},
|
||||
else => {
|
||||
result.id = Token.Id.Invalid;
|
||||
break;
|
||||
},
|
||||
},
|
||||
|
||||
State.CharLiteralEscape2 => switch (c) {
|
||||
'0'...'9', 'a'...'z', 'A'...'F' => {
|
||||
state = State.CharLiteralEnd;
|
||||
seen_escape_digits += 1;
|
||||
if (seen_escape_digits == expected_escape_digits) {
|
||||
state = State.CharLiteralEnd;
|
||||
}
|
||||
},
|
||||
else => {
|
||||
result.id = Token.Id.Invalid;
|
||||
@ -1045,8 +1051,7 @@ pub const Tokenizer = struct {
|
||||
State.Backslash,
|
||||
State.CharLiteral,
|
||||
State.CharLiteralBackslash,
|
||||
State.CharLiteralEscape1,
|
||||
State.CharLiteralEscape2,
|
||||
State.CharLiteralHexEscape,
|
||||
State.CharLiteralEnd,
|
||||
State.StringLiteralBackslash,
|
||||
State.LBracketStar,
|
||||
|
||||
@ -699,3 +699,8 @@ test "thread local variable" {
|
||||
S.t += 1;
|
||||
expect(S.t == 1235);
|
||||
}
|
||||
|
||||
test "unicode escape in character literal" {
|
||||
var a: u24 = '\U01f4a9';
|
||||
expect(a == 128169);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user