diff --git a/doc/langref.html.in b/doc/langref.html.in index c42eeaf88b..0de6650c56 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -501,7 +501,16 @@ pub fn main() void { {#see_also|Optionals|undefined#} {#header_close#} - {#header_open|String Literals#} + {#header_open|String Literals and Character Literals#} +

+ String literals are UTF-8 encoded byte arrays. +

+

+ Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as + {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals + and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented, + a character literal will also be allowed to contain a single UTF-8 encoded codepoint. +

{#code_begin|test#} const assert = @import("std").debug.assert; const mem = @import("std").mem; @@ -513,6 +522,7 @@ test "string literals" { assert(normal_bytes.len == 5); assert(normal_bytes[1] == 'e'); assert('e' == '\x65'); + assert('\U01f4a9' == 128169); assert(mem.eql(u8, "hello", "h\x65llo")); // A C string literal is a null terminated pointer. @@ -521,7 +531,7 @@ test "string literals" { assert(null_terminated_bytes[5] == 0); } {#code_end#} - {#see_also|Arrays|Zig Test#} + {#see_also|Arrays|Zig Test|Source Encoding#} {#header_open|Escape Sequences#}
@@ -8530,7 +8540,7 @@ pub fn main() void { ); } {#code_end#} - {#see_also|String Literals#} + {#see_also|String Literals and Character Literals#} {#header_close#} {#header_open|Import from C Header File#} diff --git a/src/all_types.hpp b/src/all_types.hpp index bd4b802e73..b49b42d495 100644 --- a/src/all_types.hpp +++ b/src/all_types.hpp @@ -845,7 +845,7 @@ struct AstNodeStringLiteral { }; struct AstNodeCharLiteral { - uint8_t value; + uint32_t value; }; struct AstNodeFloatLiteral { diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index dc9d61aa22..7d41343e3a 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -1103,11 +1103,14 @@ void tokenize(Buf *buf, Tokenization *out) { if (t.char_code_index >= t.char_code_end) { if (t.unicode) { - if (t.char_code <= 0x7f) { + if (t.char_code > 0x10ffff) { + tokenize_error(&t, "unicode value out of range: %x", t.char_code); + } else if (t.cur_tok->id == TokenIdCharLiteral) { + t.cur_tok->data.char_lit.c = t.char_code; + t.state = TokenizeStateCharLiteralEnd; + } else if (t.char_code <= 0x7f) { // 00000000 00000000 00000000 0xxxxxxx handle_string_escape(&t, (uint8_t)t.char_code); - } else if (t.cur_tok->id == TokenIdCharLiteral) { - tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code); } else if (t.char_code <= 0x7ff) { // 00000000 00000000 00000xxx xx000000 handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6))); @@ -1129,14 +1132,9 @@ void tokenize(Buf *buf, Tokenization *out) { handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f))); // 00000000 00000000 00000000 00xxxxxx handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f))); - } else { - tokenize_error(&t, "unicode value out of range: %x", t.char_code); } } else { - if (t.cur_tok->id == TokenIdCharLiteral && t.char_code > UINT8_MAX) { - tokenize_error(&t, "value too large for character literal: '%x'", - t.char_code); - } + assert(t.char_code <= 255); handle_string_escape(&t, (uint8_t)t.char_code); } } 
diff --git a/src/tokenizer.hpp b/src/tokenizer.hpp index 62117b5779..13ab0352d9 100644 --- a/src/tokenizer.hpp +++ b/src/tokenizer.hpp @@ -148,7 +148,7 @@ struct TokenStrLit { }; struct TokenCharLit { - uint8_t c; + uint32_t c; }; struct Token { diff --git a/std/zig/parser_test.zig b/std/zig/parser_test.zig index 508813759c..8b9c0c2d64 100644 --- a/std/zig/parser_test.zig +++ b/std/zig/parser_test.zig @@ -1,3 +1,10 @@ +test "zig fmt: character literal larger than u8" { + try testCanonical( + \\const x = '\U01f4a9'; + \\ + ); +} + test "zig fmt: infix operator and then multiline string literal" { try testCanonical( \\const x = "" ++ diff --git a/std/zig/tokenizer.zig b/std/zig/tokenizer.zig index 2159371ccf..19d64514a1 100644 --- a/std/zig/tokenizer.zig +++ b/std/zig/tokenizer.zig @@ -236,8 +236,7 @@ pub const Tokenizer = struct { MultilineStringLiteralLine, CharLiteral, CharLiteralBackslash, - CharLiteralEscape1, - CharLiteralEscape2, + CharLiteralHexEscape, CharLiteralEnd, Backslash, Equal, @@ -293,6 +292,8 @@ pub const Tokenizer = struct { .start = self.index, .end = undefined, }; + var seen_escape_digits: usize = undefined; + var expected_escape_digits: usize = undefined; while (self.index < self.buffer.len) : (self.index += 1) { const c = self.buffer[self.index]; switch (state) { @@ -658,26 +659,31 @@ pub const Tokenizer = struct { break; }, 'x' => { - state = State.CharLiteralEscape1; + state = State.CharLiteralHexEscape; + seen_escape_digits = 0; + expected_escape_digits = 2; + }, + 'u' => { + state = State.CharLiteralHexEscape; + seen_escape_digits = 0; + expected_escape_digits = 4; + }, + 'U' => { + state = State.CharLiteralHexEscape; + seen_escape_digits = 0; + expected_escape_digits = 6; }, else => { state = State.CharLiteralEnd; }, }, - State.CharLiteralEscape1 => switch (c) { + State.CharLiteralHexEscape => switch (c) { - '0'...'9', 'a'...'z', 'A'...'F' => { + '0'...'9', 'a'...'f', 'A'...'F' => { - state = State.CharLiteralEscape2; - }, - else => { - result.id = Token.Id.Invalid; - break; - }, 
- }, - - State.CharLiteralEscape2 => switch (c) { - '0'...'9', 'a'...'z', 'A'...'F' => { - state = State.CharLiteralEnd; + seen_escape_digits += 1; + if (seen_escape_digits == expected_escape_digits) { + state = State.CharLiteralEnd; + } }, else => { result.id = Token.Id.Invalid; @@ -1045,8 +1051,7 @@ pub const Tokenizer = struct { State.Backslash, State.CharLiteral, State.CharLiteralBackslash, - State.CharLiteralEscape1, - State.CharLiteralEscape2, + State.CharLiteralHexEscape, State.CharLiteralEnd, State.StringLiteralBackslash, State.LBracketStar, diff --git a/test/stage1/behavior/misc.zig b/test/stage1/behavior/misc.zig index 36246162f5..fd407821e6 100644 --- a/test/stage1/behavior/misc.zig +++ b/test/stage1/behavior/misc.zig @@ -699,3 +699,8 @@ test "thread local variable" { S.t += 1; expect(S.t == 1235); } + +test "unicode escape in character literal" { + var a: u24 = '\U01f4a9'; + expect(a == 128169); +}