character literals: allow unicode escapes

Also make the documentation for character literals clearer. Closes #2089. See #2097.
2026-02-12 20:37:54 +00:00 · 2019-03-23 17:35:21 -04:00 · 2019-03-23 17:35:21 -04:00 · 89953ec83d
commit 89953ec83d
parent 55cb9ef138
7 changed files with 57 additions and 31 deletions
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@ -501,7 +501,16 @@ pub fn main() void {
      </div>
      {#see_also|Optionals|undefined#}
      {#header_close#}
-      {#header_open|String Literals#}
+      {#header_open|String Literals and Character Literals#}
+      <p>
+      String literals are UTF-8 encoded byte arrays.
+      </p>
+      <p>
+      Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
+      {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
+      and character literals. Once <a href="https://github.com/ziglang/zig/issues/2097">#2097</a> is implemented,
+      character literals will be allowed to contain a single UTF-8 encoded codepoint.
+      </p>
      {#code_begin|test#}
 const assert = @import("std").debug.assert;
 const mem = @import("std").mem;
@ -513,6 +522,7 @@ test "string literals" {
    assert(normal_bytes.len == 5);
    assert(normal_bytes[1] == 'e');
    assert('e' == '\x65');
+    assert('\U01f4a9' == 128169);
    assert(mem.eql(u8, "hello", "h\x65llo"));

    // A C string literal is a null terminated pointer.
@ -521,7 +531,7 @@ test "string literals" {
    assert(null_terminated_bytes[5] == 0);
 }
      {#code_end#}
-      {#see_also|Arrays|Zig Test#}
+      {#see_also|Arrays|Zig Test|Source Encoding#}
      {#header_open|Escape Sequences#}
      <div class="table-wrapper">
      <table>
@ -8530,7 +8540,7 @@ pub fn main() void {
    );
 }
      {#code_end#}
-      {#see_also|String Literals#}
+      {#see_also|String Literals and Character Literals#}
      {#header_close#}

      {#header_open|Import from C Header File#}
--- a/src/all_types.hpp
+++ b/src/all_types.hpp
@ -845,7 +845,7 @@ struct AstNodeStringLiteral {
 };

 struct AstNodeCharLiteral {
-    uint8_t value;
+    uint32_t value;
 };

 struct AstNodeFloatLiteral {
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -1103,11 +1103,15 @@ void tokenize(Buf *buf, Tokenization *out) {

                    if (t.char_code_index >= t.char_code_end) {
                        if (t.unicode) {
-                            if (t.char_code <= 0x7f) {
+                            if (t.char_code > 0x10ffff) {
+                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
+                            }
+                            if (t.cur_tok->id == TokenIdCharLiteral) {
+                                t.cur_tok->data.char_lit.c = t.char_code;
+                                t.state = TokenizeStateCharLiteralEnd;
+                            } else if (t.char_code <= 0x7f) {
                                // 00000000 00000000 00000000 0xxxxxxx
                                handle_string_escape(&t, (uint8_t)t.char_code);
-                            } else if (t.cur_tok->id == TokenIdCharLiteral) {
-                                tokenize_error(&t, "unicode value too large for character literal: %x", t.char_code);
                            } else if (t.char_code <= 0x7ff) {
                                // 00000000 00000000 00000xxx xx000000
                                handle_string_escape(&t, (uint8_t)(0xc0 | (t.char_code >> 6)));
@ -1129,14 +1133,9 @@ void tokenize(Buf *buf, Tokenization *out) {
                                handle_string_escape(&t, (uint8_t)(0x80 | ((t.char_code >> 6) & 0x3f)));
                                // 00000000 00000000 00000000 00xxxxxx
                                handle_string_escape(&t, (uint8_t)(0x80 | (t.char_code & 0x3f)));
-                            } else {
-                                tokenize_error(&t, "unicode value out of range: %x", t.char_code);
                            }
                        } else {
-                            if (t.cur_tok->id == TokenIdCharLiteral && t.char_code > UINT8_MAX) {
-                                tokenize_error(&t, "value too large for character literal: '%x'",
-                                        t.char_code);
-                            }
+                            assert(t.char_code <= 255);
                            handle_string_escape(&t, (uint8_t)t.char_code);
                        }
                    }
--- a/src/tokenizer.hpp
+++ b/src/tokenizer.hpp
@ -148,7 +148,7 @@ struct TokenStrLit {
 };

 struct TokenCharLit {
-    uint8_t c;
+    uint32_t c;
 };

 struct Token {
--- a/std/zig/parser_test.zig
+++ b/std/zig/parser_test.zig
@ -1,3 +1,10 @@
+test "zig fmt: character literal larger than u8" {
+    try testCanonical(
+        \\const x = '\U01f4a9';
+        \\
+    );
+}
+
 test "zig fmt: infix operator and then multiline string literal" {
    try testCanonical(
        \\const x = "" ++
--- a/std/zig/tokenizer.zig
+++ b/std/zig/tokenizer.zig
@ -236,8 +236,7 @@ pub const Tokenizer = struct {
        MultilineStringLiteralLine,
        CharLiteral,
        CharLiteralBackslash,
-        CharLiteralEscape1,
-        CharLiteralEscape2,
+        CharLiteralHexEscape,
        CharLiteralEnd,
        Backslash,
        Equal,
@ -293,6 +292,8 @@ pub const Tokenizer = struct {
            .start = self.index,
            .end = undefined,
        };
+        var seen_escape_digits: usize = undefined;
+        var expected_escape_digits: usize = undefined;
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
@ -658,26 +659,31 @@ pub const Tokenizer = struct {
                        break;
                    },
                    'x' => {
-                        state = State.CharLiteralEscape1;
+                        state = State.CharLiteralHexEscape;
+                        seen_escape_digits = 0;
+                        expected_escape_digits = 2;
+                    },
+                    'u' => {
+                        state = State.CharLiteralHexEscape;
+                        seen_escape_digits = 0;
+                        expected_escape_digits = 4;
+                    },
+                    'U' => {
+                        state = State.CharLiteralHexEscape;
+                        seen_escape_digits = 0;
+                        expected_escape_digits = 6;
                    },
                    else => {
                        state = State.CharLiteralEnd;
                    },
                },

-                State.CharLiteralEscape1 => switch (c) {
+                State.CharLiteralHexEscape => switch (c) {
                    '0'...'9', 'a'...'z', 'A'...'F' => {
-                        state = State.CharLiteralEscape2;
-                    },
-                    else => {
-                        result.id = Token.Id.Invalid;
-                        break;
-                    },
-                },
-
-                State.CharLiteralEscape2 => switch (c) {
-                    '0'...'9', 'a'...'z', 'A'...'F' => {
-                        state = State.CharLiteralEnd;
+                        seen_escape_digits += 1;
+                        if (seen_escape_digits == expected_escape_digits) {
+                            state = State.CharLiteralEnd;
+                        }
                    },
                    else => {
                        result.id = Token.Id.Invalid;
@ -1045,8 +1051,7 @@ pub const Tokenizer = struct {
                State.Backslash,
                State.CharLiteral,
                State.CharLiteralBackslash,
-                State.CharLiteralEscape1,
-                State.CharLiteralEscape2,
+                State.CharLiteralHexEscape,
                State.CharLiteralEnd,
                State.StringLiteralBackslash,
                State.LBracketStar,
--- a/test/stage1/behavior/misc.zig
+++ b/test/stage1/behavior/misc.zig
@ -699,3 +699,8 @@ test "thread local variable" {
    S.t += 1;
    expect(S.t == 1235);
 }
+
+test "unicode escape in character literal" {
+    var a: u24 = '\U01f4a9';
+    expect(a == 128169);
+}