unicode character literals

2025-12-16 11:13:08 +00:00 · 2019-10-06 19:52:35 +02:00 · 2019-10-06 19:52:35 +02:00 · ae7392e504
commit ae7392e504
parent 571123465b
4 changed files with 81 additions and 19 deletions
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@ -552,8 +552,7 @@ pub fn main() void {
      <p>
      Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
      {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
-      and character literals. Once https://github.com/ziglang/zig/issues/2097 is implemented,
-      character literals will be allowed to have a single UTF-8 encoded codepoint.
+      and character literals.
      </p>
      {#code_begin|test#}
 const assert = @import("std").debug.assert;
@ -567,6 +566,7 @@ test "string literals" {
    assert(normal_bytes[1] == 'e');
    assert('e' == '\x65');
    assert('\u{1f4a9}' == 128169);
+    assert('💯' == 128175);
    assert(mem.eql(u8, "hello", "h\x65llo"));

    // A C string literal is a null terminated pointer.
--- a/lib/std/zig/tokenizer.zig
+++ b/lib/std/zig/tokenizer.zig
@ -371,6 +371,7 @@ pub const Tokenizer = struct {
        CharLiteralUnicodeEscapeSawU,
        CharLiteralUnicodeEscape,
        CharLiteralUnicodeInvalid,
+        CharLiteralUnicode,
        CharLiteralEnd,
        Backslash,
        Equal,
@ -427,6 +428,7 @@ pub const Tokenizer = struct {
            .end = undefined,
        };
        var seen_escape_digits: usize = undefined;
+        var remaining_code_units: usize = undefined;
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
@ -774,16 +776,23 @@ pub const Tokenizer = struct {
                    '\\' => {
                        state = State.CharLiteralBackslash;
                    },
-                    '\'' => {
+                    '\'', 0x80...0xbf, 0xf8...0xff => {
                        result.id = Token.Id.Invalid;
                        break;
                    },
+                    0xc0...0xdf => { // 110xxxxx
+                        remaining_code_units = 1;
+                        state = State.CharLiteralUnicode;
+                    },
+                    0xe0...0xef => { // 1110xxxx
+                        remaining_code_units = 2;
+                        state = State.CharLiteralUnicode;
+                    },
+                    0xf0...0xf7 => { // 11110xxx
+                        remaining_code_units = 3;
+                        state = State.CharLiteralUnicode;
+                    },
                    else => {
-                        if (c < 0x20 or c == 0x7f) {
-                            result.id = Token.Id.Invalid;
-                            break;
-                        }
-
                        state = State.CharLiteralEnd;
                    },
                },
@ -867,6 +876,19 @@ pub const Tokenizer = struct {
                    },
                },

+                State.CharLiteralUnicode => {
+                    // Inside a multi-byte UTF-8 character literal: only continuation
+                    // bytes (10xxxxxx, i.e. 0x80...0xbf) may follow the lead byte.
+                    if (c < 0x80 or c > 0xbf) {
+                        result.id = Token.Id.Invalid;
+                        break;
+                    }
+                    remaining_code_units -= 1;
+                    if (remaining_code_units == 0) {
+                        // All expected continuation bytes were consumed.
+                        state = State.CharLiteralEnd;
+                    }
+                },
+
                State.MultilineStringLiteralLine => switch (c) {
                    '\n' => {
                        self.index += 1;
@ -1220,6 +1242,7 @@ pub const Tokenizer = struct {
                State.CharLiteralUnicodeEscape,
                State.CharLiteralUnicodeInvalid,
                State.CharLiteralEnd,
+                State.CharLiteralUnicode,
                State.StringLiteralBackslash,
                State.LBracketStar,
                State.LBracketStarC,
@ -1428,6 +1451,12 @@ test "tokenizer - char literal with unicode escapes" {
    , [_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
 }

+test "tokenizer - char literal with unicode code point" {
+    // A raw (unescaped) multi-byte UTF-8 code point between single quotes
+    // must be tokenized as exactly one character literal.
+    testTokenize("'💩'", [_]Token.Id{Token.Id.CharLiteral});
+}
+
 test "tokenizer - float literal e exponent" {
    testTokenize("a = 4.94065645841246544177e-324;\n", [_]Token.Id{
        Token.Id.Identifier,
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@ -193,6 +193,7 @@ enum TokenizeState {
    TokenizeStateStringEscapeUnicodeStart,
    TokenizeStateCharLiteral,
    TokenizeStateCharLiteralEnd,
+    TokenizeStateCharLiteralUnicode,
    TokenizeStateSawStar,
    TokenizeStateSawStarPercent,
    TokenizeStateSawSlash,
@ -247,6 +248,7 @@ struct Tokenize {
    int exponent_in_bin_or_dec;
    BigInt specified_exponent;
    BigInt significand;
+    size_t remaining_code_units;
 };

 ATTRIBUTE_PRINTF(2, 3)
@ -1176,17 +1178,32 @@ void tokenize(Buf *buf, Tokenization *out) {
                }
                break;
            case TokenizeStateCharLiteral:
-                switch (c) {
-                    case '\'':
+                if (c == '\'') {
                    tokenize_error(&t, "expected character");
-                        break;
-                    case '\\':
+                } else if (c == '\\') {
                    t.state = TokenizeStateStringEscape;
-                        break;
-                    default:
+                } else if ((c >= 0x80 && c <= 0xbf) || c >= 0xf8) {
+                    // 10xxxxxx
+                    // 11111xxx
+                    invalid_char_error(&t, c);
+                } else if (c >= 0xc0 && c <= 0xdf) {
+                    // 110xxxxx
+                    t.cur_tok->data.char_lit.c = c & 0x1f;
+                    t.remaining_code_units = 1;
+                    t.state = TokenizeStateCharLiteralUnicode;
+                } else if (c >= 0xe0 && c <= 0xef) {
+                    // 1110xxxx
+                    t.cur_tok->data.char_lit.c = c & 0x0f;
+                    t.remaining_code_units = 2;
+                    t.state = TokenizeStateCharLiteralUnicode;
+                } else if (c >= 0xf0 && c <= 0xf7) {
+                    // 11110xxx
+                    t.cur_tok->data.char_lit.c = c & 0x07;
+                    t.remaining_code_units = 3;
+                    t.state = TokenizeStateCharLiteralUnicode;
+                } else {
                    t.cur_tok->data.char_lit.c = c;
                    t.state = TokenizeStateCharLiteralEnd;
-                        break;
                }
                break;
            case TokenizeStateCharLiteralEnd:
@ -1199,6 +1216,17 @@ void tokenize(Buf *buf, Tokenization *out) {
                        invalid_char_error(&t, c);
                }
                break;
+            case TokenizeStateCharLiteralUnicode:
+                // Continuation state of a multi-byte UTF-8 character literal. The lead byte
+                // stored its payload bits in char_lit.c and set remaining_code_units; each
+                // continuation byte (10xxxxxx) contributes its low six bits.
+                if (c <= 0x7f || c >= 0xc0) {
+                    // Not a continuation byte. Stop here so the bad byte is not folded into
+                    // the code point and so the assignment to TokenizeStateCharLiteralEnd
+                    // below cannot replace whatever state invalid_char_error records
+                    // (its definition is not visible here).
+                    invalid_char_error(&t, c);
+                    break;
+                }
+                t.cur_tok->data.char_lit.c <<= 6;
+                t.cur_tok->data.char_lit.c += c & 0x3f;
+                t.remaining_code_units--;
+                if (t.remaining_code_units == 0) {
+                    // The whole sequence was consumed; the closing quote is expected next.
+                    t.state = TokenizeStateCharLiteralEnd;
+                }
+                break;
            case TokenizeStateZero:
                switch (c) {
                    case 'b':
@ -1434,6 +1462,7 @@ void tokenize(Buf *buf, Tokenization *out) {
            break;
        case TokenizeStateCharLiteral:
        case TokenizeStateCharLiteralEnd:
+        case TokenizeStateCharLiteralUnicode:
            tokenize_error(&t, "unterminated character literal");
            break;
        case TokenizeStateSymbol:
--- a/test/stage1/behavior/misc.zig
+++ b/test/stage1/behavior/misc.zig
@ -699,6 +699,10 @@ test "unicode escape in character literal" {
    expect(a == 128169);
 }

+test "unicode character in character literal" {
+    // A literal code point written directly in the source evaluates to its
+    // Unicode scalar value (U+1F4A9 == 128169).
+    const pile_of_poo = '💩';
+    expect(pile_of_poo == 128169);
+}
+
 test "result location zero sized array inside struct field implicit cast to slice" {
    const E = struct {
        entries: []u32,