Avoid concept of a "Unicode character" in documentation and error messages (#8059)

2026-02-20 08:14:48 +00:00 · 2021-02-24 08:26:13 -05:00 · 2021-02-24 08:26:13 -05:00 · 8b9434871e
commit 8b9434871e
parent d9e46dceec
4 changed files with 28 additions and 14 deletions
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@ -310,7 +310,7 @@ pub fn main() !void {
      <p>
        The two arguments passed to the <code>stdout.print()</code> function, <code>"Hello, {s}!\n"</code>
        and <code>.{"world"}</code>, are evaluated at {#link|compile-time|comptime#}. The code sample is
-        purposely written to show how to perform {#link|string|String Literals and Character Literals#}
+        purposely written to show how to perform {#link|string|String Literals and Unicode Code Point Literals#}
        substitution in the <code>print</code> function. The curly-braces inside of the first argument
        are substituted with the compile-time known value inside of the second argument
        (known as an {#link|anonymous struct literal|Anonymous Struct Literals#}). The <code>\n</code>
@ -682,18 +682,31 @@ pub fn main() void {
      </div>
      {#see_also|Optionals|undefined#}
      {#header_close#}
-      {#header_open|String Literals and Character Literals#}
+      {#header_open|String Literals and Unicode Code Point Literals#}
      <p>
-      String literals are single-item constant {#link|Pointers#} to null-terminated UTF-8 encoded byte arrays.
+      String literals are single-item constant {#link|Pointers#} to null-terminated byte arrays.
      The type of string literals encodes both the length, and the fact that they are null-terminated,
      and thus they can be {#link|coerced|Type Coercion#} to both {#link|Slices#} and
      {#link|Null-Terminated Pointers|Sentinel-Terminated Pointers#}.
      Dereferencing string literals converts them to {#link|Arrays#}.
      </p>
      <p>
-      Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
+      By convention, the encoding of a string in Zig is assumed to be UTF-8.
+      Because Zig source code is {#link|UTF-8 encoded|Source Encoding#}, any non-ASCII bytes appearing within a string literal
+      in source code carry their UTF-8 meaning into the content of the string in the Zig program;
+      the bytes are not modified by the compiler.
+      However, it is possible to embed non-UTF-8 bytes into a string literal using <code>\xNN</code> notation.
+      </p>
+      <p>
+      Unicode code point literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
      {#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
-      and character literals.
+      and Unicode code point literals.
+      </p>
+      <p>
+      In many other programming languages, a Unicode code point literal is called a "character literal".
+      However, there is <a href="https://unicode.org/glossary">no precise technical definition of a "character"</a>
+      in recent versions of the Unicode specification (as of Unicode 13.0).
+      In Zig, a Unicode code point literal corresponds to the Unicode definition of a code point.
      </p>
      {#code_begin|test#}
 const expect = @import("std").testing.expect;
@ -709,6 +722,7 @@ test "string literals" {
    expect('\u{1f4a9}' == 128169);
    expect('💯' == 128175);
    expect(mem.eql(u8, "hello", "h\x65llo"));
+    expect("\xff"[0] == 0xff); // non-UTF-8 strings are possible with \xNN notation.
 }
      {#code_end#}
      {#see_also|Arrays|Zig Test|Source Encoding#}
@ -749,11 +763,11 @@ test "string literals" {
        </tr>
        <tr>
            <td><code>\xNN</code></td>
-          <td>hexadecimal 8-bit character code (2 digits)</td>
+          <td>hexadecimal 8-bit byte value (2 digits)</td>
        </tr>
        <tr>
            <td><code>\u{NNNNNN}</code></td>
-          <td>hexadecimal Unicode character code UTF-8 encoded (1 or more digits)</td>
+          <td>hexadecimal Unicode code point UTF-8 encoded (1 or more digits)</td>
        </tr>
      </table>
      </div>
@ -7414,7 +7428,7 @@ test "main" {
      This function returns a compile time constant pointer to null-terminated,
      fixed-size array with length equal to the byte count of the file given by
      {#syntax#}path{#endsyntax#}. The contents of the array are the contents of the file.
-      This is equivalent to a {#link|string literal|String Literals and Character Literals#}
+      This is equivalent to a {#link|string literal|String Literals and Unicode Code Point Literals#}
      with the file contents.
      </p>
      <p>
--- a/lib/std/zig/parser_test.zig
+++ b/lib/std/zig/parser_test.zig
@ -680,7 +680,7 @@ test "zig fmt: enum literal inside array literal" {
    );
 }

-test "zig fmt: character literal larger than u8" {
+test "zig fmt: Unicode code point literal larger than u8" {
    try testCanonical(
        \\const x = '\u{01f4a9}';
        \\
--- a/lib/std/zig/tokenizer.zig
+++ b/lib/std/zig/tokenizer.zig
@ -1513,7 +1513,7 @@ test "tokenizer - unknown length pointer and then c pointer" {
    });
 }

-test "tokenizer - char literal with hex escape" {
+test "tokenizer - code point literal with hex escape" {
    testTokenize(
        \\'\x1b'
    , &[_]Token.Id{.CharLiteral});
@ -1522,7 +1522,7 @@ test "tokenizer - char literal with hex escape" {
    , &[_]Token.Id{ .Invalid, .Invalid });
 }

-test "tokenizer - char literal with unicode escapes" {
+test "tokenizer - code point literal with unicode escapes" {
    // Valid unicode escapes
    testTokenize(
        \\'\u{3}'
@ -1572,7 +1572,7 @@ test "tokenizer - char literal with unicode escapes" {
    , &[_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid });
 }

-test "tokenizer - char literal with unicode code point" {
+test "tokenizer - code point literal with unicode code point" {
    testTokenize(
        \\'💩'
    , &[_]Token.Id{.CharLiteral});
--- a/src/stage1/tokenizer.cpp
+++ b/src/stage1/tokenizer.cpp
@ -1447,7 +1447,7 @@ void tokenize(Buf *buf, Tokenization *out) {
                tokenize_error(&t, "unterminated string");
                break;
            } else if (t.cur_tok->id == TokenIdCharLiteral) {
-                tokenize_error(&t, "unterminated character literal");
+                tokenize_error(&t, "unterminated Unicode code point literal");
                break;
            } else {
                zig_unreachable();
@ -1456,7 +1456,7 @@ void tokenize(Buf *buf, Tokenization *out) {
        case TokenizeStateCharLiteral:
        case TokenizeStateCharLiteralEnd:
        case TokenizeStateCharLiteralUnicode:
-            tokenize_error(&t, "unterminated character literal");
+            tokenize_error(&t, "unterminated Unicode code point literal");
            break;
        case TokenizeStateSymbol:
        case TokenizeStateZero: