From 8b9434871ea437840d25f073b945466359f402f9 Mon Sep 17 00:00:00 2001
From: Josh Wolfe
The two arguments passed to the
- String literals are single-item constant {#link|Pointers#} to null-terminated UTF-8 encoded byte arrays.
+ String literals are single-item constant {#link|Pointers#} to null-terminated byte arrays.
The type of string literals encodes both the length, and the fact that they are null-terminated,
and thus they can be {#link|coerced|Type Coercion#} to both {#link|Slices#} and
{#link|Null-Terminated Pointers|Sentinel-Terminated Pointers#}.
Dereferencing string literals converts them to {#link|Arrays#}.
- Character literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
+ The encoding of a string in Zig is de facto assumed to be UTF-8.
+ Because Zig source code is {#link|UTF-8 encoded|Source Encoding#}, any non-ASCII bytes appearing within a string literal
+ in source code carry their UTF-8 meaning into the content of the string in the Zig program;
+ the bytes are not modified by the compiler.
+ However, it is possible to embed non-UTF-8 bytes into a string literal using
+ Unicode code point literals have type {#syntax#}comptime_int{#endsyntax#}, the same as
{#link|Integer Literals#}. All {#link|Escape Sequences#} are valid in both string literals
- and character literals.
+ and Unicode code point literals.
+
+ In many other programming languages, a Unicode code point literal is called a "character literal".
+ However, there is no precise technical definition of a "character"
+ in recent versions of the Unicode specification (as of Unicode 13.0).
+ In Zig, a Unicode code point literal corresponds to the Unicode definition of a code point.
stdout.print() function, "Hello, {s}!\n"
and .{"world"}, are evaluated at {#link|compile-time|comptime#}. The code sample is
- purposely written to show how to perform {#link|string|String Literals and Character Literals#}
+ purposely written to show how to perform {#link|string|String Literals and Unicode Code Point Literals#}
substitution in the print function. The curly-braces inside of the first argument
are substituted with the compile-time known value inside of the second argument
(known as an {#link|anonymous struct literal|Anonymous Struct Literals#}). The \n
@@ -682,18 +682,31 @@ pub fn main() void {
{#see_also|Optionals|undefined#}
{#header_close#}
- {#header_open|String Literals and Character Literals#}
+ {#header_open|String Literals and Unicode Code Point Literals#}
\xNN notation.
+
- \xNNhexadecimal 8-bit character code (2 digits)
+ hexadecimal 8-bit byte value (2 digits)
@@ -7414,7 +7428,7 @@ test "main" {
This function returns a compile time constant pointer to null-terminated,
fixed-size array with length equal to the byte count of the file given by
{#syntax#}path{#endsyntax#}. The contents of the array are the contents of the file.
- This is equivalent to a {#link|string literal|String Literals and Character Literals#}
+ This is equivalent to a {#link|string literal|String Literals and Unicode Code Point Literals#}
with the file contents.
- \u{NNNNNN}hexadecimal Unicode character code UTF-8 encoded (1 or more digits)
+ hexadecimal Unicode code point UTF-8 encoded (1 or more digits)
diff --git a/lib/std/zig/parser_test.zig b/lib/std/zig/parser_test.zig index 822e9006c4..505e900c64 100644 --- a/lib/std/zig/parser_test.zig +++ b/lib/std/zig/parser_test.zig @@ -680,7 +680,7 @@ test "zig fmt: enum literal inside array literal" { ); } -test "zig fmt: character literal larger than u8" { +test "zig fmt: Unicode code point literal larger than u8" { try testCanonical( \\const x = '\u{01f4a9}'; \\ diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index dcbf717638..083f942db6 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -1513,7 +1513,7 @@ test "tokenizer - unknown length pointer and then c pointer" { }); } -test "tokenizer - char literal with hex escape" { +test "tokenizer - code point literal with hex escape" { testTokenize( \\'\x1b' , &[_]Token.Id{.CharLiteral}); @@ -1522,7 +1522,7 @@ test "tokenizer - char literal with hex escape" { , &[_]Token.Id{ .Invalid, .Invalid }); } -test "tokenizer - char literal with unicode escapes" { +test "tokenizer - code point literal with unicode escapes" { // Valid unicode escapes testTokenize( \\'\u{3}' @@ -1572,7 +1572,7 @@ test "tokenizer - char literal with unicode escapes" { , &[_]Token.Id{ .Invalid, .IntegerLiteral, .Invalid }); } -test "tokenizer - char literal with unicode code point" { +test "tokenizer - code point literal with unicode code point" { testTokenize( \\'💩' , &[_]Token.Id{.CharLiteral}); diff --git a/src/stage1/tokenizer.cpp b/src/stage1/tokenizer.cpp index 1d25bca17b..623169a313 100644 --- a/src/stage1/tokenizer.cpp +++ b/src/stage1/tokenizer.cpp @@ -1447,7 +1447,7 @@ void tokenize(Buf *buf, Tokenization *out) { tokenize_error(&t, "unterminated string"); break; } else if (t.cur_tok->id == TokenIdCharLiteral) { - tokenize_error(&t, "unterminated character literal"); + tokenize_error(&t, "unterminated Unicode code point literal"); break; } else { zig_unreachable(); @@ -1456,7 +1456,7 @@ void tokenize(Buf *buf, Tokenization *out) { case 
TokenizeStateCharLiteral: case TokenizeStateCharLiteralEnd: case TokenizeStateCharLiteralUnicode: - tokenize_error(&t, "unterminated character literal"); + tokenize_error(&t, "unterminated Unicode code point literal"); break; case TokenizeStateSymbol: case TokenizeStateZero: