From 62d717e2ffb1e9a1127652521de57c2e18cf7d3b Mon Sep 17 00:00:00 2001
From: r00ster91
Date: Wed, 13 Apr 2022 18:28:49 +0200
Subject: [PATCH] Add `std.unicode.replacement_character`

---
 lib/std/fmt.zig     |  6 +++---
 lib/std/unicode.zig | 10 +++++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index 3f230a445e..80a2e87c5c 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -966,10 +966,10 @@ pub fn formatUnicodeCodepoint(
     writer: anytype,
 ) !void {
     var buf: [4]u8 = undefined;
-    const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) {
+    const len = unicode.utf8Encode(c, &buf) catch |err| switch (err) {
         error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
-            // In case of error output the replacement char U+FFFD
-            return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer);
+            const len = unicode.utf8Encode(unicode.replacement_character, &buf) catch unreachable;
+            return formatBuf(buf[0..len], options, writer);
         },
     };
     return formatBuf(buf[0..len], options, writer);
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 487f3defdf..81a7ed838f 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -3,6 +3,11 @@ const assert = std.debug.assert;
 const testing = std.testing;
 const mem = std.mem;
 
+/// Use this to replace an unknown, unrecognized, or unrepresentable character.
+///
+/// See also: https://en.wikipedia.org/wiki/Specials_(Unicode_block)#Replacement_character
+pub const replacement_character: u21 = 0xFFFD;
+
 /// Returns how many bytes the UTF-8 representation would require
 /// for the given codepoint.
 pub fn utf8CodepointSequenceLength(c: u21) !u3 {
@@ -777,15 +782,14 @@ fn formatUtf16le(
     options: std.fmt.FormatOptions,
     writer: anytype,
 ) !void {
-    const unknown_codepoint = 0xfffd;
     _ = fmt;
     _ = options;
     var buf: [300]u8 = undefined; // just a random size I chose
     var it = Utf16LeIterator.init(utf16le);
     var u8len: usize = 0;
-    while (it.nextCodepoint() catch unknown_codepoint) |codepoint| {
+    while (it.nextCodepoint() catch replacement_character) |codepoint| {
         u8len += utf8Encode(codepoint, buf[u8len..]) catch
-            utf8Encode(unknown_codepoint, buf[u8len..]) catch unreachable;
+            utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
         if (u8len + 3 >= buf.len) {
             try writer.writeAll(buf[0..u8len]);
             u8len = 0;