From 405f4286f729f395ab1f9eb228fdf9cad9434b7c Mon Sep 17 00:00:00 2001 From: Jan Philipp Hafer Date: Tue, 17 May 2022 13:00:07 +0200 Subject: [PATCH] std.unicode: add utf16 byte length and codepoints counting routines * comptime and runtime tests are based on tests for counting utf8 code points --- lib/std/unicode.zig | 63 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 81a7ed838f..ae3403d11d 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -325,6 +325,42 @@ pub const Utf16LeIterator = struct { } }; +/// Returns the length of a supplied UTF-16 string literal in terms of unicode +/// codepoints. +pub fn utf16CountCodepoints(utf16le: []const u16) !usize { + var len: usize = 0; + var it = Utf16LeIterator.init(utf16le); + while (try it.nextCodepoint()) |_| len += 1; + return len; +} + +fn testUtf16CountCodepoints() !void { + try testing.expectEqual( + @as(usize, 1), + try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("a")), + ); + try testing.expectEqual( + @as(usize, 10), + try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("abcdefghij")), + ); + try testing.expectEqual( + @as(usize, 10), + try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("äåéëþüúíóö")), + ); + try testing.expectEqual( + @as(usize, 5), + try utf16CountCodepoints(utf8ToUtf16LeStringLiteral("こんにちは")), + ); + // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80")); +} + +test "utf16 count codepoints" { + try testUtf16CountCodepoints(); + // TODO stage1 error: out of bounds slice + if (@import("builtin").zig_backend != .stage1) + comptime try testUtf16CountCodepoints(); +} + test "utf8 encode" { comptime try testUtf8Encode(); try testUtf8Encode(); @@ -748,9 +784,9 @@ test "utf8ToUtf16LeWithNull" { } /// Converts a UTF-8 string literal into a UTF-16LE string literal. -pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8):0]u16 { +pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16LeLen(utf8) catch unreachable:0]u16 { comptime { - const len: usize = calcUtf16LeLen(utf8); + const len: usize = calcUtf16LeLen(utf8) catch |err| @compileError(err); var utf16le: [len:0]u16 = [_:0]u16{0} ** len; const utf16le_len = utf8ToUtf16Le(&utf16le, utf8[0..]) catch |err| @compileError(err); assert(len == utf16le_len); @@ -758,13 +794,17 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le } } -fn calcUtf16LeLen(utf8: []const u8) usize { +const CalcUtf16LeLenError = Utf8DecodeError || error{Utf8InvalidStartByte}; + +/// Returns length in UTF-16 of UTF-8 slice as length of []u16. +/// Length in []u8 is 2*len16. +pub fn calcUtf16LeLen(utf8: []const u8) CalcUtf16LeLenError!usize { var src_i: usize = 0; var dest_len: usize = 0; while (src_i < utf8.len) { - const n = utf8ByteSequenceLength(utf8[src_i]) catch unreachable; + const n = try utf8ByteSequenceLength(utf8[src_i]); const next_src_i = src_i + n; - const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch unreachable; + const codepoint = try utf8Decode(utf8[src_i..next_src_i]); if (codepoint < 0x10000) { dest_len += 1; } else { @@ -775,6 +815,19 @@ fn calcUtf16LeLen(utf8: []const u8) usize { return dest_len; } +fn testCalcUtf16LeLen() !void { + try testing.expectEqual(@as(usize, 1), try calcUtf16LeLen("a")); + try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("abcdefghij")); + try testing.expectEqual(@as(usize, 10), try calcUtf16LeLen("äåéëþüúíóö")); + try testing.expectEqual(@as(usize, 5), try calcUtf16LeLen("こんにちは")); + // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80")); +} + +test "calculate utf16 string length of given utf8 string in u16" { + try testCalcUtf16LeLen(); + comptime try testCalcUtf16LeLen(); +} + /// Print the given `utf16le` string fn formatUtf16le( utf16le: []const u16,