From 03117c52905e2a18cbfb5a1aeb50e319a1ec4fba Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Mon, 30 Oct 2023 16:38:40 -0700 Subject: [PATCH 1/2] std.unicode: Add ASCII fast path to UTF-8 -> UTF-16 conversion functions --- lib/std/unicode.zig | 51 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index e4a84b42c9..619d9e645f 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -2,6 +2,7 @@ const std = @import("./std.zig"); const assert = std.debug.assert; const testing = std.testing; const mem = std.mem; +const builtin = @import("builtin"); /// Use this to replace an unknown, unrecognized, or unrepresentable character. /// @@ -863,7 +864,27 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1 var result = try std.ArrayList(u16).initCapacity(allocator, utf8.len + 1); errdefer result.deinit(); - const view = try Utf8View.init(utf8); + var remaining = utf8; + if (builtin.zig_backend != .stage2_x86_64) { + const chunk_len = std.simd.suggestVectorSize(u8) orelse 1; + const Chunk = @Vector(chunk_len, u8); + + // Fast path. Check for and encode ASCII characters at the start of the input. + while (remaining.len >= chunk_len) { + const chunk: Chunk = remaining[0..chunk_len].*; + const mask: Chunk = @splat(0x80); + if (@reduce(.Or, chunk & mask == mask)) { + // found a non ASCII code unit + break; + } + const zeroes: Chunk = @splat(0); + const utf16_chunk: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes }); + result.appendSliceAssumeCapacity(std.mem.bytesAsSlice(u16, &utf16_chunk)); + remaining = remaining[chunk_len..]; + } + } + + const view = try Utf8View.init(remaining); var it = view.iterator(); while (it.nextCodepoint()) |codepoint| { if (codepoint < 0x10000) { @@ -886,11 +907,33 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1 /// Assumes there is enough space for the output. pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { var dest_i: usize = 0; + + var remaining = utf8; + if (builtin.zig_backend != .stage2_x86_64) { + const chunk_len = std.simd.suggestVectorSize(u8) orelse 1; + const Chunk = @Vector(chunk_len, u8); + + // Fast path. Check for and encode ASCII characters at the start of the input. + while (remaining.len >= chunk_len) { + const chunk: Chunk = remaining[0..chunk_len].*; + const mask: Chunk = @splat(0x80); + if (@reduce(.Or, chunk & mask == mask)) { + // found a non ASCII code unit + break; + } + const zeroes: Chunk = @splat(0); + const utf16_bytes: [chunk_len * 2]u8 align(@alignOf(u16)) = std.simd.interlace(.{ chunk, zeroes }); + @memcpy(utf16le[dest_i..][0..chunk_len], std.mem.bytesAsSlice(u16, &utf16_bytes)); + dest_i += chunk_len; + remaining = remaining[chunk_len..]; + } + } + var src_i: usize = 0; - while (src_i < utf8.len) { - const n = utf8ByteSequenceLength(utf8[src_i]) catch return error.InvalidUtf8; + while (src_i < remaining.len) { + const n = utf8ByteSequenceLength(remaining[src_i]) catch return error.InvalidUtf8; const next_src_i = src_i + n; - const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8; + const codepoint = utf8Decode(remaining[src_i..next_src_i]) catch return error.InvalidUtf8; if (codepoint < 0x10000) { const short = @as(u16, @intCast(codepoint)); utf16le[dest_i] = mem.nativeToLittle(u16, short); From 13c8ec9db0365225e717140d282350be922a205d Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Mon, 30 Oct 2023 17:12:30 -0700 Subject: [PATCH 2/2] std.unicode: Add ASCII fast path to UTF-16 -> UTF-8 conversion functions --- lib/std/unicode.zig | 86 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 5 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 619d9e645f..4a4a1b10d1 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -757,8 +757,34 @@ pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8 // optimistically guess that it will all be ascii. var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len); errdefer result.deinit(); - var out_index: usize = 0; - var it = Utf16LeIterator.init(utf16le); + + var remaining = utf16le; + if (builtin.zig_backend != .stage2_x86_64) { + const chunk_len = std.simd.suggestVectorSize(u16) orelse 1; + const Chunk = @Vector(chunk_len, u16); + + // Fast path. Check for and encode ASCII characters at the start of the input. + while (remaining.len >= chunk_len) { + const chunk: Chunk = remaining[0..chunk_len].*; + const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F)); + if (@reduce(.Or, chunk | mask != mask)) { + // found a non ASCII code unit + break; + } + const chunk_byte_len = chunk_len * 2; + const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*; + const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes); + const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0]; + // We allocated enough space to encode every UTF-16 code unit + // as ASCII, so if the entire string is ASCII then we are + // guaranteed to have enough space allocated + result.appendSliceAssumeCapacity(&ascii_bytes); + remaining = remaining[chunk_len..]; + } + } + + var out_index: usize = result.items.len; + var it = Utf16LeIterator.init(remaining); while (try it.nextCodepoint()) |codepoint| { const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable; try result.resize(result.items.len + utf8_len); @@ -774,8 +800,34 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0] // optimistically guess that it will all be ascii (and allocate space for the null terminator) var result = try std.ArrayList(u8).initCapacity(allocator, utf16le.len + 1); errdefer result.deinit(); - var out_index: usize = 0; - var it = Utf16LeIterator.init(utf16le); + + var remaining = utf16le; + if (builtin.zig_backend != .stage2_x86_64) { + const chunk_len = std.simd.suggestVectorSize(u16) orelse 1; + const Chunk = @Vector(chunk_len, u16); + + // Fast path. Check for and encode ASCII characters at the start of the input. + while (remaining.len >= chunk_len) { + const chunk: Chunk = remaining[0..chunk_len].*; + const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F)); + if (@reduce(.Or, chunk | mask != mask)) { + // found a non ASCII code unit + break; + } + const chunk_byte_len = chunk_len * 2; + const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*; + const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes); + const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0]; + // We allocated enough space to encode every UTF-16 code unit + // as ASCII, so if the entire string is ASCII then we are + // guaranteed to have enough space allocated + result.appendSliceAssumeCapacity(&ascii_bytes); + remaining = remaining[chunk_len..]; + } + } + + var out_index = result.items.len; + var it = Utf16LeIterator.init(remaining); while (try it.nextCodepoint()) |codepoint| { const utf8_len = utf8CodepointSequenceLength(codepoint) catch unreachable; try result.resize(result.items.len + utf8_len); @@ -789,7 +841,31 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0] /// Returns end byte index into utf8. pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize { var end_index: usize = 0; - var it = Utf16LeIterator.init(utf16le); + + var remaining = utf16le; + if (builtin.zig_backend != .stage2_x86_64) { + const chunk_len = std.simd.suggestVectorSize(u16) orelse 1; + const Chunk = @Vector(chunk_len, u16); + + // Fast path. Check for and encode ASCII characters at the start of the input. + while (remaining.len >= chunk_len) { + const chunk: Chunk = remaining[0..chunk_len].*; + const mask: Chunk = @splat(std.mem.nativeToLittle(u16, 0x7F)); + if (@reduce(.Or, chunk | mask != mask)) { + // found a non ASCII code unit + break; + } + const chunk_byte_len = chunk_len * 2; + const chunk_bytes: @Vector(chunk_byte_len, u8) = (std.mem.sliceAsBytes(remaining)[0..chunk_byte_len]).*; + const deinterlaced_bytes = std.simd.deinterlace(2, chunk_bytes); + const ascii_bytes: [chunk_len]u8 = deinterlaced_bytes[0]; + @memcpy(utf8[end_index .. end_index + chunk_len], &ascii_bytes); + end_index += chunk_len; + remaining = remaining[chunk_len..]; + } + } + + var it = Utf16LeIterator.init(remaining); while (try it.nextCodepoint()) |codepoint| { end_index += try utf8Encode(codepoint, utf8[end_index..]); }