diff --git a/std/unicode.zig b/std/unicode.zig
index 37a73d7500..6d47675ac3 100644
--- a/std/unicode.zig
+++ b/std/unicode.zig
@@ -560,18 +560,50 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16
 }
 
 /// Returns index of next character. If exact fit, returned index equals output slice length.
-/// If ran out of room, returned index equals output slice length + 1.
+/// Assumes there is enough space for the output.
+/// Every element written to `utf16le` is stored in little-endian byte order,
+/// independent of the host's endianness.
+/// Returns `error.Utf8InvalidStartByte` when a byte cannot start a UTF-8 sequence
+/// or when the sequence it announces runs past the end of `utf8`; errors from
+/// `utf8Decode` are propagated unchanged.
 /// TODO support codepoints bigger than 16 bits
 pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
-    const utf16le_as_bytes = @sliceToBytes(utf16le[0..]);
-    var end_index: usize = 0;
-
-    var it = (try Utf8View.init(utf8)).iterator();
-    while (it.nextCodepoint()) |codepoint| {
-        if (end_index == utf16le_as_bytes.len) return (end_index / 2) + 1;
-        // TODO surrogate pairs
-        mem.writeIntSliceLittle(u16, utf16le_as_bytes[end_index..], @intCast(u16, codepoint));
-        end_index += 2;
+    var dest_i: usize = 0;
+    var src_i: usize = 0;
+    while (src_i < utf8.len) {
+        const byte = utf8[src_i];
+        // The number of leading one bits in the start byte gives the sequence length:
+        // 0 for ASCII, 2..4 for multi-byte sequences; 1 marks a continuation byte.
+        const n = @clz(u8, ~byte);
+        switch (n) {
+            0 => {
+                // ASCII fast path. Widen first so the value goes through the same
+                // endian conversion as the multi-byte path below.
+                const short: u16 = byte;
+                utf16le[dest_i] = switch (builtin.endian) {
+                    .Little => short,
+                    .Big => @byteSwap(u16, short),
+                };
+                dest_i += 1;
+                src_i += 1;
+                continue;
+            },
+            2, 3, 4 => {
+                const next_src_i = src_i + n;
+                // Reject a sequence that is cut off by the end of the input
+                // instead of slicing out of bounds.
+                if (next_src_i > utf8.len) return error.Utf8InvalidStartByte;
+                const codepoint = try utf8Decode(utf8[src_i..next_src_i]);
+                const short = @intCast(u16, codepoint); // TODO surrogate pairs
+                utf16le[dest_i] = switch (builtin.endian) {
+                    .Little => short,
+                    .Big => @byteSwap(u16, short),
+                };
+                dest_i += 1;
+                src_i = next_src_i;
+            },
+            else => return error.Utf8InvalidStartByte,
+        }
     }
-    return end_index / 2;
+    return dest_i;
 }
diff --git a/std/unicode/throughput_test.zig b/std/unicode/throughput_test.zig
new file mode 100644
index 0000000000..f8b18af734
--- /dev/null
+++ b/std/unicode/throughput_test.zig
@@ -0,0 +1,44 @@
+const builtin = @import("builtin");
+const std = @import("std");
+
+/// Times two UTF-8 to UTF-16LE conversions of the first command line argument
+/// and reports the elapsed time of each through `std.debug.warn`.
+pub fn main() !void {
+    var stdout_file = try std.io.getStdOut();
+    var stdout_out_stream = stdout_file.outStream();
+    const stdout = &stdout_out_stream.stream;
+
+    const args = try std.process.argsAlloc(std.heap.direct_allocator);
+    // The input text is args[1]; fail with an error instead of indexing past the end.
+    if (args.len < 2) return error.MissingInputArgument;
+
+    // The fences presumably keep the timer reads from being reordered around the measured call.
+    @fence(.SeqCst);
+    var timer = try std.time.Timer.start();
+    @fence(.SeqCst);
+
+    var buffer1: [32767]u16 = undefined;
+    _ = try std.unicode.utf8ToUtf16Le(&buffer1, args[1]);
+
+    @fence(.SeqCst);
+    const elapsed_ns_orig = timer.lap();
+    @fence(.SeqCst);
+
+    var buffer2: [32767]u16 = undefined;
+    // NOTE(review): `utf8ToUtf16Le_better` is not added by this patch; confirm it exists in std.unicode.
+    _ = try std.unicode.utf8ToUtf16Le_better(&buffer2, args[1]);
+
+    @fence(.SeqCst);
+    const elapsed_ns_better = timer.lap();
+    @fence(.SeqCst);
+
+    std.debug.warn("original utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", elapsed_ns_orig, elapsed_ns_orig / 1000000);
+    std.debug.warn("new utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", elapsed_ns_better, elapsed_ns_better / 1000000);
+    // Presumably keeps both output buffers observable so the conversions are not optimized away.
+    asm volatile ("nop"
+        :
+        : [a] "r" (&buffer1),
+          [b] "r" (&buffer2)
+        : "memory"
+    );
+}