mirror of
https://github.com/ziglang/zig.git
synced 2026-01-20 22:35:24 +00:00
implement proper utf16leToUtf8
This commit is contained in:
parent
cbfe9a4077
commit
843529d234
@ -4,6 +4,7 @@ const io = std.io;
|
||||
const mem = std.mem;
|
||||
const Allocator = mem.Allocator;
|
||||
const Buffer = std.Buffer;
|
||||
const unicode = std.unicode;
|
||||
const llvm = @import("llvm.zig");
|
||||
const c = @import("c.zig");
|
||||
const builtin = @import("builtin");
|
||||
@ -998,14 +999,90 @@ test "getAppDataDir" {
|
||||
std.debug.warn("{}...", result);
|
||||
}
|
||||
|
||||
// TODO full utf-16 LE support
|
||||
// TODO: put general purpose stuff in std.unicode
|
||||
fn utf16leToUtf8(allocator: *mem.Allocator, utf16le: []const u16) ![]u8 {
|
||||
const utf8_bytes = try allocator.alloc(u8, utf16le.len);
|
||||
for (utf16le) |codepoint, i| {
|
||||
assert(codepoint < 127); // TODO full utf-16 LE support
|
||||
utf8_bytes[i] = @intCast(u8, codepoint);
|
||||
var result = ArrayList(u8).init(allocator);
|
||||
// optimistically guess that it will all be ascii.
|
||||
try result.ensureCapacity(utf16le.len);
|
||||
|
||||
const utf16le_as_bytes = @sliceToBytes(utf16le);
|
||||
var i: usize = 0;
|
||||
var out_index: usize = 0;
|
||||
while (i < utf16le_as_bytes.len) : (i += 2) {
|
||||
// decode
|
||||
const c0: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]);
|
||||
var codepoint: u32 = undefined;
|
||||
if (c0 & ~u32(0x03ff) == 0xd800) {
|
||||
// surrogate pair
|
||||
i += 2;
|
||||
if (i >= utf16le_as_bytes.len) return error.DanglingSurrogateHalf;
|
||||
const c1: u32 = mem.readIntLE(u16, utf16le_as_bytes[i..i + 2]);
|
||||
if (c1 & ~u32(0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
|
||||
codepoint = 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
|
||||
} else if (c0 & ~u32(0x03ff) == 0xdc00) {
|
||||
return error.UnexpectedSecondSurrogateHalf;
|
||||
} else {
|
||||
codepoint = c0;
|
||||
}
|
||||
|
||||
// encode
|
||||
const utf8_len = unicode.utf8CodepointSequenceLength(codepoint) catch unreachable;
|
||||
try result.resize(result.len + utf8_len);
|
||||
_ = unicode.utf8Encode(codepoint, result.items[out_index..]) catch unreachable;
|
||||
out_index += utf8_len;
|
||||
}
|
||||
|
||||
return result.toOwnedSlice();
|
||||
}
|
||||
|
||||
test "utf16leToUtf8" {
|
||||
var utf16le: [2]u16 = undefined;
|
||||
const utf16le_as_bytes = @sliceToBytes(utf16le[0..]);
|
||||
|
||||
{
|
||||
mem.writeInt(utf16le_as_bytes[0..], u16('A'), builtin.Endian.Little);
|
||||
mem.writeInt(utf16le_as_bytes[2..], u16('a'), builtin.Endian.Little);
|
||||
const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
|
||||
assert(mem.eql(u8, utf8, "Aa"));
|
||||
}
|
||||
|
||||
{
|
||||
mem.writeInt(utf16le_as_bytes[0..], u16(0x80), builtin.Endian.Little);
|
||||
mem.writeInt(utf16le_as_bytes[2..], u16(0xffff), builtin.Endian.Little);
|
||||
const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
|
||||
assert(mem.eql(u8, utf8, "\xc2\x80" ++ "\xef\xbf\xbf"));
|
||||
}
|
||||
|
||||
{
|
||||
// the values just outside the surrogate half range
|
||||
mem.writeInt(utf16le_as_bytes[0..], u16(0xd7ff), builtin.Endian.Little);
|
||||
mem.writeInt(utf16le_as_bytes[2..], u16(0xe000), builtin.Endian.Little);
|
||||
const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
|
||||
assert(mem.eql(u8, utf8, "\xed\x9f\xbf" ++ "\xee\x80\x80"));
|
||||
}
|
||||
|
||||
{
|
||||
// smallest surrogate pair
|
||||
mem.writeInt(utf16le_as_bytes[0..], u16(0xd800), builtin.Endian.Little);
|
||||
mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little);
|
||||
const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
|
||||
assert(mem.eql(u8, utf8, "\xf0\x90\x80\x80"));
|
||||
}
|
||||
|
||||
{
|
||||
// largest surrogate pair
|
||||
mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little);
|
||||
mem.writeInt(utf16le_as_bytes[2..], u16(0xdfff), builtin.Endian.Little);
|
||||
const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
|
||||
assert(mem.eql(u8, utf8, "\xf4\x8f\xbf\xbf"));
|
||||
}
|
||||
|
||||
{
|
||||
mem.writeInt(utf16le_as_bytes[0..], u16(0xdbff), builtin.Endian.Little);
|
||||
mem.writeInt(utf16le_as_bytes[2..], u16(0xdc00), builtin.Endian.Little);
|
||||
const utf8 = try utf16leToUtf8(std.debug.global_allocator, utf16le);
|
||||
assert(mem.eql(u8, utf8, "\xf4\x8f\xb0\x80"));
|
||||
}
|
||||
return utf8_bytes;
|
||||
}
|
||||
|
||||
fn utf16lePtrSlice(ptr: [*]const u16) []const u16 {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user