From edb5deb39cc923d6fe2d3de507757d8ba9d74d68 Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sat, 28 Dec 2019 14:45:52 +1100 Subject: [PATCH] std: unicode codepoints are 21 bits --- lib/std/unicode.zig | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 52fd5889e4..4dfe1c87d3 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -6,7 +6,7 @@ const mem = std.mem; /// Returns how many bytes the UTF-8 representation would require /// for the given codepoint. -pub fn utf8CodepointSequenceLength(c: u32) !u3 { +pub fn utf8CodepointSequenceLength(c: u21) !u3 { if (c < 0x80) return @as(u3, 1); if (c < 0x800) return @as(u3, 2); if (c < 0x10000) return @as(u3, 3); @@ -32,7 +32,7 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { /// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c). /// Errors: if c cannot be encoded in UTF-8. /// Returns: the number of bytes written to out. -pub fn utf8Encode(c: u32, out: []u8) !u3 { +pub fn utf8Encode(c: u21, out: []u8) !u3 { const length = try utf8CodepointSequenceLength(c); assert(out.len >= length); switch (length) { @@ -68,9 +68,9 @@ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. /// If you already know the length at comptime, you can call one of /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { +pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 { return switch (bytes.len) { - 1 => @as(u32, bytes[0]), + 1 => @as(u21, bytes[0]), 2 => utf8Decode2(bytes), 3 => utf8Decode3(bytes), 4 => utf8Decode4(bytes), @@ -82,10 +82,10 @@ const Utf8Decode2Error = error{ Utf8ExpectedContinuation, Utf8OverlongEncoding, }; -pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { +pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 { assert(bytes.len == 2); assert(bytes[0] & 0b11100000 == 0b11000000); - var value: u32 = bytes[0] & 0b00011111; + var value: u21 = bytes[0] & 0b00011111; if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; value <<= 6; @@ -101,10 +101,10 @@ const Utf8Decode3Error = error{ Utf8OverlongEncoding, Utf8EncodesSurrogateHalf, }; -pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { +pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 { assert(bytes.len == 3); assert(bytes[0] & 0b11110000 == 0b11100000); - var value: u32 = bytes[0] & 0b00001111; + var value: u21 = bytes[0] & 0b00001111; if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; value <<= 6; @@ -125,10 +125,10 @@ const Utf8Decode4Error = error{ Utf8OverlongEncoding, Utf8CodepointTooLarge, }; -pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { +pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { assert(bytes.len == 4); assert(bytes[0] & 0b11111000 == 0b11110000); - var value: u32 = bytes[0] & 0b00000111; + var value: u21 = bytes[0] & 0b00000111; if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; value <<= 6; @@ -224,11 +224,11 @@ pub const Utf8Iterator = struct { return it.bytes[it.i - cp_len .. it.i]; } - pub fn nextCodepoint(it: *Utf8Iterator) ?u32 { + pub fn nextCodepoint(it: *Utf8Iterator) ?u21 { const slice = it.nextCodepointSlice() orelse return null; switch (slice.len) { - 1 => return @as(u32, slice[0]), + 1 => return @as(u21, slice[0]), 2 => return utf8Decode2(slice) catch unreachable, 3 => return utf8Decode3(slice) catch unreachable, 4 => return utf8Decode4(slice) catch unreachable, @@ -248,19 +248,19 @@ pub const Utf16LeIterator = struct { }; } - pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 { + pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 { assert(it.i <= it.bytes.len); if (it.i == it.bytes.len) return null; - const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); - if (c0 & ~@as(u32, 0x03ff) == 0xd800) { + const c0: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); + if (c0 & ~@as(u21, 0x03ff) == 0xd800) { // surrogate pair it.i += 2; if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf; - const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); - if (c1 & ~@as(u32, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; + const c1: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); + if (c1 & ~@as(u21, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; it.i += 2; return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); - } else if (c0 & ~@as(u32, 0x03ff) == 0xdc00) { + } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) { return error.UnexpectedSecondSurrogateHalf; } else { it.i += 2; @@ -304,10 +304,10 @@ fn testUtf8EncodeError() void { testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); - testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); + testErrorEncode(0x1fffff, array[0..], error.CodepointTooLarge); } -fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void { +fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void { testing.expectError(expectedErr, utf8Encode(codePoint, array)); } @@ -455,11 +455,11 @@ fn testError(bytes: []const u8, expected_err: anyerror) void { testing.expectError(expected_err, testDecode(bytes)); } -fn testValid(bytes: []const u8, expected_codepoint: u32) void { +fn testValid(bytes: []const u8, expected_codepoint: u21) void { testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint); } -fn testDecode(bytes: []const u8) !u32 { +fn testDecode(bytes: []const u8) !u21 { const length = try utf8ByteSequenceLength(bytes[0]); if (bytes.len < length) return error.UnexpectedEof; testing.expect(bytes.len == length);