diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1cf2f2d8e6..c3a2d6bf7d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -605,6 +605,7 @@ install(FILES "${CMAKE_SOURCE_DIR}/std/os/windows/index.zig" DESTINATION "${ZIG_
 install(FILES "${CMAKE_SOURCE_DIR}/std/os/windows/util.zig" DESTINATION "${ZIG_STD_DEST}/os/windows")
 install(FILES "${CMAKE_SOURCE_DIR}/std/rand.zig" DESTINATION "${ZIG_STD_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/std/sort.zig" DESTINATION "${ZIG_STD_DEST}")
+install(FILES "${CMAKE_SOURCE_DIR}/std/unicode.zig" DESTINATION "${ZIG_STD_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/std/special/bootstrap.zig" DESTINATION "${ZIG_STD_DEST}/special")
 install(FILES "${CMAKE_SOURCE_DIR}/std/special/bootstrap_lib.zig" DESTINATION "${ZIG_STD_DEST}/special")
 install(FILES "${CMAKE_SOURCE_DIR}/std/special/build_file_template.zig" DESTINATION "${ZIG_STD_DEST}/special")
diff --git a/build.zig b/build.zig
index cfc83cf424..7c0570bf46 100644
--- a/build.zig
+++ b/build.zig
@@ -276,6 +276,7 @@ pub fn installStdLib(b: &Builder) {
         "os/windows/util.zig",
         "rand.zig",
         "sort.zig",
+        "unicode.zig",
         "special/bootstrap.zig",
         "special/bootstrap_lib.zig",
         "special/build_file_template.zig",
diff --git a/doc/langref.html.in b/doc/langref.html.in
index c35e326254..84f03e8f84 100644
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@@ -298,7 +298,7 @@ pub fn main() -> %void {
  • Ascii control characters, except for U+000a (LF): U+0000 - U+0009, U+000b - U+001f, U+007f. (Note that Windows line endings (CRLF) are not allowed, and hard tabs are not allowed.)
  • Non-Ascii Unicode line endings: U+0085 (NEL), U+2028 (LS), U+2029 (PS).

-   The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code. A non-empty zig source must end with the line terminator character.
+   The codepoint U+000a (LF) (which is encoded as the single-byte value 0x0a) is the line terminator character. This character always terminates a line of zig source code (except possibly the last line of the file).

    For some discussion on the rationale behind these design decisions, see issue #663

    Values

    const warn = @import("std").debug.warn;
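
The character rules above reduce to a small predicate over decoded codepoints. As a rough sketch in the same Zig dialect as the rest of this diff (isInvalidSourceCodepoint is a hypothetical helper for illustration, not part of this change):

    // Hypothetical sketch of the source-character rules described above.
    fn isInvalidSourceCodepoint(c: u32) -> bool {
        // Ascii control characters, except for U+000a (LF).
        if (c < 0x20 and c != '\n') return true;
        if (c == 0x7f) return true;
        // Non-Ascii Unicode line endings: U+0085 (NEL), U+2028 (LS), U+2029 (PS).
        return c == 0x85 or c == 0x2028 or c == 0x2029;
    }
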
    diff --git a/src-self-hosted/module.zig b/src-self-hosted/module.zig
    index a0cbe9c864..a8098217ec 100644
    --- a/src-self-hosted/module.zig
    +++ b/src-self-hosted/module.zig
    @@ -213,11 +213,14 @@ pub const Module = struct {
             };
             %defer self.allocator.free(root_src_real_path);
     
    -        const source_code = io.readFileAlloc(root_src_real_path, self.allocator) %% |err| {
    +        const source_code = io.readFileAllocExtra(root_src_real_path, self.allocator, 3) %% |err| {
                 %return printError("unable to open '{}': {}", root_src_real_path, err);
                 return err;
             };
             %defer self.allocator.free(source_code);
    +        source_code[source_code.len - 3] = '\n';
    +        source_code[source_code.len - 2] = '\n';
    +        source_code[source_code.len - 1] = '\n';
     
             warn("====input:====\n");
     
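
The invariant introduced here is that every buffer handed to the tokenizer ends with "\n\n\n"; the hunk above establishes it at the read site. A minimal sketch of the same pad-on-read pattern as a standalone helper (readPaddedSource is hypothetical, shown only to illustrate the pattern):

    // Hypothetical helper illustrating the pad-on-read pattern above.
    fn readPaddedSource(path: []const u8, allocator: &mem.Allocator) -> %[]u8 {
        // readFileAllocExtra leaves the 3 extra bytes uninitialized.
        const source = %return io.readFileAllocExtra(path, allocator, 3);
        source[source.len - 3] = '\n';
        source[source.len - 2] = '\n';
        source[source.len - 1] = '\n';
        return source;
    }
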
    diff --git a/src-self-hosted/tokenizer.zig b/src-self-hosted/tokenizer.zig
    index 49225447a8..de2fbdc1ee 100644
    --- a/src-self-hosted/tokenizer.zig
    +++ b/src-self-hosted/tokenizer.zig
    @@ -70,7 +70,6 @@ pub const Token = struct {
             Identifier,
             StringLiteral: StrLitKind,
             Eof,
    -        NoEolAtEof,
             Builtin,
             Bang,
             Equal,
    @@ -140,7 +139,6 @@ pub const Token = struct {
     pub const Tokenizer = struct {
         buffer: []const u8,
         index: usize,
    -    actual_file_end: usize,
         pending_invalid_token: ?Token,
     
         pub const Location = struct {
    @@ -179,17 +177,15 @@ pub const Tokenizer = struct {
             std.debug.warn("{} \"{}\"\n", @tagName(token.id), self.buffer[token.start..token.end]);
         }
     
     +    /// buffer must end with "\n\n\n". This is so that attempting to decode
     +    /// the 3 trailing bytes of a 4-byte utf8 sequence is never a buffer overflow.
         pub fn init(buffer: []const u8) -> Tokenizer {
    -        var source_len = buffer.len;
    -        while (source_len > 0) : (source_len -= 1) {
    -            if (buffer[source_len - 1] == '\n') break;
    -            // last line is incomplete, so skip it, and give an error when we get there.
    -        }
    -
    +        std.debug.assert(buffer[buffer.len - 1] == '\n');
    +        std.debug.assert(buffer[buffer.len - 2] == '\n');
    +        std.debug.assert(buffer[buffer.len - 3] == '\n');
             return Tokenizer {
    -            .buffer = buffer[0..source_len],
    +            .buffer = buffer,
                 .index = 0,
    -            .actual_file_end = buffer.len,
                 .pending_invalid_token = null,
             };
         }
    @@ -512,17 +508,14 @@ pub const Tokenizer = struct {
                 }
             }
             result.end = self.index;
    +
             if (result.id == Token.Id.Eof) {
                 if (self.pending_invalid_token) |token| {
                     self.pending_invalid_token = null;
                     return token;
                 }
    -            if (self.actual_file_end != self.buffer.len) {
    -                // instead of an Eof, give an error token
    -                result.id = Token.Id.NoEolAtEof;
    -                result.end = self.actual_file_end;
    -            }
             }
    +
             return result;
         }
     
    @@ -553,161 +546,96 @@ pub const Tokenizer = struct {
                 return 0;
             } else {
                 // check utf8-encoded character.
    -            // remember that the last byte in the buffer is guaranteed to be '\n',
    -            // which means we really don't need to do bounds checks here,
    -            // as long as we check one byte at a time for being a continuation byte.
    -            var value: u32 = undefined;
    -            var length: u3 = undefined;
    -            if      (c0 & 0b11100000 == 0b11000000) {value = c0 & 0b00011111; length = 2;}
    -            else if (c0 & 0b11110000 == 0b11100000) {value = c0 & 0b00001111; length = 3;}
    -            else if (c0 & 0b11111000 == 0b11110000) {value = c0 & 0b00000111; length = 4;}
    -            else return 1; // unexpected continuation or too many leading 1's
    -
    -            const c1 = self.buffer[self.index + 1];
    -            if (c1 & 0b11000000 != 0b10000000) return 1; // expected continuation
    -            value <<= 6;
    -            value |= c1 & 0b00111111;
    -            if (length == 2) {
    -                if (value < 0x80) return length; // overlong
    -                if (value == 0x85) return length; // U+0085 (NEL)
    -                self.index += length - 1;
    -                return 0;
    +            const length = std.unicode.utf8ByteSequenceLength(c0) %% return 1;
    +            // the last 3 bytes in the buffer are guaranteed to be '\n',
    +            // which means we don't need to do any bounds checking here.
    +            const bytes = self.buffer[self.index..self.index + length];
    +            switch (length) {
    +                2 => {
    +                    const value = std.unicode.utf8Decode2(bytes) %% return length;
    +                    if (value == 0x85) return length; // U+0085 (NEL)
    +                },
    +                3 => {
    +                    const value = std.unicode.utf8Decode3(bytes) %% return length;
    +                    if (value == 0x2028) return length; // U+2028 (LS)
    +                    if (value == 0x2029) return length; // U+2029 (PS)
    +                },
    +                4 => {
    +                    _ = std.unicode.utf8Decode4(bytes) %% return length;
    +                },
    +                else => unreachable,
                 }
    -            const c2 = self.buffer[self.index + 2];
    -            if (c2 & 0b11000000 != 0b10000000) return 2; // expected continuation
    -            value <<= 6;
    -            value |= c2 & 0b00111111;
    -            if (length == 3) {
    -                if (value < 0x800) return length; // overlong
    -                if (value == 0x2028) return length; // U+2028 (LS)
    -                if (value == 0x2029) return length; // U+2029 (PS)
    -                if (0xd800 <= value and value <= 0xdfff) return length; // surrogate halves not allowed in utf8
    -                self.index += length - 1;
    -                return 0;
    -            }
    -            const c3 = self.buffer[self.index + 3];
    -            if (c3 & 0b11000000 != 0b10000000) return 3; // expected continuation
    -            value <<= 6;
    -            value |= c3 & 0b00111111;
    -            if (length == 4) {
    -                if (value < 0x10000) return length; // overlong
    -                if (value > 0x10FFFF) return length; // out of bounds
    -                self.index += length - 1;
    -                return 0;
    -            }
    -            unreachable;
    +            self.index += length - 1;
    +            return 0;
             }
         }
     };
     
     
     
    -test "tokenizer - source must end with eol" {
    -    testTokenizeWithEol("", []Token.Id {
    -    }, true);
    -    testTokenizeWithEol("no newline", []Token.Id {
    -    }, false);
    -    testTokenizeWithEol("test\n", []Token.Id {
    +test "tokenizer" {
    +    testTokenize("test", []Token.Id {
             Token.Id.Keyword_test,
    -    }, true);
    -    testTokenizeWithEol("test\nno newline", []Token.Id {
    -        Token.Id.Keyword_test,
    -    }, false);
    +    });
     }
     
     test "tokenizer - invalid token characters" {
    -    testTokenize("#\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("`\n", []Token.Id{Token.Id.Invalid});
    +    testTokenize("#", []Token.Id{Token.Id.Invalid});
    +    testTokenize("`", []Token.Id{Token.Id.Invalid});
     }
     
     test "tokenizer - invalid literal/comment characters" {
    -    testTokenize("\"\x00\"\n", []Token.Id {
    +    testTokenize("\"\x00\"", []Token.Id {
             Token.Id { .StringLiteral = Token.StrLitKind.Normal },
             Token.Id.Invalid,
         });
    -    testTokenize("//\x00\n", []Token.Id {
    +    testTokenize("//\x00", []Token.Id {
             Token.Id.Invalid,
         });
    -    testTokenize("//\x1f\n", []Token.Id {
    +    testTokenize("//\x1f", []Token.Id {
             Token.Id.Invalid,
         });
    -    testTokenize("//\x7f\n", []Token.Id {
    +    testTokenize("//\x7f", []Token.Id {
             Token.Id.Invalid,
         });
     }
     
    -test "tokenizer - valid unicode" {
    -    testTokenize("//\xc2\x80\n", []Token.Id{});
    -    testTokenize("//\xdf\xbf\n", []Token.Id{});
    -    testTokenize("//\xe0\xa0\x80\n", []Token.Id{});
    -    testTokenize("//\xe1\x80\x80\n", []Token.Id{});
    -    testTokenize("//\xef\xbf\xbf\n", []Token.Id{});
    -    testTokenize("//\xf0\x90\x80\x80\n", []Token.Id{});
    -    testTokenize("//\xf1\x80\x80\x80\n", []Token.Id{});
    -    testTokenize("//\xf3\xbf\xbf\xbf\n", []Token.Id{});
    -    testTokenize("//\xf4\x8f\xbf\xbf\n", []Token.Id{});
    +test "tokenizer - utf8" {
    +    testTokenize("//\xc2\x80", []Token.Id{});
    +    testTokenize("//\xf4\x8f\xbf\xbf", []Token.Id{});
     }
     
    -test "tokenizer - invalid unicode continuation bytes" {
    -    // unexpected continuation
    -    testTokenize("//\x80\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xbf\n", []Token.Id{Token.Id.Invalid});
    -    // too many leading 1's
    -    testTokenize("//\xf8\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xff\n", []Token.Id{Token.Id.Invalid});
    -    // expected continuation for 2 byte sequences
    -    testTokenize("//\xc2\x00\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xc2\xc0\n", []Token.Id{Token.Id.Invalid});
    -    // expected continuation for 3 byte sequences
    -    testTokenize("//\xe0\x00\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe0\xc0\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe0\xa0\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe0\xa0\x00\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe0\xa0\xc0\n", []Token.Id{Token.Id.Invalid});
    -    // expected continuation for 4 byte sequences
    -    testTokenize("//\xf0\x00\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\xc0\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\x90\x00\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\x90\xc0\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\x90\x80\x00\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\x90\x80\xc0\n", []Token.Id{Token.Id.Invalid});
    +test "tokenizer - invalid utf8" {
    +    testTokenize("//\x80", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xbf", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xf8", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xff", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xc2\xc0", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xe0", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xf0", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xf0\x90\x80\xc0", []Token.Id{Token.Id.Invalid});
     }
     
    -test "tokenizer - overlong utf8 codepoint" {
    -    testTokenize("//\xc0\x80\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xc1\xbf\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe0\x80\x80\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe0\x9f\xbf\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\x80\x80\x80\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf0\x8f\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
    -}
    -
    -test "tokenizer - misc invalid utf8" {
    -    // codepoint out of bounds
    -    testTokenize("//\xf4\x90\x80\x80\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xf7\xbf\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
    +test "tokenizer - illegal unicode codepoints" {
          // unicode newline characters: U+0085, U+2028, U+2029
    -    testTokenize("//\xc2\x84\n", []Token.Id{});
    -    testTokenize("//\xc2\x85\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xc2\x86\n", []Token.Id{});
    -    testTokenize("//\xe2\x80\xa7\n", []Token.Id{});
    -    testTokenize("//\xe2\x80\xa8\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe2\x80\xa9\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xe2\x80\xaa\n", []Token.Id{});
    -    // surrogate halves
    -    testTokenize("//\xed\x9f\x80\n", []Token.Id{});
    -    testTokenize("//\xed\xa0\x80\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xed\xbf\xbf\n", []Token.Id{Token.Id.Invalid});
    -    testTokenize("//\xee\x80\x80\n", []Token.Id{});
    -    // surrogate halves are invalid, even in surrogate pairs
    -    testTokenize("//\xed\xa0\xad\xed\xb2\xa9\n", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xc2\x84", []Token.Id{});
    +    testTokenize("//\xc2\x85", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xc2\x86", []Token.Id{});
    +    testTokenize("//\xe2\x80\xa7", []Token.Id{});
    +    testTokenize("//\xe2\x80\xa8", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xe2\x80\xa9", []Token.Id{Token.Id.Invalid});
    +    testTokenize("//\xe2\x80\xaa", []Token.Id{});
     }
     
     fn testTokenize(source: []const u8, expected_tokens: []const Token.Id) {
    -    testTokenizeWithEol(source, expected_tokens, true);
    -}
    -fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, expected_eol_at_eof: bool) {
    -    var tokenizer = Tokenizer.init(source);
    +    // (test authors, just make this bigger if you need it)
    +    var padded_source: [0x100]u8 = undefined;
    +    std.mem.copy(u8, padded_source[0..source.len], source);
    +    padded_source[source.len + 0] = '\n';
    +    padded_source[source.len + 1] = '\n';
    +    padded_source[source.len + 2] = '\n';
    +
    +    var tokenizer = Tokenizer.init(padded_source[0..source.len + 3]);
         for (expected_tokens) |expected_token_id| {
             const token = tokenizer.next();
             std.debug.assert(@TagType(Token.Id)(token.id) == @TagType(Token.Id)(expected_token_id));
    @@ -718,5 +646,5 @@ fn testTokenizeWithEol(source: []const u8, expected_tokens: []const Token.Id, ex
                 else => {},
             }
         }
    -    std.debug.assert(tokenizer.next().id == if (expected_eol_at_eof) Token.Id.Eof else Token.Id.NoEolAtEof);
    +    std.debug.assert(tokenizer.next().id == Token.Id.Eof);
     }
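
With the NoEolAtEof state gone, driving the tokenizer is a plain loop over next() until Eof. A hedged usage sketch, assuming the caller supplies the mandatory "\n\n\n" tail (here it is simply baked into the string literal):

    // Illustrative only: the literal carries the required "\n\n\n" padding.
    var tokenizer = Tokenizer.init("const x = 1;\n\n\n");
    while (true) {
        const token = tokenizer.next();
        if (token.id == Token.Id.Eof) break;
        std.debug.warn("{}\n", @tagName(token.id));
    }
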
    diff --git a/std/index.zig b/std/index.zig
    index 07da469b5e..a9a0038e60 100644
    --- a/std/index.zig
    +++ b/std/index.zig
    @@ -25,6 +25,7 @@ pub const net = @import("net.zig");
     pub const os = @import("os/index.zig");
     pub const rand = @import("rand.zig");
     pub const sort = @import("sort.zig");
    +pub const unicode = @import("unicode.zig");
     
     test "std" {
         // run tests from these
    @@ -53,4 +54,5 @@ test "std" {
         _ = @import("os/index.zig");
         _ = @import("rand.zig");
         _ = @import("sort.zig");
    +    _ = @import("unicode.zig");
     }
    diff --git a/std/io.zig b/std/io.zig
    index cbf2e0c216..44e5634ae0 100644
    --- a/std/io.zig
    +++ b/std/io.zig
    @@ -500,11 +500,16 @@ pub fn writeFile(path: []const u8, data: []const u8, allocator: ?&mem.Allocator)
     
     /// On success, caller owns returned buffer.
     pub fn readFileAlloc(path: []const u8, allocator: &mem.Allocator) -> %[]u8 {
    +    return readFileAllocExtra(path, allocator, 0);
    +}
    +/// On success, caller owns returned buffer.
    +/// Allocates extra_len extra bytes at the end of the file buffer, which are uninitialized.
    +pub fn readFileAllocExtra(path: []const u8, allocator: &mem.Allocator, extra_len: usize) -> %[]u8 {
         var file = %return File.openRead(path, allocator);
         defer file.close();
     
         const size = %return file.getEndPos();
    -    const buf = %return allocator.alloc(u8, size);
    +    const buf = %return allocator.alloc(u8, size + extra_len);
         %defer allocator.free(buf);
     
         var adapter = FileInStream.init(&file);
    diff --git a/std/unicode.zig b/std/unicode.zig
    new file mode 100644
    index 0000000000..6c06eeb73a
    --- /dev/null
    +++ b/std/unicode.zig
    @@ -0,0 +1,169 @@
    +const std = @import("./index.zig");
    +
    +error Utf8InvalidStartByte;
    +
    +/// Given the first byte of a UTF-8 codepoint,
    +/// returns a number 1-4 indicating the total length of the codepoint in bytes.
    +/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
    +pub fn utf8ByteSequenceLength(first_byte: u8) -> %u3 {
    +    if (first_byte < 0b10000000) return u3(1);
    +    if (first_byte & 0b11100000 == 0b11000000) return u3(2);
    +    if (first_byte & 0b11110000 == 0b11100000) return u3(3);
    +    if (first_byte & 0b11111000 == 0b11110000) return u3(4);
    +    return error.Utf8InvalidStartByte;
    +}
    +
    +error Utf8OverlongEncoding;
    +error Utf8ExpectedContinuation;
    +error Utf8EncodesSurrogateHalf;
    +error Utf8CodepointTooLarge;
    +
    +/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
    +/// bytes.len must be equal to %%utf8ByteSequenceLength(bytes[0]).
    +/// If you already know the length at comptime, you can call one of
     +/// utf8Decode2, utf8Decode3, or utf8Decode4 directly instead of this function.
    +pub fn utf8Decode(bytes: []const u8) -> %u32 {
    +    return switch (bytes.len) {
    +        1 => u32(bytes[0]),
    +        2 => utf8Decode2(bytes),
    +        3 => utf8Decode3(bytes),
    +        4 => utf8Decode4(bytes),
    +        else => unreachable,
    +    };
    +}
    +pub fn utf8Decode2(bytes: []const u8) -> %u32 {
    +    std.debug.assert(bytes.len == 2);
    +    std.debug.assert(bytes[0] & 0b11100000 == 0b11000000);
    +    var value: u32 = bytes[0] & 0b00011111;
    +
    +    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    +    value <<= 6;
    +    value |= bytes[1] & 0b00111111;
    +
    +    if (value < 0x80) return error.Utf8OverlongEncoding;
    +
    +    return value;
    +}
    +pub fn utf8Decode3(bytes: []const u8) -> %u32 {
    +    std.debug.assert(bytes.len == 3);
    +    std.debug.assert(bytes[0] & 0b11110000 == 0b11100000);
    +    var value: u32 = bytes[0] & 0b00001111;
    +
    +    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    +    value <<= 6;
    +    value |= bytes[1] & 0b00111111;
    +
    +    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    +    value <<= 6;
    +    value |= bytes[2] & 0b00111111;
    +
    +    if (value < 0x800) return error.Utf8OverlongEncoding;
    +    if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
    +
    +    return value;
    +}
    +pub fn utf8Decode4(bytes: []const u8) -> %u32 {
    +    std.debug.assert(bytes.len == 4);
    +    std.debug.assert(bytes[0] & 0b11111000 == 0b11110000);
    +    var value: u32 = bytes[0] & 0b00000111;
    +
    +    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    +    value <<= 6;
    +    value |= bytes[1] & 0b00111111;
    +
    +    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    +    value <<= 6;
    +    value |= bytes[2] & 0b00111111;
    +
    +    if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
    +    value <<= 6;
    +    value |= bytes[3] & 0b00111111;
    +
    +    if (value < 0x10000) return error.Utf8OverlongEncoding;
    +    if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;
    +
    +    return value;
    +}
    +
    +error UnexpectedEof;
    +test "valid utf8" {
    +    testValid("\x00", 0x0);
    +    testValid("\x20", 0x20);
    +    testValid("\x7f", 0x7f);
    +    testValid("\xc2\x80", 0x80);
    +    testValid("\xdf\xbf", 0x7ff);
    +    testValid("\xe0\xa0\x80", 0x800);
    +    testValid("\xe1\x80\x80", 0x1000);
    +    testValid("\xef\xbf\xbf", 0xffff);
    +    testValid("\xf0\x90\x80\x80", 0x10000);
    +    testValid("\xf1\x80\x80\x80", 0x40000);
    +    testValid("\xf3\xbf\xbf\xbf", 0xfffff);
    +    testValid("\xf4\x8f\xbf\xbf", 0x10ffff);
    +}
    +
    +test "invalid utf8 continuation bytes" {
    +    // unexpected continuation
    +    testError("\x80", error.Utf8InvalidStartByte);
    +    testError("\xbf", error.Utf8InvalidStartByte);
    +    // too many leading 1's
    +    testError("\xf8", error.Utf8InvalidStartByte);
    +    testError("\xff", error.Utf8InvalidStartByte);
    +    // expected continuation for 2 byte sequences
    +    testError("\xc2", error.UnexpectedEof);
    +    testError("\xc2\x00", error.Utf8ExpectedContinuation);
    +    testError("\xc2\xc0", error.Utf8ExpectedContinuation);
    +    // expected continuation for 3 byte sequences
    +    testError("\xe0", error.UnexpectedEof);
    +    testError("\xe0\x00", error.UnexpectedEof);
    +    testError("\xe0\xc0", error.UnexpectedEof);
    +    testError("\xe0\xa0", error.UnexpectedEof);
    +    testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
    +    testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
    +    // expected continuation for 4 byte sequences
    +    testError("\xf0", error.UnexpectedEof);
    +    testError("\xf0\x00", error.UnexpectedEof);
    +    testError("\xf0\xc0", error.UnexpectedEof);
    +    testError("\xf0\x90\x00", error.UnexpectedEof);
    +    testError("\xf0\x90\xc0", error.UnexpectedEof);
    +    testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
    +    testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
    +}
    +
    +test "overlong utf8 codepoint" {
    +    testError("\xc0\x80", error.Utf8OverlongEncoding);
    +    testError("\xc1\xbf", error.Utf8OverlongEncoding);
    +    testError("\xe0\x80\x80", error.Utf8OverlongEncoding);
    +    testError("\xe0\x9f\xbf", error.Utf8OverlongEncoding);
    +    testError("\xf0\x80\x80\x80", error.Utf8OverlongEncoding);
    +    testError("\xf0\x8f\xbf\xbf", error.Utf8OverlongEncoding);
    +}
    +
    +test "misc invalid utf8" {
    +    // codepoint out of bounds
    +    testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
    +    testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
    +    // surrogate halves
    +    testValid("\xed\x9f\xbf", 0xd7ff);
    +    testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
    +    testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
    +    testValid("\xee\x80\x80", 0xe000);
    +}
    +
    +fn testError(bytes: []const u8, expected_err: error) {
    +    if (testDecode(bytes)) |_| {
    +        unreachable;
    +    } else |err| {
    +        std.debug.assert(err == expected_err);
    +    }
    +}
    +
    +fn testValid(bytes: []const u8, expected_codepoint: u32) {
    +    std.debug.assert(%%testDecode(bytes) == expected_codepoint);
    +}
    +
    +fn testDecode(bytes: []const u8) -> %u32 {
    +    const length = %return utf8ByteSequenceLength(bytes[0]);
    +    if (bytes.len < length) return error.UnexpectedEof;
    +    std.debug.assert(bytes.len == length);
    +    return utf8Decode(bytes);
    +}
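
Taken together, the new entry points compose into a straightforward validation loop. A hedged sketch (validateUtf8 is a hypothetical helper, not part of this commit; it reuses the UnexpectedEof error declared in the tests above):

    // Hypothetical helper: walks a slice, erroring on any malformed sequence.
    fn validateUtf8(s: []const u8) -> %void {
        var i: usize = 0;
        while (i < s.len) {
            const length = %return utf8ByteSequenceLength(s[i]);
            if (i + length > s.len) return error.UnexpectedEof;
            _ = %return utf8Decode(s[i..i + length]);
            i += length;
        }
    }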