Merge pull request #20885 from ziglang/simplify-tokenizer

std.zig.tokenizer: simplification and spec conformance
Andrew Kelley, 2024-07-31 19:52:34 -07:00, commit eb1a199dff
19 changed files with 433 additions and 544 deletions


@ -677,10 +677,10 @@ fn montReduce(x: i32) i16 {
// Note gcd(2¹⁶, q) = 1 as q is prime. Write q' := 62209 = q⁻¹ mod R.
// First we compute
//
// m := ((x mod R) q') mod R
//    = x q' mod R
//    = int16(x q')
//    = int16(int32(x) * int32(q'))
//
// Note that x q' might be as big as 2³² and could overflow the int32
// multiplication in the last line. However for any int32s a and b,
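
(A minimal sketch of the reduction this comment derives, for readers following along. It assumes q = 3329 and R = 2¹⁶, consistent with q' = 62209 above; the names montReduceSketch, q, and q_inv are illustrative, not the file's own.)

const std = @import("std");

const q: i32 = 3329; // assumed ML-KEM modulus, consistent with q' = 62209
const q_inv: i32 = 62209; // q' = q⁻¹ mod R, with R = 2¹⁶

// Returns y ≡ x·R⁻¹ (mod q) for |x| ≤ q·2¹⁵.
fn montReduceSketch(x: i32) i16 {
    // m := int16(int32(x) * int32(q')): the wrapping multiply keeps the
    // low 32 bits and @truncate keeps the low 16, exactly as derived above.
    const m: i16 = @truncate(x *% q_inv);
    // x - m·q is divisible by R, so the arithmetic shift is an exact
    // division by 2¹⁶ and the quotient fits in an i16.
    return @intCast((x - @as(i32, m) * q) >> 16);
}

test montReduceSketch {
    // 17·2¹⁶ reduces back to 17 modulo q.
    try std.testing.expectEqual(@as(i16, 17), @mod(montReduceSketch(17 << 16), @as(i16, 3329)));
}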


@ -203,8 +203,7 @@ pub const symtab_command = extern struct {
/// local symbols (static and debugging symbols) - grouped by module
/// defined external symbols - grouped by module (sorted by name if not lib)
/// undefined external symbols (sorted by name if MH_BINDATLOAD is not set,
/// and in order the were seen by the static linker if MH_BINDATLOAD is set)
/// In this load command there are offsets and counts to each of the three groups
/// of symbols.
///
@ -219,9 +218,9 @@ pub const symtab_command = extern struct {
/// shared library. For executable and object modules, which are files
/// containing only one module, the information that would be in these three
/// tables is determined as follows:
/// table of contents - the defined external symbols are sorted by name
/// module table - the file contains only one module so everything in the file
/// is part of the module.
/// reference symbol table - is the defined and undefined external symbols
///
/// For dynamically linked shared library files this load command also contains


@ -95,16 +95,13 @@ pub inline fn utf8EncodeComptime(comptime c: u21) [
const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
/// If you already know the length at comptime, you can call one of
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
return switch (bytes.len) {
1 => @as(u21, bytes[0]),
2 => utf8Decode2(bytes),
3 => utf8Decode3(bytes),
4 => utf8Decode4(bytes),
1 => bytes[0],
2 => utf8Decode2(bytes[0..2].*),
3 => utf8Decode3(bytes[0..3].*),
4 => utf8Decode4(bytes[0..4].*),
else => unreachable,
};
}
@ -113,8 +110,7 @@ const Utf8Decode2Error = error{
Utf8ExpectedContinuation,
Utf8OverlongEncoding,
};
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
assert(bytes.len == 2);
pub fn utf8Decode2(bytes: [2]u8) Utf8Decode2Error!u21 {
assert(bytes[0] & 0b11100000 == 0b11000000);
var value: u21 = bytes[0] & 0b00011111;
@ -130,7 +126,7 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
const Utf8Decode3Error = Utf8Decode3AllowSurrogateHalfError || error{
Utf8EncodesSurrogateHalf,
};
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
pub fn utf8Decode3(bytes: [3]u8) Utf8Decode3Error!u21 {
const value = try utf8Decode3AllowSurrogateHalf(bytes);
if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
@ -142,8 +138,7 @@ const Utf8Decode3AllowSurrogateHalfError = error{
Utf8ExpectedContinuation,
Utf8OverlongEncoding,
};
pub fn utf8Decode3AllowSurrogateHalf(bytes: []const u8) Utf8Decode3AllowSurrogateHalfError!u21 {
assert(bytes.len == 3);
pub fn utf8Decode3AllowSurrogateHalf(bytes: [3]u8) Utf8Decode3AllowSurrogateHalfError!u21 {
assert(bytes[0] & 0b11110000 == 0b11100000);
var value: u21 = bytes[0] & 0b00001111;
@ -165,8 +160,7 @@ const Utf8Decode4Error = error{
Utf8OverlongEncoding,
Utf8CodepointTooLarge,
};
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
assert(bytes.len == 4);
pub fn utf8Decode4(bytes: [4]u8) Utf8Decode4Error!u21 {
assert(bytes[0] & 0b11111000 == 0b11110000);
var value: u21 = bytes[0] & 0b00000111;
@ -1637,12 +1631,13 @@ pub fn wtf8Encode(c: u21, out: []u8) error{CodepointTooLarge}!u3 {
const Wtf8DecodeError = Utf8Decode2Error || Utf8Decode3AllowSurrogateHalfError || Utf8Decode4Error;
/// Deprecated. This function has an awkward API that is too easy to use incorrectly.
pub fn wtf8Decode(bytes: []const u8) Wtf8DecodeError!u21 {
return switch (bytes.len) {
1 => @as(u21, bytes[0]),
2 => utf8Decode2(bytes),
3 => utf8Decode3AllowSurrogateHalf(bytes),
4 => utf8Decode4(bytes),
1 => bytes[0],
2 => utf8Decode2(bytes[0..2].*),
3 => utf8Decode3AllowSurrogateHalf(bytes[0..3].*),
4 => utf8Decode4(bytes[0..4].*),
else => unreachable,
};
}
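
(Usage sketch for the array-parameter API above: the length requirement moves from a runtime assert into the type, and a caller produces the array by dereferencing a comptime-length sub-slice. The test name and input are illustrative.)

const std = @import("std");

test "decode a two-byte sequence with the array-based API" {
    const bytes = "é"; // encodes as the two bytes 0xC3 0xA9
    // bytes[0..2].* copies the bytes into a [2]u8, matching the new
    // utf8Decode2 parameter type with no length assert needed.
    const cp = try std.unicode.utf8Decode2(bytes[0..2].*);
    try std.testing.expectEqual(@as(u21, 0xE9), cp);
}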


@ -69,7 +69,7 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
const token = tokenizer.next();
try tokens.append(gpa, .{
.tag = token.tag,
.start = @as(u32, @intCast(token.loc.start)),
.start = @intCast(token.loc.start),
});
if (token.tag == .eof) break;
}
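
(For context, this loop is the tokenize step inside std.zig.Ast.parse; the bare @intCast now infers u32 from the destination field. A minimal end-to-end call of the public API, with the source string illustrative:)

const std = @import("std");

test "parse a buffer into an Ast" {
    const gpa = std.testing.allocator;
    var tree = try std.zig.Ast.parse(gpa, "const x = 1;", .zig);
    defer tree.deinit(gpa);
    try std.testing.expectEqual(@as(usize, 0), tree.errors.len);
}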


@ -11351,6 +11351,9 @@ fn failWithStrLitError(astgen: *AstGen, err: std.zig.string_literal.Error, token
.{raw_string[bad_index]},
);
},
.empty_char_literal => {
return astgen.failOff(token, offset, "empty character literal", .{});
},
}
}
@ -13820,21 +13823,9 @@ fn lowerAstErrors(astgen: *AstGen) !void {
var msg: std.ArrayListUnmanaged(u8) = .{};
defer msg.deinit(gpa);
const token_starts = tree.tokens.items(.start);
const token_tags = tree.tokens.items(.tag);
var notes: std.ArrayListUnmanaged(u32) = .{};
defer notes.deinit(gpa);
const tok = parse_err.token + @intFromBool(parse_err.token_is_prev);
if (token_tags[tok] == .invalid) {
const bad_off: u32 = @intCast(tree.tokenSlice(tok).len);
const byte_abs = token_starts[tok] + bad_off;
try notes.append(gpa, try astgen.errNoteTokOff(tok, bad_off, "invalid byte: '{'}'", .{
std.zig.fmtEscapes(tree.source[byte_abs..][0..1]),
}));
}
for (tree.errors[1..]) |note| {
if (!note.is_note) break;


@ -6061,7 +6061,6 @@ test "recovery: invalid container members" {
, &[_]Error{
.expected_expr,
.expected_comma_after_field,
.expected_type_expr,
.expected_semi_after_stmt,
});
}


@ -1,6 +1,5 @@
const std = @import("../std.zig");
const assert = std.debug.assert;
const utf8Decode = std.unicode.utf8Decode;
const utf8Encode = std.unicode.utf8Encode;
pub const ParseError = error{
@ -37,12 +36,16 @@ pub const Error = union(enum) {
expected_single_quote: usize,
/// The character at this index cannot be represented without an escape sequence.
invalid_character: usize,
/// `''`. Not returned for string literals.
empty_char_literal,
};
/// Only validates escape sequence characters.
/// Slice must be valid utf8 starting and ending with "'" and exactly one codepoint in between.
/// Asserts the slice starts and ends with single-quotes.
/// Returns an error if there is not exactly one UTF-8 codepoint in between.
pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
assert(slice.len >= 3 and slice[0] == '\'' and slice[slice.len - 1] == '\'');
if (slice.len < 3) return .{ .failure = .empty_char_literal };
assert(slice[0] == '\'');
assert(slice[slice.len - 1] == '\'');
switch (slice[1]) {
'\\' => {
@ -55,7 +58,18 @@ pub fn parseCharLiteral(slice: []const u8) ParsedCharLiteral {
},
0 => return .{ .failure = .{ .invalid_character = 1 } },
else => {
const codepoint = utf8Decode(slice[1 .. slice.len - 1]) catch unreachable;
const inner = slice[1 .. slice.len - 1];
const n = std.unicode.utf8ByteSequenceLength(inner[0]) catch return .{
.failure = .{ .invalid_unicode_codepoint = 1 },
};
if (inner.len > n) return .{ .failure = .{ .expected_single_quote = 1 + n } };
const codepoint = switch (n) {
1 => inner[0],
2 => std.unicode.utf8Decode2(inner[0..2].*),
3 => std.unicode.utf8Decode3(inner[0..3].*),
4 => std.unicode.utf8Decode4(inner[0..4].*),
else => unreachable,
} catch return .{ .failure = .{ .invalid_unicode_codepoint = 1 } };
return .{ .success = codepoint };
},
}
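
(With this rewrite, parseCharLiteral no longer asserts a minimum length or relies on pre-validated UTF-8: '' yields a regular failure value, and a bad sequence is reported instead of hitting catch unreachable. A usage sketch through the public std.zig.string_literal path:)

const std = @import("std");

test "parseCharLiteral reports empty and ordinary literals" {
    const empty = std.zig.string_literal.parseCharLiteral("''");
    try std.testing.expect(empty == .failure);
    try std.testing.expect(empty.failure == .empty_char_literal);

    const ok = std.zig.string_literal.parseCharLiteral("'a'");
    try std.testing.expectEqual(@as(u21, 'a'), ok.success);
}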


@ -303,16 +303,16 @@ test "detect" {
\\<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
\\<plist version="1.0">
\\<dict>
\\ <key>ProductBuildVersion</key>
\\ <string>7W98</string>
\\ <key>ProductCopyright</key>
\\ <string>Apple Computer, Inc. 1983-2004</string>
\\ <key>ProductName</key>
\\ <string>Mac OS X</string>
\\ <key>ProductUserVisibleVersion</key>
\\ <string>10.3.9</string>
\\ <key>ProductVersion</key>
\\ <string>10.3.9</string>
\\</dict>
\\</plist>
,
@ -323,18 +323,18 @@ test "detect" {
\\<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
\\<plist version="1.0">
\\<dict>
\\ <key>ProductBuildVersion</key>
\\ <string>19G68</string>
\\ <key>ProductCopyright</key>
\\ <string>1983-2020 Apple Inc.</string>
\\ <key>ProductName</key>
\\ <string>Mac OS X</string>
\\ <key>ProductUserVisibleVersion</key>
\\ <string>10.15.6</string>
\\ <key>ProductVersion</key>
\\ <string>10.15.6</string>
\\ <key>iOSSupportVersion</key>
\\ <string>13.6</string>
\\</dict>
\\</plist>
,
@ -345,18 +345,18 @@ test "detect" {
\\<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
\\<plist version="1.0">
\\<dict>
\\ <key>ProductBuildVersion</key>
\\ <string>20A2408</string>
\\ <key>ProductCopyright</key>
\\ <string>1983-2020 Apple Inc.</string>
\\ <key>ProductName</key>
\\ <string>macOS</string>
\\ <key>ProductUserVisibleVersion</key>
\\ <string>11.0</string>
\\ <key>ProductVersion</key>
\\ <string>11.0</string>
\\ <key>iOSSupportVersion</key>
\\ <string>14.2</string>
\\</dict>
\\</plist>
,
@ -367,18 +367,18 @@ test "detect" {
\\<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
\\<plist version="1.0">
\\<dict>
\\ <key>ProductBuildVersion</key>
\\ <string>20C63</string>
\\ <key>ProductCopyright</key>
\\ <string>1983-2020 Apple Inc.</string>
\\ <key>ProductName</key>
\\ <string>macOS</string>
\\ <key>ProductUserVisibleVersion</key>
\\ <string>11.1</string>
\\ <key>ProductVersion</key>
\\ <string>11.1</string>
\\ <key>iOSSupportVersion</key>
\\ <string>14.3</string>
\\</dict>
\\</plist>
,


@ -109,12 +109,12 @@ const RiscvCpuinfoParser = CpuinfoParser(RiscvCpuinfoImpl);
test "cpuinfo: RISC-V" {
try testParser(RiscvCpuinfoParser, .riscv64, &Target.riscv.cpu.sifive_u74,
\\processor : 0
\\hart : 1
\\isa : rv64imafdc
\\mmu : sv39
\\isa-ext :
\\uarch : sifive,u74-mc
);
}
@ -177,16 +177,16 @@ const PowerpcCpuinfoParser = CpuinfoParser(PowerpcCpuinfoImpl);
test "cpuinfo: PowerPC" {
try testParser(PowerpcCpuinfoParser, .powerpc, &Target.powerpc.cpu.@"970",
\\processor : 0
\\cpu : PPC970MP, altivec supported
\\clock : 1250.000000MHz
\\revision : 1.1 (pvr 0044 0101)
);
try testParser(PowerpcCpuinfoParser, .powerpc64le, &Target.powerpc.cpu.pwr8,
\\processor : 0
\\cpu : POWER8 (raw), altivec supported
\\clock : 2926.000000MHz
\\revision : 2.0 (pvr 004d 0200)
);
}
@ -304,25 +304,25 @@ test "cpuinfo: ARM" {
\\CPU revision : 7
);
try testParser(ArmCpuinfoParser, .arm, &Target.arm.cpu.cortex_a7,
\\processor : 0
\\model name : ARMv7 Processor rev 3 (v7l)
\\BogoMIPS : 18.00
\\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae
\\CPU implementer : 0x41
\\CPU architecture: 7
\\CPU variant : 0x0
\\CPU part : 0xc07
\\CPU revision : 3
\\
\\processor : 4
\\model name : ARMv7 Processor rev 3 (v7l)
\\BogoMIPS : 90.00
\\Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae
\\CPU implementer : 0x41
\\CPU architecture: 7
\\CPU variant : 0x2
\\CPU part : 0xc0f
\\CPU revision : 3
);
try testParser(ArmCpuinfoParser, .aarch64, &Target.aarch64.cpu.cortex_a72,
\\processor : 0


@ -320,7 +320,7 @@ pub const Token = struct {
pub fn symbol(tag: Tag) []const u8 {
return tag.lexeme() orelse switch (tag) {
.invalid => "invalid bytes",
.invalid => "invalid token",
.identifier => "an identifier",
.string_literal, .multiline_string_literal_line => "a string literal",
.char_literal => "a character literal",
@ -338,22 +338,22 @@ pub const Tokenizer = struct {
buffer: [:0]const u8,
index: usize,
/// For debugging purposes
/// For debugging purposes.
pub fn dump(self: *Tokenizer, token: *const Token) void {
std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
}
pub fn init(buffer: [:0]const u8) Tokenizer {
// Skip the UTF-8 BOM if present
const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
return Tokenizer{
// Skip the UTF-8 BOM if present.
return .{
.buffer = buffer,
.index = src_start,
.index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
};
}
const State = enum {
start,
expect_newline,
identifier,
builtin,
string_literal,
@ -361,10 +361,6 @@ pub const Tokenizer = struct {
multiline_string_literal_line,
char_literal,
char_literal_backslash,
char_literal_hex_escape,
char_literal_unicode_escape_saw_u,
char_literal_unicode_escape,
char_literal_end,
backslash,
equal,
bang,
@ -400,30 +396,33 @@ pub const Tokenizer = struct {
period_2,
period_asterisk,
saw_at_sign,
invalid,
};
/// After this returns invalid, it will reset on the next newline, returning tokens starting from there.
/// An eof token will always be returned at the end.
pub fn next(self: *Tokenizer) Token {
var state: State = .start;
var result = Token{
.tag = .eof,
var result: Token = .{
.tag = undefined,
.loc = .{
.start = self.index,
.end = undefined,
},
};
var seen_escape_digits: usize = undefined;
while (true) : (self.index += 1) {
const c = self.buffer[self.index];
switch (state) {
.start => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
break;
if (self.index == self.buffer.len) return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
state = .invalid;
},
' ', '\n', '\t', '\r' => {
result.loc.start = self.index + 1;
@ -434,6 +433,7 @@ pub const Tokenizer = struct {
},
'\'' => {
state = .char_literal;
result.tag = .char_literal;
},
'a'...'z', 'A'...'Z', '_' => {
state = .identifier;
@ -545,14 +545,44 @@ pub const Tokenizer = struct {
result.tag = .number_literal;
},
else => {
result.tag = .invalid;
result.loc.end = self.index;
self.index += std.unicode.utf8ByteSequenceLength(c) catch 1;
return result;
state = .invalid;
},
},
.expect_newline => switch (c) {
0 => {
if (self.index == self.buffer.len) {
result.tag = .invalid;
break;
}
state = .invalid;
},
'\n' => {
result.loc.start = self.index + 1;
state = .start;
},
else => {
state = .invalid;
},
},
.invalid => switch (c) {
0 => if (self.index == self.buffer.len) {
result.tag = .invalid;
break;
},
'\n' => {
result.tag = .invalid;
break;
},
else => continue,
},
.saw_at_sign => switch (c) {
0, '\n' => {
result.tag = .invalid;
break;
},
'"' => {
result.tag = .identifier;
state = .string_literal;
@ -562,8 +592,7 @@ pub const Tokenizer = struct {
result.tag = .builtin;
},
else => {
result.tag = .invalid;
break;
state = .invalid;
},
},
@ -698,7 +727,7 @@ pub const Tokenizer = struct {
},
.identifier => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
else => {
if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
result.tag = tag;
@ -707,26 +736,37 @@ pub const Tokenizer = struct {
},
},
.builtin => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
else => break,
},
.backslash => switch (c) {
'\\' => {
state = .multiline_string_literal_line;
},
else => {
0 => {
result.tag = .invalid;
break;
},
'\\' => {
state = .multiline_string_literal_line;
},
'\n' => {
result.tag = .invalid;
break;
},
else => {
state = .invalid;
},
},
.string_literal => switch (c) {
0, '\n' => {
result.tag = .invalid;
result.loc.end = self.index;
0 => {
if (self.index != self.buffer.len) {
self.index += 1;
state = .invalid;
continue;
}
return result;
result.tag = .invalid;
break;
},
'\n' => {
result.tag = .invalid;
break;
},
'\\' => {
state = .string_literal_backslash;
@ -735,150 +775,74 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
0x01...0x09, 0x0b...0x1f, 0x7f => {
state = .invalid;
},
else => continue,
},
.string_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
break;
},
else => {
state = .string_literal;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal => switch (c) {
0, '\n', '\'' => {
result.tag = .invalid;
result.loc.end = self.index;
0 => {
if (self.index != self.buffer.len) {
self.index += 1;
state = .invalid;
continue;
}
return result;
result.tag = .invalid;
break;
},
'\n' => {
result.tag = .invalid;
break;
},
'\\' => {
state = .char_literal_backslash;
},
else => {
state = .char_literal_end;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal_backslash => switch (c) {
0, '\n' => {
result.tag = .invalid;
result.loc.end = self.index;
if (self.index != self.buffer.len) {
self.index += 1;
}
return result;
},
'x' => {
state = .char_literal_hex_escape;
seen_escape_digits = 0;
},
'u' => {
state = .char_literal_unicode_escape_saw_u;
},
else => {
state = .char_literal_end;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.char_literal_hex_escape => switch (c) {
'0'...'9', 'a'...'f', 'A'...'F' => {
seen_escape_digits += 1;
if (seen_escape_digits == 2) {
state = .char_literal_end;
}
},
else => {
result.tag = .invalid;
break;
},
},
.char_literal_unicode_escape_saw_u => switch (c) {
'{' => {
state = .char_literal_unicode_escape;
},
else => {
result.tag = .invalid;
break;
},
},
.char_literal_unicode_escape => switch (c) {
'0'...'9', 'a'...'f', 'A'...'F' => {},
'}' => {
state = .char_literal_end; // too many/few digits handled later
},
else => {
result.tag = .invalid;
break;
},
},
.char_literal_end => switch (c) {
'\'' => {
result.tag = .char_literal;
self.index += 1;
break;
},
else => {
0x01...0x09, 0x0b...0x1f, 0x7f => {
state = .invalid;
},
else => continue,
},
.char_literal_backslash => switch (c) {
0 => {
if (self.index != self.buffer.len) {
state = .invalid;
continue;
}
result.tag = .invalid;
break;
},
'\n' => {
result.tag = .invalid;
break;
},
0x01...0x09, 0x0b...0x1f, 0x7f => {
state = .invalid;
},
else => {
state = .char_literal;
},
},
.multiline_string_literal_line => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
state = .invalid;
continue;
}
break;
},
@ -886,17 +850,18 @@ pub const Tokenizer = struct {
self.index += 1;
break;
},
'\t' => {},
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
'\r' => {
if (self.buffer[self.index + 1] == '\n') {
self.index += 2;
break;
} else {
state = .invalid;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
state = .invalid;
},
else => continue,
},
.bang => switch (c) {
@ -1113,12 +1078,16 @@ pub const Tokenizer = struct {
.line_comment_start => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
state = .invalid;
continue;
}
break;
return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
},
'/' => {
state = .doc_comment_start;
@ -1127,105 +1096,91 @@ pub const Tokenizer = struct {
result.tag = .container_doc_comment;
state = .doc_comment;
},
'\r' => {
state = .expect_newline;
},
'\n' => {
state = .start;
result.loc.start = self.index + 1;
},
'\t' => {
state = .line_comment;
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
state = .invalid;
},
else => {
state = .line_comment;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.doc_comment_start => switch (c) {
0, '\n' => {
result.tag = .doc_comment;
break;
},
'\r' => {
if (self.buffer[self.index + 1] == '\n') {
self.index += 1;
result.tag = .doc_comment;
break;
} else {
state = .invalid;
}
},
'/' => {
state = .line_comment;
},
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
}
result.tag = .doc_comment;
break;
},
'\n' => {
result.tag = .doc_comment;
break;
},
'\t' => {
state = .doc_comment;
result.tag = .doc_comment;
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
state = .invalid;
},
else => {
state = .doc_comment;
result.tag = .doc_comment;
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
},
},
.line_comment => switch (c) {
0 => {
if (self.index != self.buffer.len) {
result.tag = .invalid;
result.loc.end = self.index;
self.index += 1;
return result;
state = .invalid;
continue;
}
break;
return .{
.tag = .eof,
.loc = .{
.start = self.index,
.end = self.index,
},
};
},
'\r' => {
state = .expect_newline;
},
'\n' => {
state = .start;
result.loc.start = self.index + 1;
},
'\t' => {},
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
state = .invalid;
},
else => continue,
},
.doc_comment => switch (c) {
0, '\n' => break,
'\t' => {},
else => {
if (self.invalidCharacterLength()) |len| {
result.tag = .invalid;
result.loc.end = self.index;
self.index += len;
return result;
}
self.index += (std.unicode.utf8ByteSequenceLength(c) catch unreachable) - 1;
0, '\n' => {
break;
},
'\r' => {
if (self.buffer[self.index + 1] == '\n') {
self.index += 1;
break;
} else {
state = .invalid;
}
},
0x01...0x09, 0x0b...0x0c, 0x0e...0x1f, 0x7f => {
state = .invalid;
},
else => continue,
},
.int => switch (c) {
'.' => state = .int_period,
'_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {},
'_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
'e', 'E', 'p', 'P' => state = .int_exponent,
else => break,
},
@ -1249,7 +1204,7 @@ pub const Tokenizer = struct {
},
},
.float => switch (c) {
'_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {},
'_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
'e', 'E', 'p', 'P' => state = .float_exponent,
else => break,
},
@ -1263,57 +1218,9 @@ pub const Tokenizer = struct {
}
}
if (result.tag == .eof) {
result.loc.start = self.index;
}
result.loc.end = self.index;
return result;
}
fn invalidCharacterLength(self: *Tokenizer) ?u3 {
const c0 = self.buffer[self.index];
if (std.ascii.isAscii(c0)) {
if (c0 == '\r') {
if (self.index + 1 < self.buffer.len and self.buffer[self.index + 1] == '\n') {
// Carriage returns are *only* allowed just before a linefeed as part of a CRLF pair, otherwise
// they constitute an illegal byte!
return null;
} else {
return 1;
}
} else if (std.ascii.isControl(c0)) {
// ascii control codes are never allowed
// (note that \n was checked before we got here)
return 1;
}
// looks fine to me.
return null;
} else {
// check utf8-encoded character.
const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
if (self.index + length > self.buffer.len) {
return @as(u3, @intCast(self.buffer.len - self.index));
}
const bytes = self.buffer[self.index .. self.index + length];
switch (length) {
2 => {
const value = std.unicode.utf8Decode2(bytes) catch return length;
if (value == 0x85) return length; // U+0085 (NEL)
},
3 => {
const value = std.unicode.utf8Decode3(bytes) catch return length;
if (value == 0x2028) return length; // U+2028 (LS)
if (value == 0x2029) return length; // U+2029 (PS)
},
4 => {
_ = std.unicode.utf8Decode4(bytes) catch return length;
},
else => unreachable,
}
return null;
}
}
};
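
(A sketch of the new recovery rule documented on next(): a bad byte puts the tokenizer into the .invalid state, the rest of the line collapses into one invalid token, and normal tokens resume after the newline. It uses the file's own testTokenize helper, defined near the end of this diff; the test name and input are illustrative.)

test "invalid line collapses to a single token" {
    // '\x01' enters the invalid state; the state resets at '\n', so the
    // `const` on the next line tokenizes normally, and the helper then
    // checks the trailing eof token.
    try testTokenize("\x01 bad\nconst", &.{ .invalid, .keyword_const });
}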
test "keywords" {
@ -1355,7 +1262,7 @@ test "code point literal with hex escape" {
, &.{.char_literal});
try testTokenize(
\\'\x1'
, &.{ .invalid, .invalid });
, &.{.char_literal});
}
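
(This test change shows the new division of labor: '\x1' is now a single char_literal token, and the malformed hex escape is rejected downstream by std.zig.string_literal.parseCharLiteral rather than by the tokenizer. A sketch of where the error surfaces now:)

test "short hex escape is a parse error, not a tokenize error" {
    // The tokenizer accepts the literal; validation happens downstream.
    const res = std.zig.string_literal.parseCharLiteral("'\\x1'");
    try std.testing.expect(res == .failure);
}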
test "newline in char literal" {
@ -1396,40 +1303,30 @@ test "code point literal with unicode escapes" {
// Invalid unicode escapes
try testTokenize(
\\'\u'
, &.{ .invalid, .invalid });
, &.{.char_literal});
try testTokenize(
\\'\u{{'
, &.{ .invalid, .l_brace, .invalid });
, &.{.char_literal});
try testTokenize(
\\'\u{}'
, &.{.char_literal});
try testTokenize(
\\'\u{s}'
, &.{
.invalid,
.identifier,
.r_brace,
.invalid,
});
, &.{.char_literal});
try testTokenize(
\\'\u{2z}'
, &.{
.invalid,
.identifier,
.r_brace,
.invalid,
});
, &.{.char_literal});
try testTokenize(
\\'\u{4a'
, &.{ .invalid, .invalid }); // 4a is valid
, &.{.char_literal});
// Test old-style unicode literals
try testTokenize(
\\'\u0333'
, &.{ .invalid, .number_literal, .invalid });
, &.{.char_literal});
try testTokenize(
\\'\U0333'
, &.{ .invalid, .number_literal, .invalid });
, &.{.char_literal});
}
test "code point literal with unicode code point" {
@ -1465,24 +1362,15 @@ test "invalid token characters" {
try testTokenize("`", &.{.invalid});
try testTokenize("'c", &.{.invalid});
try testTokenize("'", &.{.invalid});
try testTokenize("''", &.{.invalid});
try testTokenize("''", &.{.char_literal});
try testTokenize("'\n'", &.{ .invalid, .invalid });
}
test "invalid literal/comment characters" {
try testTokenize("\"\x00\"", &.{
.invalid,
.invalid, // Incomplete string literal starting after invalid
});
try testTokenize("//\x00", &.{
.invalid,
});
try testTokenize("//\x1f", &.{
.invalid,
});
try testTokenize("//\x7f", &.{
.invalid,
});
try testTokenize("\"\x00\"", &.{.invalid});
try testTokenize("//\x00", &.{.invalid});
try testTokenize("//\x1f", &.{.invalid});
try testTokenize("//\x7f", &.{.invalid});
}
test "utf8" {
@ -1491,46 +1379,24 @@ test "utf8" {
}
test "invalid utf8" {
try testTokenize("//\x80", &.{
.invalid,
});
try testTokenize("//\xbf", &.{
.invalid,
});
try testTokenize("//\xf8", &.{
.invalid,
});
try testTokenize("//\xff", &.{
.invalid,
});
try testTokenize("//\xc2\xc0", &.{
.invalid,
});
try testTokenize("//\xe0", &.{
.invalid,
});
try testTokenize("//\xf0", &.{
.invalid,
});
try testTokenize("//\xf0\x90\x80\xc0", &.{
.invalid,
});
try testTokenize("//\x80", &.{});
try testTokenize("//\xbf", &.{});
try testTokenize("//\xf8", &.{});
try testTokenize("//\xff", &.{});
try testTokenize("//\xc2\xc0", &.{});
try testTokenize("//\xe0", &.{});
try testTokenize("//\xf0", &.{});
try testTokenize("//\xf0\x90\x80\xc0", &.{});
}
test "illegal unicode codepoints" {
// unicode newline characters: U+0085, U+2028, U+2029
try testTokenize("//\xc2\x84", &.{});
try testTokenize("//\xc2\x85", &.{
.invalid,
});
try testTokenize("//\xc2\x85", &.{});
try testTokenize("//\xc2\x86", &.{});
try testTokenize("//\xe2\x80\xa7", &.{});
try testTokenize("//\xe2\x80\xa8", &.{
.invalid,
});
try testTokenize("//\xe2\x80\xa9", &.{
.invalid,
});
try testTokenize("//\xe2\x80\xa8", &.{});
try testTokenize("//\xe2\x80\xa9", &.{});
try testTokenize("//\xe2\x80\xaa", &.{});
}
@ -1549,30 +1415,6 @@ test "string identifier and builtin fns" {
});
}
test "multiline string literal with literal tab" {
try testTokenize(
\\\\foo bar
, &.{
.multiline_string_literal_line,
});
}
test "comments with literal tab" {
try testTokenize(
\\//foo bar
\\//!foo bar
\\///foo bar
\\// foo
\\/// foo
\\/// /foo
, &.{
.container_doc_comment,
.doc_comment,
.doc_comment,
.doc_comment,
});
}
test "pipe and then invalid" {
try testTokenize("||=", &.{
.pipe_pipe,
@ -1892,8 +1734,8 @@ test "multi line string literal with only 1 backslash" {
}
test "invalid builtin identifiers" {
try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
try testTokenize("@0()", &.{ .invalid, .number_literal, .l_paren, .r_paren });
try testTokenize("@()", &.{.invalid});
try testTokenize("@0()", &.{.invalid});
}
test "invalid token with unfinished escape right before eof" {
@ -1921,21 +1763,78 @@ test "saturating operators" {
}
test "null byte before eof" {
try testTokenize("123 \x00 456", &.{ .number_literal, .invalid, .number_literal });
try testTokenize("123 \x00 456", &.{ .number_literal, .invalid });
try testTokenize("//\x00", &.{.invalid});
try testTokenize("\\\\\x00", &.{.invalid});
try testTokenize("\x00", &.{.invalid});
try testTokenize("// NUL\x00\n", &.{.invalid});
try testTokenize("///\x00\n", &.{.invalid});
try testTokenize("///\x00\n", &.{ .doc_comment, .invalid });
try testTokenize("/// NUL\x00\n", &.{ .doc_comment, .invalid });
}
test "invalid tabs and carriage returns" {
// "Inside Line Comments and Documentation Comments, Any TAB is rejected by
// the grammar since it is ambiguous how it should be rendered."
// https://github.com/ziglang/zig-spec/issues/38
try testTokenize("//\t", &.{.invalid});
try testTokenize("// \t", &.{.invalid});
try testTokenize("///\t", &.{.invalid});
try testTokenize("/// \t", &.{.invalid});
try testTokenize("//!\t", &.{.invalid});
try testTokenize("//! \t", &.{.invalid});
// "Inside Line Comments and Documentation Comments, CR directly preceding
// NL is unambiguously part of the newline sequence. It is accepted by the
// grammar and removed by zig fmt, leaving only NL. CR anywhere else is
// rejected by the grammar."
// https://github.com/ziglang/zig-spec/issues/38
try testTokenize("//\r", &.{.invalid});
try testTokenize("// \r", &.{.invalid});
try testTokenize("///\r", &.{.invalid});
try testTokenize("/// \r", &.{.invalid});
try testTokenize("//\r ", &.{.invalid});
try testTokenize("// \r ", &.{.invalid});
try testTokenize("///\r ", &.{.invalid});
try testTokenize("/// \r ", &.{.invalid});
try testTokenize("//\r\n", &.{});
try testTokenize("// \r\n", &.{});
try testTokenize("///\r\n", &.{.doc_comment});
try testTokenize("/// \r\n", &.{.doc_comment});
try testTokenize("//!\r", &.{.invalid});
try testTokenize("//! \r", &.{.invalid});
try testTokenize("//!\r ", &.{.invalid});
try testTokenize("//! \r ", &.{.invalid});
try testTokenize("//!\r\n", &.{.container_doc_comment});
try testTokenize("//! \r\n", &.{.container_doc_comment});
// The control characters TAB and CR are rejected by the grammar inside multi-line string literals,
// except if CR is directly before NL.
// https://github.com/ziglang/zig-spec/issues/38
try testTokenize("\\\\\r", &.{.invalid});
try testTokenize("\\\\\r ", &.{.invalid});
try testTokenize("\\\\ \r", &.{.invalid});
try testTokenize("\\\\\t", &.{.invalid});
try testTokenize("\\\\\t ", &.{.invalid});
try testTokenize("\\\\ \t", &.{.invalid});
try testTokenize("\\\\\r\n", &.{.multiline_string_literal_line});
// "TAB used as whitespace is...accepted by the grammar. CR used as
// whitespace, whether directly preceding NL or stray, is...accepted by the
// grammar."
// https://github.com/ziglang/zig-spec/issues/38
try testTokenize("\tpub\tswitch\t", &.{ .keyword_pub, .keyword_switch });
try testTokenize("\rpub\rswitch\r", &.{ .keyword_pub, .keyword_switch });
}
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
var tokenizer = Tokenizer.init(source);
for (expected_token_tags) |expected_token_tag| {
const token = tokenizer.next();
try std.testing.expectEqual(expected_token_tag, token.tag);
}
// Last token should always be eof, even when the last token was invalid,
// in which case the tokenizer is in an invalid state, which can only be
// recovered by opinionated means outside the scope of this implementation.
const last_token = tokenizer.next();
try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
try std.testing.expectEqual(source.len, last_token.loc.start);


@ -549,6 +549,9 @@ const Parse = struct {
.{raw_string[bad_index]},
);
},
.empty_char_literal => {
try p.appendErrorOff(token, offset, "empty character literal", .{});
},
}
}


@ -237,10 +237,7 @@ test "double quoted string" {
try testing.expectEqualStrings(
\\"here" are some escaped quotes
, arr[1]);
try testing.expectEqualStrings(
\\newlines and tabs
\\are supported
, arr[2]);
try testing.expectEqualStrings("newlines and tabs\nare\tsupported", arr[2]);
try testing.expectEqualStrings(
\\let's have
\\some fun!


@ -0,0 +1,9 @@
export fn entry() u8 {
return '';
}
// error
// backend=stage2
// target=native
//
// :2:12: error: empty character literal


@ -6,5 +6,4 @@ export fn entry() void {
// backend=stage2
// target=native
//
// :2:15: error: expected expression, found 'invalid bytes'
// :2:18: note: invalid byte: '1'
// :2:17: error: invalid escape character: 'U'


@ -6,6 +6,5 @@ export fn entry() void {
// backend=stage2
// target=native
//
// :2:15: error: expected expression, found 'invalid bytes'
// :2:21: note: invalid byte: 'z'
// :2:21: error: expected hex digit or '}', found 'z'


@ -5,5 +5,4 @@ b";
// backend=stage2
// target=native
//
// :1:13: error: expected expression, found 'invalid bytes'
// :1:15: note: invalid byte: '\n'
// :1:13: error: expected expression, found 'invalid token'


@ -38,15 +38,6 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
});
}
{
const case = ctx.obj("isolated carriage return in multiline string literal", b.graph.host);
case.addError("const foo = \\\\\test\r\r rogue carriage return\n;", &[_][]const u8{
":1:13: error: expected expression, found 'invalid bytes'",
":1:19: note: invalid byte: '\\r'",
});
}
{
const case = ctx.obj("missing semicolon at EOF", b.graph.host);
case.addError(
@ -179,8 +170,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
\\ return true;
\\}
, &[_][]const u8{
":1:1: error: expected type expression, found 'invalid bytes'",
":1:1: note: invalid byte: '\\xff'",
":1:1: error: expected type expression, found 'invalid token'",
});
}
@ -222,8 +212,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte in string", b.graph.host);
case.addError("_ = \"\x01Q\";", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:6: note: invalid byte: '\\x01'",
":1:5: error: expected expression, found 'invalid token'",
});
}
@ -231,8 +220,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte in comment", b.graph.host);
case.addError("//\x01Q", &[_][]const u8{
":1:1: error: expected type expression, found 'invalid bytes'",
":1:3: note: invalid byte: '\\x01'",
":1:1: error: expected type expression, found 'invalid token'",
});
}
@ -240,8 +228,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("control character in character literal", b.graph.host);
case.addError("const c = '\x01';", &[_][]const u8{
":1:11: error: expected expression, found 'invalid bytes'",
":1:12: note: invalid byte: '\\x01'",
":1:11: error: expected expression, found 'invalid token'",
});
}
@ -249,8 +236,7 @@ pub fn addCases(ctx: *Cases, b: *std.Build) !void {
const case = ctx.obj("invalid byte at start of token", b.graph.host);
case.addError("x = \x00Q", &[_][]const u8{
":1:5: error: expected expression, found 'invalid bytes'",
":1:5: note: invalid byte: '\\x00'",
":1:5: error: expected expression, found 'invalid token'",
});
}
}


@ -26,17 +26,17 @@ pub fn addCases(cases: *tests.RunTranslatedCContext) void {
\\void baz(void);
\\struct foo { int x; };
\\void bar() {
\\ struct foo tmp;
\\}
\\
\\void baz() {
\\ struct foo tmp;
\\}
\\
\\int main(void) {
\\ bar();
\\ baz();
\\ return 0;
\\}
, "");
@ -53,7 +53,7 @@ pub fn addCases(cases: *tests.RunTranslatedCContext) void {
cases.add("parenthesized string literal",
\\void foo(const char *s) {}
\\int main(void) {
\\ foo(("bar"));
\\}
, "");


@ -133,20 +133,20 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
cases.add("scoped typedef",
\\void foo() {
\\ typedef union {
\\ int A;
\\ int B;
\\ int C;
\\ } Foo;
\\ Foo a = {0};
\\ {
\\ typedef union {
\\ int A;
\\ int B;
\\ int C;
\\ } Foo;
\\ Foo a = {0};
\\ }
\\}
, &[_][]const u8{
\\pub export fn foo() void {
@ -2004,18 +2004,18 @@ pub fn addCases(cases: *tests.TranslateCContext) void {
\\ break;
\\ }
\\ case 4:
\\ case 5:
\\ res = 69;
\\ {
\\ res = 5;
\\ return;
\\ }
\\ case 6:
\\ switch (res) {
\\ case 9: break;
\\ }
\\ res = 1;
\\ return;
\\ }
\\}
, &[_][]const u8{