zig/lib/compiler/aro/aro/Tokenizer.zig

const std = @import("std");
const assert = std.debug.assert;

const Compilation = @import("Compilation.zig");
const LangOpts = @import("LangOpts.zig");
const Source = @import("Source.zig");

/// Classification of the text following a backslash as a universal character name (UCN).
/// For the valid kinds (`hex4`, `hex8`) the integer value of the tag is the number of
/// characters to consume after the leading backslash (the `u`/`U` plus the hex digits).
const UCNKind = enum(u8) {
    /// A backslash that does not introduce a UCN
    none,
    /// `\u` or `\U` that is not followed by enough hex digits
    incomplete,
    /// `\u` followed by four hex digits
    hex4 = 5,
    /// `\U` followed by eight hex digits
    hex8 = 9,

    /// Classify the escape that starts at `buf[0]`, which must be a backslash.
    /// Only the shape of the escape is checked here; whether the digits name a valid
    /// universal character is not considered, so e.g. \UFFFFFFFF is accepted.
    fn classify(buf: []const u8) UCNKind {
        assert(buf[0] == '\\');
        if (buf.len < 2) return .none;

        const candidate: UCNKind = switch (buf[1]) {
            'u' => .hex4,
            'U' => .hex8,
            else => return .none,
        };

        // The tag value counts everything after the backslash, so the complete
        // escape is one byte longer than that.
        const escape_len = @as(usize, @intFromEnum(candidate)) + 1;
        if (buf.len < escape_len) return .incomplete;

        for (buf[2..escape_len]) |digit| {
            if (!std.ascii.isHex(digit)) return .incomplete;
        }
        return candidate;
    }
};

pub const Token = struct {
    id: Id,
    source: Source.Id,
    start: u32 = 0,
    end: u32 = 0,
    line: u32 = 0,

    pub const Id = enum(u8) {
        invalid,
        nl,
        whitespace,
        eof,
        /// identifier containing solely basic character set characters
        identifier,
        /// identifier with at least one extended character or UCN escape sequence
        extended_identifier,

        // string literals with prefixes
        string_literal,
        string_literal_utf_16,
        string_literal_utf_8,
        string_literal_utf_32,
        string_literal_wide,

        /// Any string literal with an embedded newline or EOF
        /// Always a parser error; by default just a warning from preprocessor
        unterminated_string_literal,

        // <foobar> only generated by preprocessor
        macro_string,

        // char literals with prefixes
        char_literal,
        char_literal_utf_8,
        char_literal_utf_16,
        char_literal_utf_32,
        char_literal_wide,

        /// Any character literal with nothing inside the quotes
        /// Always a parser error; by default just a warning from preprocessor
        empty_char_literal,

        /// Any character literal with an embedded newline or EOF
        /// Always a parser error; by default just a warning from preprocessor
        unterminated_char_literal,

        /// `/* */` style comment without a closing `*/` before EOF
        unterminated_comment,

        /// Integer literal tokens generated by preprocessor.
        one,
        zero,

        bang,
        bang_equal,
        pipe,
        pipe_pipe,
        pipe_equal,
        equal,
        equal_equal,
        l_paren,
        r_paren,
        l_brace,
        r_brace,
        l_bracket,
        r_bracket,
        period,
        ellipsis,
        caret,
        caret_equal,
        plus,
        plus_plus,
        plus_equal,
        minus,
        minus_minus,
        minus_equal,
        asterisk,
        asterisk_equal,
        percent,
        percent_equal,
        arrow,
        colon,
        colon_colon,
        semicolon,
        slash,
        slash_equal,
        comma,
        ampersand,
        ampersand_ampersand,
        ampersand_equal,
        question_mark,
        angle_bracket_left,
        angle_bracket_left_equal,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_equal,
        angle_bracket_right,
        angle_bracket_right_equal,
        angle_bracket_angle_bracket_right,
        angle_bracket_angle_bracket_right_equal,
        tilde,
        hash,
        hash_hash,

        /// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
        macro_param,
        /// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
        macro_param_no_expand,
        /// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
        stringify_param,
        /// Same as stringify_param, but for var args
        stringify_va_args,
        /// Special macro whitespace, always equal to a single space
        macro_ws,
        /// Special token for implementing __has_attribute
        macro_param_has_attribute,
        /// Special token for implementing __has_c_attribute
        macro_param_has_c_attribute,
        /// Special token for implementing __has_declspec_attribute
        macro_param_has_declspec_attribute,
        /// Special token for implementing __has_warning
        macro_param_has_warning,
        /// Special token for implementing __has_feature
        macro_param_has_feature,
        /// Special token for implementing __has_extension
        macro_param_has_extension,
        /// Special token for implementing __has_builtin
        macro_param_has_builtin,
        /// Special token for implementing __has_include
        macro_param_has_include,
        /// Special token for implementing __has_include_next
        macro_param_has_include_next,
        /// Special token for implementing __has_embed
        macro_param_has_embed,
        /// Special token for implementing __is_identifier
        macro_param_is_identifier,
        /// Special token for implementing __FILE__
        macro_file,
        /// Special token for implementing __LINE__
        macro_line,
        /// Special token for implementing __COUNTER__
        macro_counter,
        /// Special token for implementing _Pragma
        macro_param_pragma_operator,
        /// Special token for implementing __identifier (MS extension)
        macro_param_ms_identifier,
        /// Special token for implementing __pragma (MS extension)
        macro_param_ms_pragma,

        /// Special identifier for implementing __func__
        macro_func,
        /// Special identifier for implementing __FUNCTION__
        macro_function,
        /// Special identifier for implementing __PRETTY_FUNCTION__
        macro_pretty_func,
        /// Special identifier for implementing __DATE__
        macro_date,
        /// Special identifier for implementing __TIME__
        macro_time,
        /// Special identifier for implementing __TIMESTAMP__
        macro_timestamp,

        keyword_auto,
        keyword_auto_type,
        keyword_break,
        keyword_case,
        keyword_char,
        keyword_const,
        keyword_continue,
        keyword_default,
        keyword_do,
        keyword_double,
        keyword_else,
        keyword_enum,
        keyword_extern,
        keyword_float,
        keyword_for,
        keyword_goto,
        keyword_if,
        keyword_int,
        keyword_long,
        keyword_register,
        keyword_return,
        keyword_short,
        keyword_signed,
        keyword_signed1,
        keyword_signed2,
        keyword_sizeof,
        keyword_static,
        keyword_struct,
        keyword_switch,
        keyword_typedef,
        keyword_typeof1,
        keyword_typeof2,
        keyword_union,
        keyword_unsigned,
        keyword_void,
        keyword_volatile,
        keyword_while,

        // ISO C99
        keyword_bool,
        keyword_complex,
        keyword_imaginary,
        keyword_inline,
        keyword_restrict,

        // ISO C11
        keyword_alignas,
        keyword_alignof,
        keyword_atomic,
        keyword_generic,
        keyword_noreturn,
        keyword_static_assert,
        keyword_thread_local,

        // ISO C23
        keyword_bit_int,
        keyword_c23_alignas,
        keyword_c23_alignof,
        keyword_c23_bool,
        keyword_c23_static_assert,
        keyword_c23_thread_local,
        keyword_constexpr,
        keyword_true,
        keyword_false,
        keyword_nullptr,
        keyword_typeof_unqual,

        // Preprocessor directives
        keyword_include,
        keyword_include_next,
        keyword_embed,
        keyword_define,
        keyword_defined,
        keyword_undef,
        keyword_ifdef,
        keyword_ifndef,
        keyword_elif,
        keyword_elifdef,
        keyword_elifndef,
        keyword_endif,
        keyword_error,
        keyword_warning,
        keyword_pragma,
        keyword_line,
        keyword_va_args,
        keyword_va_opt,

        // gcc keywords
        keyword_const1,
        keyword_const2,
        keyword_inline1,
        keyword_inline2,
        keyword_volatile1,
        keyword_volatile2,
        keyword_restrict1,
        keyword_restrict2,
        keyword_alignof1,
        keyword_alignof2,
        keyword_typeof,
        keyword_attribute1,
        keyword_attribute2,
        keyword_extension,
        keyword_asm,
        keyword_asm1,
        keyword_asm2,
        /// _Float128
        keyword_float128_1,
        /// __float128
        keyword_float128_2,
        keyword_int128,
        keyword_imag1,
        keyword_imag2,
        keyword_real1,
        keyword_real2,
        keyword_float16,

        // clang keywords
        keyword_fp16,

        // ms keywords
        keyword_declspec,
        keyword_int64,
        keyword_int64_2,
        keyword_int32,
        keyword_int32_2,
        keyword_int16,
        keyword_int16_2,
        keyword_int8,
        keyword_int8_2,
        keyword_stdcall,
        keyword_stdcall2,
        keyword_thiscall,
        keyword_thiscall2,
        keyword_vectorcall,
        keyword_vectorcall2,
        keyword_fastcall,
        keyword_fastcall2,
        keyword_regcall,
        keyword_cdecl,
        keyword_cdecl2,
        keyword_forceinline,
        keyword_forceinline2,
        keyword_unaligned,
        keyword_unaligned2,

        // Type nullability
        keyword_nonnull,
        keyword_nullable,
        keyword_nullable_result,
        keyword_null_unspecified,

        /// Generated by #embed directive
        /// Decimal value with no prefix or suffix
        embed_byte,

        /// preprocessor number
        /// An optional period, followed by a digit 0-9, followed by any number of letters,
        /// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
        pp_num,

        /// preprocessor placemarker token
        /// generated if `##` is used with a zero-token argument
        /// removed after substitution, so the parser should never see this
        /// See C99 6.10.3.3.2
        placemarker,

        /// Virtual linemarker token output from preprocessor to indicate start of a new include
        include_start,

        /// Virtual linemarker token output from preprocessor to indicate resuming a file after
        /// completion of the preceding #include
        include_resume,

        /// A comment token if asked to preserve comments.
        comment,

        /// Incomplete universal character name
        /// This happens if the source text contains `\u` or `\U` followed by an insufficient number of hex
        /// digits. This token id represents just the backslash; the subsequent `u` or `U` will be treated as the
        /// leading character of the following identifier token.
        incomplete_ucn,

        /// Reports whether `id` is a plain or extended identifier, one of the
        /// identifier-like special macro tokens, or any keyword (language keyword,
        /// vendor extension keyword, or preprocessor directive name).
        pub fn isMacroIdentifier(id: Id) bool {
            return switch (id) {
                // identifiers
                .identifier,
                .extended_identifier,
                // special identifiers
                .macro_func,
                .macro_function,
                .macro_pretty_func,
                .macro_date,
                .macro_time,
                .macro_timestamp,
                // base keywords
                .keyword_auto,
                .keyword_auto_type,
                .keyword_break,
                .keyword_case,
                .keyword_char,
                .keyword_const,
                .keyword_continue,
                .keyword_default,
                .keyword_do,
                .keyword_double,
                .keyword_else,
                .keyword_enum,
                .keyword_extern,
                .keyword_float,
                .keyword_for,
                .keyword_goto,
                .keyword_if,
                .keyword_int,
                .keyword_long,
                .keyword_register,
                .keyword_return,
                .keyword_short,
                .keyword_signed,
                .keyword_signed1,
                .keyword_signed2,
                .keyword_sizeof,
                .keyword_static,
                .keyword_struct,
                .keyword_switch,
                .keyword_typedef,
                .keyword_typeof1,
                .keyword_typeof2,
                .keyword_union,
                .keyword_unsigned,
                .keyword_void,
                .keyword_volatile,
                .keyword_while,
                // ISO C99
                .keyword_bool,
                .keyword_complex,
                .keyword_imaginary,
                .keyword_inline,
                .keyword_restrict,
                // ISO C11
                .keyword_alignas,
                .keyword_alignof,
                .keyword_atomic,
                .keyword_generic,
                .keyword_noreturn,
                .keyword_static_assert,
                .keyword_thread_local,
                // ISO C23
                .keyword_bit_int,
                .keyword_c23_alignas,
                .keyword_c23_alignof,
                .keyword_c23_bool,
                .keyword_c23_static_assert,
                .keyword_c23_thread_local,
                .keyword_constexpr,
                .keyword_true,
                .keyword_false,
                .keyword_nullptr,
                .keyword_typeof_unqual,
                // preprocessor directives
                .keyword_include,
                .keyword_include_next,
                .keyword_embed,
                .keyword_define,
                .keyword_defined,
                .keyword_undef,
                .keyword_ifdef,
                .keyword_ifndef,
                .keyword_elif,
                .keyword_elifdef,
                .keyword_elifndef,
                .keyword_endif,
                .keyword_error,
                .keyword_warning,
                .keyword_pragma,
                .keyword_line,
                .keyword_va_args,
                .keyword_va_opt,
                // gcc keywords
                .keyword_const1,
                .keyword_const2,
                .keyword_inline1,
                .keyword_inline2,
                .keyword_volatile1,
                .keyword_volatile2,
                .keyword_restrict1,
                .keyword_restrict2,
                .keyword_alignof1,
                .keyword_alignof2,
                .keyword_typeof,
                .keyword_attribute1,
                .keyword_attribute2,
                .keyword_extension,
                .keyword_asm,
                .keyword_asm1,
                .keyword_asm2,
                .keyword_float128_1,
                .keyword_float128_2,
                .keyword_int128,
                .keyword_imag1,
                .keyword_imag2,
                .keyword_real1,
                .keyword_real2,
                .keyword_float16,
                // clang keywords
                .keyword_fp16,
                // ms keywords
                .keyword_declspec,
                .keyword_int64,
                .keyword_int64_2,
                .keyword_int32,
                .keyword_int32_2,
                .keyword_int16,
                .keyword_int16_2,
                .keyword_int8,
                .keyword_int8_2,
                .keyword_stdcall,
                .keyword_stdcall2,
                .keyword_thiscall,
                .keyword_thiscall2,
                .keyword_vectorcall,
                .keyword_vectorcall2,
                .keyword_fastcall,
                .keyword_fastcall2,
                .keyword_regcall,
                .keyword_cdecl,
                .keyword_cdecl2,
                .keyword_forceinline,
                .keyword_forceinline2,
                .keyword_unaligned,
                .keyword_unaligned2,
                // type nullability
                .keyword_nonnull,
                .keyword_nullable,
                .keyword_nullable_result,
                .keyword_null_unspecified,
                => true,
                else => false,
            };
        }

        /// Rewrites `id.*` to `.identifier` when it is a preprocessor-only keyword
        /// (a directive name, `__VA_ARGS__` or `__VA_OPT__`); every other id is left
        /// untouched.
        /// `keyword_defined` is only rewritten when `defined_to_identifier` is true,
        /// i.e. when we are *not* in an #if or #elif expression.
        pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
            const becomes_identifier: bool = switch (id.*) {
                .keyword_include,
                .keyword_include_next,
                .keyword_embed,
                .keyword_define,
                .keyword_undef,
                .keyword_ifdef,
                .keyword_ifndef,
                .keyword_elif,
                .keyword_elifdef,
                .keyword_elifndef,
                .keyword_endif,
                .keyword_error,
                .keyword_warning,
                .keyword_pragma,
                .keyword_line,
                .keyword_va_args,
                .keyword_va_opt,
                => true,
                .keyword_defined => defined_to_identifier,
                else => false,
            };
            if (becomes_identifier) id.* = .identifier;
        }

        /// Same as `simplifyMacroKeywordExtra` with `defined_to_identifier` set to
        /// false, so `keyword_defined` is kept as a keyword.
        pub fn simplifyMacroKeyword(id: *Id) void {
            id.simplifyMacroKeywordExtra(false);
        }

        /// Returns the fixed spelling associated with `id`, if it has one.
        ///
        /// * `null` for ids whose text is not determined by the id alone
        ///   (identifiers, literals, whitespace, comments, numbers, invalid bytes).
        /// * An empty string for newline/eof and for the special preprocessor
        ///   tokens listed below.
        /// * The punctuator or keyword text for everything else.
        ///
        /// Must not be called with `include_start` or `include_resume`; those arms
        /// are `unreachable`.
        pub fn lexeme(id: Id) ?[]const u8 {
            return switch (id) {
                .include_start,
                .include_resume,
                => unreachable,

                // No fixed spelling: the text depends on the source.
                .unterminated_comment,
                .invalid,
                .identifier,
                .extended_identifier,
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                .unterminated_string_literal,
                .unterminated_char_literal,
                .empty_char_literal,
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                .macro_string,
                .whitespace,
                .pp_num,
                .embed_byte,
                .comment,
                => null,

                .zero => "0",
                .one => "1",

                // Tokens that map to the empty string.
                .nl,
                .eof,
                .macro_param,
                .macro_param_no_expand,
                .stringify_param,
                .stringify_va_args,
                .macro_param_has_attribute,
                .macro_param_has_c_attribute,
                .macro_param_has_declspec_attribute,
                .macro_param_has_warning,
                .macro_param_has_feature,
                .macro_param_has_extension,
                .macro_param_has_builtin,
                .macro_param_has_include,
                .macro_param_has_include_next,
                .macro_param_has_embed,
                .macro_param_is_identifier,
                .macro_file,
                .macro_line,
                .macro_counter,
                .macro_time,
                .macro_date,
                .macro_timestamp,
                .macro_param_pragma_operator,
                .macro_param_ms_identifier,
                .macro_param_ms_pragma,
                .placemarker,
                => "",
                .macro_ws => " ",

                // Only the backslash belongs to this token.
                .incomplete_ucn => "\\",

                .macro_func => "__func__",
                .macro_function => "__FUNCTION__",
                .macro_pretty_func => "__PRETTY_FUNCTION__",

                // Punctuators
                .bang => "!",
                .bang_equal => "!=",
                .pipe => "|",
                .pipe_pipe => "||",
                .pipe_equal => "|=",
                .equal => "=",
                .equal_equal => "==",
                .l_paren => "(",
                .r_paren => ")",
                .l_brace => "{",
                .r_brace => "}",
                .l_bracket => "[",
                .r_bracket => "]",
                .period => ".",
                .ellipsis => "...",
                .caret => "^",
                .caret_equal => "^=",
                .plus => "+",
                .plus_plus => "++",
                .plus_equal => "+=",
                .minus => "-",
                .minus_minus => "--",
                .minus_equal => "-=",
                .asterisk => "*",
                .asterisk_equal => "*=",
                .percent => "%",
                .percent_equal => "%=",
                .arrow => "->",
                .colon => ":",
                .colon_colon => "::",
                .semicolon => ";",
                .slash => "/",
                .slash_equal => "/=",
                .comma => ",",
                .ampersand => "&",
                .ampersand_ampersand => "&&",
                .ampersand_equal => "&=",
                .question_mark => "?",
                .angle_bracket_left => "<",
                .angle_bracket_left_equal => "<=",
                .angle_bracket_angle_bracket_left => "<<",
                .angle_bracket_angle_bracket_left_equal => "<<=",
                .angle_bracket_right => ">",
                .angle_bracket_right_equal => ">=",
                .angle_bracket_angle_bracket_right => ">>",
                .angle_bracket_angle_bracket_right_equal => ">>=",
                .tilde => "~",
                .hash => "#",
                .hash_hash => "##",

                // Keywords
                .keyword_auto => "auto",
                .keyword_auto_type => "__auto_type",
                .keyword_break => "break",
                .keyword_case => "case",
                .keyword_char => "char",
                .keyword_const => "const",
                .keyword_continue => "continue",
                .keyword_default => "default",
                .keyword_do => "do",
                .keyword_double => "double",
                .keyword_else => "else",
                .keyword_enum => "enum",
                .keyword_extern => "extern",
                .keyword_float => "float",
                .keyword_for => "for",
                .keyword_goto => "goto",
                .keyword_if => "if",
                .keyword_int => "int",
                .keyword_long => "long",
                .keyword_register => "register",
                .keyword_return => "return",
                .keyword_short => "short",
                .keyword_signed => "signed",
                .keyword_signed1 => "__signed",
                .keyword_signed2 => "__signed__",
                .keyword_sizeof => "sizeof",
                .keyword_static => "static",
                .keyword_struct => "struct",
                .keyword_switch => "switch",
                .keyword_typedef => "typedef",
                .keyword_typeof => "typeof",
                .keyword_union => "union",
                .keyword_unsigned => "unsigned",
                .keyword_void => "void",
                .keyword_volatile => "volatile",
                .keyword_while => "while",
                .keyword_bool => "_Bool",
                .keyword_complex => "_Complex",
                .keyword_imaginary => "_Imaginary",
                .keyword_inline => "inline",
                .keyword_restrict => "restrict",
                .keyword_alignas => "_Alignas",
                .keyword_alignof => "_Alignof",
                .keyword_atomic => "_Atomic",
                .keyword_generic => "_Generic",
                .keyword_noreturn => "_Noreturn",
                .keyword_static_assert => "_Static_assert",
                .keyword_thread_local => "_Thread_local",
                .keyword_bit_int => "_BitInt",
                .keyword_c23_alignas => "alignas",
                .keyword_c23_alignof => "alignof",
                .keyword_c23_bool => "bool",
                .keyword_c23_static_assert => "static_assert",
                .keyword_c23_thread_local => "thread_local",
                .keyword_constexpr => "constexpr",
                .keyword_true => "true",
                .keyword_false => "false",
                .keyword_nullptr => "nullptr",
                .keyword_typeof_unqual => "typeof_unqual",
                .keyword_include => "include",
                .keyword_include_next => "include_next",
                .keyword_embed => "embed",
                .keyword_define => "define",
                .keyword_defined => "defined",
                .keyword_undef => "undef",
                .keyword_ifdef => "ifdef",
                .keyword_ifndef => "ifndef",
                .keyword_elif => "elif",
                .keyword_elifdef => "elifdef",
                .keyword_elifndef => "elifndef",
                .keyword_endif => "endif",
                .keyword_error => "error",
                .keyword_warning => "warning",
                .keyword_pragma => "pragma",
                .keyword_line => "line",
                .keyword_va_args => "__VA_ARGS__",
                .keyword_va_opt => "__VA_OPT__",
                .keyword_const1 => "__const",
                .keyword_const2 => "__const__",
                .keyword_inline1 => "__inline",
                .keyword_inline2 => "__inline__",
                .keyword_volatile1 => "__volatile",
                .keyword_volatile2 => "__volatile__",
                .keyword_restrict1 => "__restrict",
                .keyword_restrict2 => "__restrict__",
                .keyword_alignof1 => "__alignof",
                .keyword_alignof2 => "__alignof__",
                .keyword_typeof1 => "__typeof",
                .keyword_typeof2 => "__typeof__",
                .keyword_attribute1 => "__attribute",
                .keyword_attribute2 => "__attribute__",
                .keyword_extension => "__extension__",
                .keyword_asm => "asm",
                .keyword_asm1 => "__asm",
                .keyword_asm2 => "__asm__",
                .keyword_float128_1 => "_Float128",
                .keyword_float128_2 => "__float128",
                .keyword_int128 => "__int128",
                .keyword_imag1 => "__imag",
                .keyword_imag2 => "__imag__",
                .keyword_real1 => "__real",
                .keyword_real2 => "__real__",
                .keyword_float16 => "_Float16",
                .keyword_fp16 => "__fp16",
                .keyword_declspec => "__declspec",
                .keyword_int64 => "__int64",
                .keyword_int64_2 => "_int64",
                .keyword_int32 => "__int32",
                .keyword_int32_2 => "_int32",
                .keyword_int16 => "__int16",
                .keyword_int16_2 => "_int16",
                .keyword_int8 => "__int8",
                .keyword_int8_2 => "_int8",
                .keyword_stdcall => "__stdcall",
                .keyword_stdcall2 => "_stdcall",
                .keyword_thiscall => "__thiscall",
                .keyword_thiscall2 => "_thiscall",
                .keyword_vectorcall => "__vectorcall",
                .keyword_vectorcall2 => "_vectorcall",
                .keyword_fastcall => "__fastcall",
                .keyword_fastcall2 => "_fastcall",
                .keyword_regcall => "__regcall",
                .keyword_cdecl => "__cdecl",
                .keyword_cdecl2 => "_cdecl",
                .keyword_forceinline => "__forceinline",
                .keyword_forceinline2 => "_forceinline",
                .keyword_unaligned => "__unaligned",
                .keyword_unaligned2 => "_unaligned",
                .keyword_nonnull => "_Nonnull",
                .keyword_nullable => "_Nullable",
                .keyword_nullable_result => "_Nullable_result",
                .keyword_null_unspecified => "_Null_unspecified",
            };
        }

        /// Returns a short description of the token kind: a generic phrase for
        /// identifiers, literals, numbers and invalid bytes, and the token's
        /// `lexeme` for everything else.
        /// Must not be called with `macro_string`. Any other id falls through to
        /// `lexeme()` and unwraps its result, so ids for which `lexeme` has no
        /// spelling and that are not listed here are not supported either.
        pub fn symbol(id: Id) []const u8 {
            switch (id) {
                .macro_string => unreachable,

                .invalid => return "invalid bytes",

                .pp_num,
                .embed_byte,
                => return "a number",

                .identifier,
                .extended_identifier,
                .macro_func,
                .macro_function,
                .macro_pretty_func,
                => return "an identifier",

                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                .unterminated_char_literal,
                .empty_char_literal,
                => return "a character literal",

                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                .unterminated_string_literal,
                => return "a string literal",

                else => {},
            }
            const spelling = id.lexeme();
            return spelling.?;
        }

        /// Reports whether `id` is accepted as the first token of an expression
        /// parsed by Preprocessor.expr.
        /// eof, r_paren, and string literals cannot actually start a preprocessor
        /// expression; they are accepted here anyway so that the parser gets the
        /// chance to produce a nicer error message.
        pub fn validPreprocessorExprStart(id: Id) bool {
            switch (id) {
                // grouping and unary operators
                .l_paren,
                .plus,
                .minus,
                .tilde,
                .bang,
                // names
                .identifier,
                .extended_identifier,
                .keyword_defined,
                .keyword_true,
                .keyword_false,
                // numbers
                .one,
                .zero,
                .pp_num,
                // character literals
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                // accepted only to improve diagnostics
                .eof,
                .r_paren,
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                => return true,

                else => return false,
            }
        }

        /// Returns `langopts.hasDigraphs()` for bracket, brace, `#` and `##`
        /// tokens, and false for every other id.
        pub fn allowsDigraphs(id: Id, langopts: LangOpts) bool {
            switch (id) {
                .hash,
                .hash_hash,
                .l_brace,
                .r_brace,
                .l_bracket,
                .r_bracket,
                => {},
                else => return false,
            }
            return langopts.hasDigraphs();
        }

        /// True for the `volatile`, `inline` and `goto` keywords (including the
        /// double-underscore spellings of `volatile` and `inline`) and for `(`.
        pub fn canOpenGCCAsmStmt(id: Id) bool {
            switch (id) {
                .l_paren,
                .keyword_goto,
                .keyword_inline,
                .keyword_inline1,
                .keyword_inline2,
                .keyword_volatile,
                .keyword_volatile1,
                .keyword_volatile2,
                => return true,
                else => return false,
            }
        }

        /// True for a terminated string literal with any prefix.
        /// `unterminated_string_literal` and `macro_string` are not included.
        pub fn isStringLiteral(id: Id) bool {
            switch (id) {
                .string_literal,
                .string_literal_wide,
                .string_literal_utf_8,
                .string_literal_utf_16,
                .string_literal_utf_32,
                => return true,
                else => return false,
            }
        }
    };

    /// Maps the text of an identifier to its token id.
    /// Text that is not in the keyword table is an `.identifier`. A keyword that
    /// depends on the language standard or on an extension option is only
    /// returned when `langopts` enables it and is an `.identifier` otherwise.
    /// Double underscore and underscore + capital letter identifiers belong to
    /// the implementation namespace, so they are always converted to keywords.
    pub fn getTokenId(langopts: LangOpts, str: []const u8) Token.Id {
        const kw = all_kws.get(str) orelse return .identifier;
        const standard = langopts.standard;

        const enabled: bool = switch (kw) {
            .keyword_inline => standard.isGNU() or standard.atLeast(.c99),
            .keyword_restrict => standard.atLeast(.c99),
            .keyword_typeof => standard.isGNU() or standard.atLeast(.c23),
            .keyword_asm => standard.isGNU(),
            .keyword_declspec => langopts.declspec_attrs,

            // Spellings that only became keywords in C23
            .keyword_c23_alignas,
            .keyword_c23_alignof,
            .keyword_c23_bool,
            .keyword_c23_static_assert,
            .keyword_c23_thread_local,
            .keyword_constexpr,
            .keyword_true,
            .keyword_false,
            .keyword_nullptr,
            .keyword_typeof_unqual,
            .keyword_elifdef,
            .keyword_elifndef,
            => standard.atLeast(.c23),

            // Keywords gated on the MS extensions option
            .keyword_int64,
            .keyword_int64_2,
            .keyword_int32,
            .keyword_int32_2,
            .keyword_int16,
            .keyword_int16_2,
            .keyword_int8,
            .keyword_int8_2,
            .keyword_stdcall2,
            .keyword_thiscall2,
            .keyword_vectorcall2,
            .keyword_fastcall2,
            .keyword_cdecl2,
            .keyword_forceinline,
            .keyword_forceinline2,
            .keyword_unaligned,
            .keyword_unaligned2,
            => langopts.ms_extensions,

            else => true,
        };
        return if (enabled) kw else .identifier;
    }

    /// Compile-time map from every spelling the tokenizer may treat as a
    /// keyword to its token id. Presence in this table does not by itself
    /// make a spelling a keyword: `getTokenId` additionally gates some
    /// entries on the active language options.
    const all_kws = std.StaticStringMap(Id).initComptime(.{
        .{ "auto", .keyword_auto },
        .{ "break", .keyword_break },
        .{ "case", .keyword_case },
        .{ "char", .keyword_char },
        .{ "const", .keyword_const },
        .{ "continue", .keyword_continue },
        .{ "default", .keyword_default },
        .{ "do", .keyword_do },
        .{ "double", .keyword_double },
        .{ "else", .keyword_else },
        .{ "enum", .keyword_enum },
        .{ "extern", .keyword_extern },
        .{ "float", .keyword_float },
        .{ "for", .keyword_for },
        .{ "goto", .keyword_goto },
        .{ "if", .keyword_if },
        .{ "int", .keyword_int },
        .{ "long", .keyword_long },
        .{ "register", .keyword_register },
        .{ "return", .keyword_return },
        .{ "short", .keyword_short },
        .{ "signed", .keyword_signed },
        .{ "__signed", .keyword_signed1 },
        .{ "__signed__", .keyword_signed2 },
        .{ "sizeof", .keyword_sizeof },
        .{ "static", .keyword_static },
        .{ "struct", .keyword_struct },
        .{ "switch", .keyword_switch },
        .{ "typedef", .keyword_typedef },
        .{ "union", .keyword_union },
        .{ "unsigned", .keyword_unsigned },
        .{ "void", .keyword_void },
        .{ "volatile", .keyword_volatile },
        .{ "while", .keyword_while },
        .{ "__typeof__", .keyword_typeof2 },
        .{ "__typeof", .keyword_typeof1 },

        // ISO C99
        .{ "_Bool", .keyword_bool },
        .{ "_Complex", .keyword_complex },
        .{ "_Imaginary", .keyword_imaginary },
        .{ "inline", .keyword_inline },
        .{ "restrict", .keyword_restrict },

        // ISO C11
        .{ "_Alignas", .keyword_alignas },
        .{ "_Alignof", .keyword_alignof },
        .{ "_Atomic", .keyword_atomic },
        .{ "_Generic", .keyword_generic },
        .{ "_Noreturn", .keyword_noreturn },
        .{ "_Static_assert", .keyword_static_assert },
        .{ "_Thread_local", .keyword_thread_local },

        // ISO C23
        .{ "_BitInt", .keyword_bit_int },
        .{ "alignas", .keyword_c23_alignas },
        .{ "alignof", .keyword_c23_alignof },
        .{ "bool", .keyword_c23_bool },
        .{ "static_assert", .keyword_c23_static_assert },
        .{ "thread_local", .keyword_c23_thread_local },
        .{ "constexpr", .keyword_constexpr },
        .{ "true", .keyword_true },
        .{ "false", .keyword_false },
        .{ "nullptr", .keyword_nullptr },
        .{ "typeof_unqual", .keyword_typeof_unqual },

        // Preprocessor directives
        .{ "include", .keyword_include },
        .{ "include_next", .keyword_include_next },
        .{ "embed", .keyword_embed },
        .{ "define", .keyword_define },
        .{ "defined", .keyword_defined },
        .{ "undef", .keyword_undef },
        .{ "ifdef", .keyword_ifdef },
        .{ "ifndef", .keyword_ifndef },
        .{ "elif", .keyword_elif },
        .{ "elifdef", .keyword_elifdef },
        .{ "elifndef", .keyword_elifndef },
        .{ "endif", .keyword_endif },
        .{ "error", .keyword_error },
        .{ "warning", .keyword_warning },
        .{ "pragma", .keyword_pragma },
        .{ "line", .keyword_line },
        .{ "__VA_ARGS__", .keyword_va_args },
        .{ "__VA_OPT__", .keyword_va_opt },
        .{ "__func__", .macro_func },
        .{ "__FUNCTION__", .macro_function },
        .{ "__PRETTY_FUNCTION__", .macro_pretty_func },

        // gcc keywords
        .{ "__auto_type", .keyword_auto_type },
        .{ "__const", .keyword_const1 },
        .{ "__const__", .keyword_const2 },
        .{ "__inline", .keyword_inline1 },
        .{ "__inline__", .keyword_inline2 },
        .{ "__volatile", .keyword_volatile1 },
        .{ "__volatile__", .keyword_volatile2 },
        .{ "__restrict", .keyword_restrict1 },
        .{ "__restrict__", .keyword_restrict2 },
        .{ "__alignof", .keyword_alignof1 },
        .{ "__alignof__", .keyword_alignof2 },
        .{ "typeof", .keyword_typeof },
        .{ "__attribute", .keyword_attribute1 },
        .{ "__attribute__", .keyword_attribute2 },
        .{ "__extension__", .keyword_extension },
        .{ "asm", .keyword_asm },
        .{ "__asm", .keyword_asm1 },
        .{ "__asm__", .keyword_asm2 },
        .{ "_Float128", .keyword_float128_1 },
        .{ "__float128", .keyword_float128_2 },
        .{ "__int128", .keyword_int128 },
        .{ "__imag", .keyword_imag1 },
        .{ "__imag__", .keyword_imag2 },
        .{ "__real", .keyword_real1 },
        .{ "__real__", .keyword_real2 },
        .{ "_Float16", .keyword_float16 },

        // clang keywords
        .{ "__fp16", .keyword_fp16 },

        // ms keywords
        .{ "__declspec", .keyword_declspec },
        .{ "__int64", .keyword_int64 },
        .{ "_int64", .keyword_int64_2 },
        .{ "__int32", .keyword_int32 },
        .{ "_int32", .keyword_int32_2 },
        .{ "__int16", .keyword_int16 },
        .{ "_int16", .keyword_int16_2 },
        .{ "__int8", .keyword_int8 },
        .{ "_int8", .keyword_int8_2 },
        .{ "__stdcall", .keyword_stdcall },
        .{ "_stdcall", .keyword_stdcall2 },
        .{ "__thiscall", .keyword_thiscall },
        .{ "_thiscall", .keyword_thiscall2 },
        .{ "__vectorcall", .keyword_vectorcall },
        .{ "_vectorcall", .keyword_vectorcall2 },
        .{ "__fastcall", .keyword_fastcall },
        .{ "_fastcall", .keyword_fastcall2 },
        .{ "_regcall", .keyword_regcall },
        .{ "__cdecl", .keyword_cdecl },
        .{ "_cdecl", .keyword_cdecl2 },
        .{ "__forceinline", .keyword_forceinline },
        .{ "_forceinline", .keyword_forceinline2 },
        .{ "__unaligned", .keyword_unaligned },
        .{ "_unaligned", .keyword_unaligned2 },

        // Type nullability
        .{ "_Nonnull", .keyword_nonnull },
        .{ "_Nullable", .keyword_nullable },
        .{ "_Nullable_result", .keyword_nullable_result },
        .{ "_Null_unspecified", .keyword_null_unspecified },
    });
};

// This file is itself the tokenizer struct; `Tokenizer` names that type.
const Tokenizer = @This();

/// Bytes being tokenized.
buf: []const u8,
/// Offset into `buf` of the next byte `next` will examine.
index: u32 = 0,
/// Copied into the `source` field of every token `next` returns.
source: Source.Id,
/// Language options consulted while scanning (digraphs, `$` in identifiers,
/// MS extensions, comment preservation and the language standard).
langopts: LangOpts,
/// Line counter; starts at 1 and is incremented each time `next` consumes a
/// '\n'. Its value at the end of `next` is stored in the returned token.
line: u32 = 1,

/// Scans one token starting at `self.index` and advances `self.index` past it.
///
/// The scanner is a byte-at-a-time state machine. Each loop iteration feeds
/// the current byte to the active state; a state either stays in the loop
/// (possibly switching state) or sets `id` and `break`s. A state that breaks
/// *without* bumping `self.index` leaves the current byte for the next call,
/// and a state that does `self.index -= 1` gives back a byte it had already
/// stepped over. When the buffer runs out mid-token the `else` branch of the
/// loop picks the id from whatever state was active.
///
/// Behavior worth knowing:
/// * Runs of horizontal whitespace are returned as a single `.whitespace`.
/// * With `langopts.preserve_comments` comments are returned as `.comment`;
///   otherwise a block comment is folded into the following `.whitespace` /
///   `.nl` token and a line comment is folded into the `.nl` that ends it.
/// * Identifiers made only of basic characters are passed through
///   `Token.getTokenId`; anything containing `$`, a byte >= 0x80 or a UCN is
///   an `.extended_identifier`.
/// * `.eof` is returned once the buffer is exhausted (and, with
///   `langopts.ms_extensions`, when a 0x1A byte starts a token).
/// * `self.line` is incremented for every '\n' consumed and its final value
///   is stored in the returned token.
pub fn next(self: *Tokenizer) Token {
    var state: enum {
        start,
        whitespace,
        u,
        u8,
        U,
        L,
        string_literal,
        char_literal_start,
        char_literal,
        char_escape_sequence,
        string_escape_sequence,
        identifier,
        extended_identifier,
        equal,
        bang,
        pipe,
        colon,
        percent,
        asterisk,
        plus,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        caret,
        period,
        period2,
        minus,
        slash,
        ampersand,
        hash,
        hash_digraph,
        hash_hash_digraph_partial,
        line_comment,
        multi_line_comment,
        multi_line_comment_asterisk,
        multi_line_comment_done,
        pp_num,
        pp_num_exponent,
        pp_num_digit_separator,
    } = .start;

    // `start` is only moved after a discarded block comment, so that the
    // whitespace/newline token following it does not span the comment.
    var start = self.index;
    var id: Token.Id = .eof;

    while (self.index < self.buf.len) : (self.index += 1) {
        const c = self.buf[self.index];
        switch (state) {
            .start => switch (c) {
                '\n' => {
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                '"' => {
                    id = .string_literal;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal;
                    state = .char_literal_start;
                },
                // 'u', 'U' and 'L' may be literal prefixes, so they get
                // their own states instead of going straight to .identifier.
                'u' => state = .u,
                'U' => state = .U,
                'L' => state = .L,
                '\\' => {
                    const ucn_kind = UCNKind.classify(self.buf[self.index..]);
                    switch (ucn_kind) {
                        .none => {
                            self.index += 1;
                            id = .invalid;
                            break;
                        },
                        .incomplete => {
                            self.index += 1;
                            id = .incomplete_ucn;
                            break;
                        },
                        .hex4, .hex8 => {
                            // The enum value is the escape length minus the
                            // backslash; the loop increment covers the rest.
                            self.index += @intFromEnum(ucn_kind);
                            id = .extended_identifier;
                            state = .extended_identifier;
                        },
                    }
                },
                'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                '=' => state = .equal,
                '!' => state = .bang,
                '|' => state = .pipe,
                '(' => {
                    id = .l_paren;
                    self.index += 1;
                    break;
                },
                ')' => {
                    id = .r_paren;
                    self.index += 1;
                    break;
                },
                '[' => {
                    id = .l_bracket;
                    self.index += 1;
                    break;
                },
                ']' => {
                    id = .r_bracket;
                    self.index += 1;
                    break;
                },
                ';' => {
                    id = .semicolon;
                    self.index += 1;
                    break;
                },
                ',' => {
                    id = .comma;
                    self.index += 1;
                    break;
                },
                '?' => {
                    id = .question_mark;
                    self.index += 1;
                    break;
                },
                ':' => state = .colon,
                '%' => state = .percent,
                '*' => state = .asterisk,
                '+' => state = .plus,
                '<' => state = .angle_bracket_left,
                '>' => state = .angle_bracket_right,
                '^' => state = .caret,
                '{' => {
                    id = .l_brace;
                    self.index += 1;
                    break;
                },
                '}' => {
                    id = .r_brace;
                    self.index += 1;
                    break;
                },
                '~' => {
                    id = .tilde;
                    self.index += 1;
                    break;
                },
                '.' => state = .period,
                '-' => state = .minus,
                '/' => state = .slash,
                '&' => state = .ampersand,
                '#' => state = .hash,
                '0'...'9' => state = .pp_num,
                '\t', '\x0B', '\x0C', ' ' => state = .whitespace,
                '$' => if (self.langopts.dollars_in_identifiers) {
                    state = .extended_identifier;
                } else {
                    id = .invalid;
                    self.index += 1;
                    break;
                },
                // With MS extensions a 0x1A byte is reported as end of file;
                // the byte is not consumed.
                0x1A => if (self.langopts.ms_extensions) {
                    id = .eof;
                    break;
                } else {
                    id = .invalid;
                    self.index += 1;
                    break;
                },
                0x80...0xFF => state = .extended_identifier,
                else => {
                    id = .invalid;
                    self.index += 1;
                    break;
                },
            },
            .whitespace => switch (c) {
                '\t', '\x0B', '\x0C', ' ' => {},
                else => {
                    id = .whitespace;
                    break;
                },
            },
            .u => switch (c) {
                '8' => {
                    state = .u8;
                },
                '\'' => {
                    id = .char_literal_utf_16;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_16;
                    state = .string_literal;
                },
                else => {
                    // Not a prefix after all: re-examine this byte as part
                    // of an identifier.
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .u8 => switch (c) {
                '\"' => {
                    id = .string_literal_utf_8;
                    state = .string_literal;
                },
                '\'' => {
                    id = .char_literal_utf_8;
                    state = .char_literal_start;
                },
                else => {
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .U => switch (c) {
                '\'' => {
                    id = .char_literal_utf_32;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_utf_32;
                    state = .string_literal;
                },
                else => {
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .L => switch (c) {
                '\'' => {
                    id = .char_literal_wide;
                    state = .char_literal_start;
                },
                '\"' => {
                    id = .string_literal_wide;
                    state = .string_literal;
                },
                else => {
                    self.index -= 1;
                    state = .identifier;
                },
            },
            .string_literal => switch (c) {
                '\\' => {
                    state = .string_escape_sequence;
                },
                '"' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    // The newline is left in place for the next call.
                    id = .unterminated_string_literal;
                    break;
                },
                // NOTE(review): assumes '\r' never reaches this state,
                // presumably because line endings are normalized before
                // tokenizing; confirm against the source loader.
                '\r' => unreachable,
                else => {},
            },
            .char_literal_start => switch (c) {
                '\\' => {
                    state = .char_escape_sequence;
                },
                '\'' => {
                    id = .empty_char_literal;
                    self.index += 1;
                    break;
                },
                '\n' => {
                    id = .unterminated_char_literal;
                    break;
                },
                else => {
                    state = .char_literal;
                },
            },
            .char_literal => switch (c) {
                '\\' => {
                    state = .char_escape_sequence;
                },
                '\'' => {
                    self.index += 1;
                    break;
                },
                '\n' => {
                    id = .unterminated_char_literal;
                    break;
                },
                else => {},
            },
            // The escape states only need to skip the byte after the
            // backslash so that an escaped quote does not end the literal.
            .char_escape_sequence => switch (c) {
                '\r', '\n' => {
                    id = .unterminated_char_literal;
                    break;
                },
                else => state = .char_literal,
            },
            .string_escape_sequence => switch (c) {
                '\r', '\n' => {
                    id = .unterminated_string_literal;
                    break;
                },
                else => state = .string_literal,
            },
            .identifier, .extended_identifier => switch (c) {
                'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                '$' => if (self.langopts.dollars_in_identifiers) {
                    state = .extended_identifier;
                } else {
                    id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
                    break;
                },
                0x80...0xFF => state = .extended_identifier,
                '\\' => {
                    const ucn_kind = UCNKind.classify(self.buf[self.index..]);
                    switch (ucn_kind) {
                        // A backslash that does not start a complete UCN
                        // ends the identifier and is left for the next call.
                        .none, .incomplete => {
                            id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
                            break;
                        },
                        .hex4, .hex8 => {
                            state = .extended_identifier;
                            self.index += @intFromEnum(ucn_kind);
                        },
                    }
                },

                else => {
                    id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
                    break;
                },
            },
            .equal => switch (c) {
                '=' => {
                    id = .equal_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .equal;
                    break;
                },
            },
            .bang => switch (c) {
                '=' => {
                    id = .bang_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .bang;
                    break;
                },
            },
            .pipe => switch (c) {
                '=' => {
                    id = .pipe_equal;
                    self.index += 1;
                    break;
                },
                '|' => {
                    id = .pipe_pipe;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .pipe;
                    break;
                },
            },
            .colon => switch (c) {
                // `:>` digraph for `]`.
                '>' => {
                    if (self.langopts.hasDigraphs()) {
                        id = .r_bracket;
                        self.index += 1;
                    } else {
                        id = .colon;
                    }
                    break;
                },
                // `::` is only a single token from C23 on.
                ':' => {
                    if (self.langopts.standard.atLeast(.c23)) {
                        id = .colon_colon;
                        self.index += 1;
                        break;
                    } else {
                        id = .colon;
                        break;
                    }
                },
                else => {
                    id = .colon;
                    break;
                },
            },
            .percent => switch (c) {
                '=' => {
                    id = .percent_equal;
                    self.index += 1;
                    break;
                },
                // `%>` digraph for `}`.
                '>' => {
                    if (self.langopts.hasDigraphs()) {
                        id = .r_brace;
                        self.index += 1;
                    } else {
                        id = .percent;
                    }
                    break;
                },
                // `%:` digraph for `#`; may still grow into `%:%:` (`##`).
                ':' => {
                    if (self.langopts.hasDigraphs()) {
                        state = .hash_digraph;
                    } else {
                        id = .percent;
                        break;
                    }
                },
                else => {
                    id = .percent;
                    break;
                },
            },
            .asterisk => switch (c) {
                '=' => {
                    id = .asterisk_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .asterisk;
                    break;
                },
            },
            .plus => switch (c) {
                '=' => {
                    id = .plus_equal;
                    self.index += 1;
                    break;
                },
                '+' => {
                    id = .plus_plus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .plus;
                    break;
                },
            },
            .angle_bracket_left => switch (c) {
                '<' => state = .angle_bracket_angle_bracket_left,
                '=' => {
                    id = .angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                // `<:` digraph for `[`.
                ':' => {
                    if (self.langopts.hasDigraphs()) {
                        id = .l_bracket;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                // `<%` digraph for `{`.
                '%' => {
                    if (self.langopts.hasDigraphs()) {
                        id = .l_brace;
                        self.index += 1;
                    } else {
                        id = .angle_bracket_left;
                    }
                    break;
                },
                else => {
                    id = .angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_angle_bracket_left => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_left_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_left;
                    break;
                },
            },
            .angle_bracket_right => switch (c) {
                '>' => state = .angle_bracket_angle_bracket_right,
                '=' => {
                    id = .angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_right;
                    break;
                },
            },
            .angle_bracket_angle_bracket_right => switch (c) {
                '=' => {
                    id = .angle_bracket_angle_bracket_right_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .angle_bracket_angle_bracket_right;
                    break;
                },
            },
            .caret => switch (c) {
                '=' => {
                    id = .caret_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .caret;
                    break;
                },
            },
            .period => switch (c) {
                '.' => state = .period2,
                '0'...'9' => state = .pp_num,
                else => {
                    id = .period;
                    break;
                },
            },
            .period2 => switch (c) {
                '.' => {
                    id = .ellipsis;
                    self.index += 1;
                    break;
                },
                else => {
                    // Only `..`: emit the first period and give the second
                    // one back.
                    id = .period;
                    self.index -= 1;
                    break;
                },
            },
            .minus => switch (c) {
                '>' => {
                    id = .arrow;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .minus_equal;
                    self.index += 1;
                    break;
                },
                '-' => {
                    id = .minus_minus;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .minus;
                    break;
                },
            },
            .ampersand => switch (c) {
                '&' => {
                    id = .ampersand_ampersand;
                    self.index += 1;
                    break;
                },
                '=' => {
                    id = .ampersand_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .ampersand;
                    break;
                },
            },
            .hash => switch (c) {
                '#' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    break;
                },
            },
            .hash_digraph => switch (c) {
                '%' => state = .hash_hash_digraph_partial,
                else => {
                    id = .hash;
                    break;
                },
            },
            .hash_hash_digraph_partial => switch (c) {
                ':' => {
                    id = .hash_hash;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .hash;
                    self.index -= 1; // re-tokenize the percent
                    break;
                },
            },
            .slash => switch (c) {
                '/' => state = .line_comment,
                '*' => state = .multi_line_comment,
                '=' => {
                    id = .slash_equal;
                    self.index += 1;
                    break;
                },
                else => {
                    id = .slash;
                    break;
                },
            },
            .line_comment => switch (c) {
                '\n' => {
                    if (self.langopts.preserve_comments) {
                        id = .comment;
                        break;
                    }
                    // Comment discarded: step back so the loop increment
                    // lands on the '\n' again, now in the start state.
                    self.index -= 1;
                    state = .start;
                },
                else => {},
            },
            .multi_line_comment => switch (c) {
                '*' => state = .multi_line_comment_asterisk,
                '\n' => self.line += 1,
                else => {},
            },
            .multi_line_comment_asterisk => switch (c) {
                '/' => {
                    if (self.langopts.preserve_comments) {
                        self.index += 1;
                        id = .comment;
                        break;
                    }
                    state = .multi_line_comment_done;
                },
                '\n' => {
                    self.line += 1;
                    state = .multi_line_comment;
                },
                '*' => {},
                else => state = .multi_line_comment,
            },
            // A discarded block comment was just closed; whatever follows
            // decides which token stands in for it.
            .multi_line_comment_done => switch (c) {
                '\n' => {
                    start = self.index;
                    id = .nl;
                    self.index += 1;
                    self.line += 1;
                    break;
                },
                // NOTE(review): same '\r' assumption as in .string_literal.
                '\r' => unreachable,
                '\t', '\x0B', '\x0C', ' ' => {
                    start = self.index;
                    state = .whitespace;
                },
                else => {
                    id = .whitespace;
                    break;
                },
            },
            .pp_num => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                '.',
                => {},
                // An exponent marker may be followed by a sign.
                'e', 'E', 'p', 'P' => state = .pp_num_exponent,
                '\'' => if (self.langopts.standard.atLeast(.c23)) {
                    state = .pp_num_digit_separator;
                } else {
                    id = .pp_num;
                    break;
                },
                else => {
                    id = .pp_num;
                    break;
                },
            },
            .pp_num_digit_separator => switch (c) {
                'a'...'d',
                'A'...'D',
                'f'...'o',
                'F'...'O',
                'q'...'z',
                'Q'...'Z',
                '0'...'9',
                '_',
                => state = .pp_num,
                else => {
                    // The `'` was not a separator: give it back.
                    self.index -= 1;
                    id = .pp_num;
                    break;
                },
            },
            .pp_num_exponent => switch (c) {
                'a'...'o',
                'q'...'z',
                'A'...'O',
                'Q'...'Z',
                '0'...'9',
                '_',
                '.',
                '+',
                '-',
                => state = .pp_num,
                'p', 'P' => {},
                else => {
                    id = .pp_num;
                    break;
                },
            },
        }
    } else if (self.index == self.buf.len) {
        // The buffer ended while a token was in progress (or before one
        // started): finish according to the state that was active.
        switch (state) {
            .start => {},
            .line_comment => if (self.langopts.preserve_comments) {
                id = .comment;
            },
            .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
            .extended_identifier => id = .extended_identifier,

            .period2 => {
                self.index -= 1;
                id = .period;
            },

            .multi_line_comment,
            .multi_line_comment_asterisk,
            => id = .unterminated_comment,

            .char_escape_sequence, .char_literal, .char_literal_start => id = .unterminated_char_literal,
            .string_escape_sequence, .string_literal => id = .unterminated_string_literal,

            .whitespace => id = .whitespace,
            .multi_line_comment_done => id = .whitespace,

            .equal => id = .equal,
            .bang => id = .bang,
            .minus => id = .minus,
            .slash => id = .slash,
            .ampersand => id = .ampersand,
            .hash => id = .hash,
            .period => id = .period,
            .pipe => id = .pipe,
            .angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
            .angle_bracket_right => id = .angle_bracket_right,
            .angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
            .angle_bracket_left => id = .angle_bracket_left,
            .plus => id = .plus,
            .colon => id = .colon,
            .percent => id = .percent,
            .caret => id = .caret,
            .asterisk => id = .asterisk,
            .hash_digraph => id = .hash,
            .hash_hash_digraph_partial => {
                id = .hash;
                self.index -= 1; // re-tokenize the percent
            },
            .pp_num, .pp_num_exponent => id = .pp_num,
            // A `'` that ends the buffer is not followed by a digit, so it
            // is not a separator. Give it back exactly like the in-loop
            // path does, so the number token never ends with a quote.
            .pp_num_digit_separator => {
                self.index -= 1;
                id = .pp_num;
            },
        }
    }

    return .{
        .id = id,
        .start = start,
        .end = self.index,
        .line = self.line,
        .source = self.source,
    };
}

/// Returns the next token that is neither `.whitespace` nor `.comment`,
/// discarding any such tokens that come before it.
pub fn nextNoWS(self: *Tokenizer) Token {
    while (true) {
        const tok = self.next();
        switch (tok.id) {
            .whitespace, .comment => continue,
            else => return tok,
        }
    }
}

/// Returns the next token that is not `.whitespace`. Unlike `nextNoWS`,
/// `.comment` tokens are not skipped and are returned to the caller.
pub fn nextNoWSComments(self: *Tokenizer) Token {
    while (true) {
        const tok = self.next();
        if (tok.id != .whitespace) return tok;
    }
}

/// Try to tokenize a '::' even if not supported by the current language standard.
///
/// Fetches the next token with `nextNoWS`. If that token is a `.colon` and the byte
/// immediately following it in `buf` is another ':', the second ':' is consumed and the
/// token is returned as `.colon_colon` covering both characters. Any other token is
/// returned unchanged.
pub fn colonColon(self: *Tokenizer) Token {
    var tok = self.nextNoWS();
    if (tok.id != .colon) return tok;
    if (self.index >= self.buf.len or self.buf[self.index] != ':') return tok;

    self.index += 1;
    tok.id = .colon_colon;
    // Keep the token span in sync with the consumed input; otherwise `end` would still
    // point just past the first ':' and the token would cover only one of the two bytes.
    tok.end = self.index;
    return tok;
}

test "operators" {
    // Operator and punctuator tokens, several per source line.
    // Each input line contributes its tokens followed by one `.nl`.
    const input =
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\  <<= > >= >> >>= ~ # ##
        \\
    ;
    const expected = [_]Token.Id{
        // ! != | || |= = ==
        .bang,
        .bang_equal,
        .pipe,
        .pipe_pipe,
        .pipe_equal,
        .equal,
        .equal_equal,
        .nl,
        // ( ) { } [ ] . .. ...   (`..` is two separate periods)
        .l_paren,
        .r_paren,
        .l_brace,
        .r_brace,
        .l_bracket,
        .r_bracket,
        .period,
        .period,
        .period,
        .ellipsis,
        .nl,
        // ^ ^= + ++ += - -- -=
        .caret,
        .caret_equal,
        .plus,
        .plus_plus,
        .plus_equal,
        .minus,
        .minus_minus,
        .minus_equal,
        .nl,
        // * *= % %= -> : ; / /=
        .asterisk,
        .asterisk_equal,
        .percent,
        .percent_equal,
        .arrow,
        .colon,
        .semicolon,
        .slash,
        .slash_equal,
        .nl,
        // , & && &= ? < <= <<
        .comma,
        .ampersand,
        .ampersand_ampersand,
        .ampersand_equal,
        .question_mark,
        .angle_bracket_left,
        .angle_bracket_left_equal,
        .angle_bracket_angle_bracket_left,
        .nl,
        // <<= > >= >> >>= ~ # ##
        .angle_bracket_angle_bracket_left_equal,
        .angle_bracket_right,
        .angle_bracket_right_equal,
        .angle_bracket_angle_bracket_right,
        .angle_bracket_angle_bracket_right_equal,
        .tilde,
        .hash,
        .hash_hash,
        .nl,
    };
    try expectTokens(input, &expected);
}

test "keywords" {
    // Keywords recognized under the default language options, one group per source line.
    const input =
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    ;
    const expected = [_]Token.Id{
        // auto __auto_type break case char const continue default do
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .nl,
        // double else enum extern float for goto if int
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .nl,
        // long register return short signed sizeof static
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .nl,
        // struct switch typedef union unsigned void volatile
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .nl,
        // while _Bool _Complex _Imaginary inline restrict _Alignas
        .keyword_while,
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        .keyword_alignas,
        .nl,
        // _Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        .nl,
        // __attribute __attribute__
        .keyword_attribute1,
        .keyword_attribute2,
        .nl,
    };
    try expectTokens(input, &expected);
}

test "preprocessor keywords" {
    const input =
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    ;
    // Directive keyword expected on each input line, in order.
    const directives = [_]Token.Id{
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_error,
        .keyword_pragma,
    };

    // Every line tokenizes as `#`, the directive keyword, then the newline.
    var expected: [directives.len * 3]Token.Id = undefined;
    for (directives, 0..) |directive, line_no| {
        expected[line_no * 3] = .hash;
        expected[line_no * 3 + 1] = directive;
        expected[line_no * 3 + 2] = .nl;
    }
    try expectTokens(input, &expected);
}

test "line continuation" {
    // A backslash directly before a newline joins the two physical lines, so no `.nl`
    // token is expected at those positions.
    const input =
        \\#define foo \
        \\  bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    ;
    const expected = [_]Token.Id{
        // `#define foo \` + `  bar`
        .hash,
        .keyword_define,
        .identifier,
        .identifier,
        .nl,
        // `"foo\` + ` bar"`: the continuation sits inside a single string literal
        .string_literal,
        .nl,
        // `#define "foo"` with no continuation, so the line ends here
        .hash,
        .keyword_define,
        .string_literal,
        .nl,
        // ` "bar"` on its own line
        .string_literal,
        .nl,
        // `#define "foo" \` + ` "bar"` (input ends without a trailing newline)
        .hash,
        .keyword_define,
        .string_literal,
        .string_literal,
    };
    try expectTokens(input, &expected);
}

test "string prefix" {
    // One literal per line: string literals first, then character literals,
    // covering the unprefixed form and the `u`, `u8`, `U` and `L` prefixes.
    const input =
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    ;
    const expected = [_]Token.Id{
        // "foo"
        .string_literal,
        .nl,
        // u"foo"
        .string_literal_utf_16,
        .nl,
        // u8"foo"
        .string_literal_utf_8,
        .nl,
        // U"foo"
        .string_literal_utf_32,
        .nl,
        // L"foo"
        .string_literal_wide,
        .nl,
        // 'foo'
        .char_literal,
        .nl,
        // u8'A'
        .char_literal_utf_8,
        .nl,
        // u'foo'
        .char_literal_utf_16,
        .nl,
        // U'foo'
        .char_literal_utf_32,
        .nl,
        // L'foo'
        .char_literal_wide,
        .nl,
    };
    try expectTokens(input, &expected);
}

test "num suffixes" {
    // Every numeric literal, whatever its suffix, is expected to come out as a single `.pp_num`.
    const input =
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    ;
    const expected = [_]Token.Id{
        // 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 0l 0lu 0ll 0llu 0
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 1u 1ul 1ull 1
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 1.0i 1.0I
        .pp_num,
        .pp_num,
        .nl,
        // 1.0if 1.0If 1.0fi 1.0fI
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        // 1.0il 1.0Il 1.0li 1.0lI
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
    };
    try expectTokens(input, &expected);
}

test "comments" {
    // With the default language options the `//foo` comment yields no token of its own;
    // the first expected token is the newline that ends its line.
    const default_input =
        \\//foo
        \\#foo
    ;
    try expectTokens(default_input, &.{ .nl, .hash, .identifier });

    // With `preserve_comments` enabled each comment is expected as a `.comment` token.
    const preserved_input =
        \\//foo
        \\void
        \\//bar
    ;
    const preserved_expected = [_]Token.Id{
        .comment,
        .nl,
        .keyword_void,
        .nl,
        .comment,
    };
    const preserve: LangOpts = .{ .preserve_comments = true };
    try expectTokensExtra(preserved_input, &preserved_expected, preserve);
}

test "extended identifiers" {
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    const cases = [_]Case{
        // Identifiers made of (or containing) non-basic characters, including ones that
        // begin like a string/char literal prefix.
        .{ .input = "𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "u𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "u8𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "U𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        .{ .input = "L𝓪𝓻𝓸𝓬𝓬", .expected = &.{.extended_identifier} },
        // An extended character after a number or periods starts a new token.
        .{ .input = "1™", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "1.™", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "..™", .expected = &.{ .period, .period, .extended_identifier } },
        .{ .input = "0™", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0b\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0b0\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "01\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "010\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0x\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "0x0\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        // Inside a string literal the extended character stays part of the literal.
        .{ .input = "\"\\0\u{E0000}\"", .expected = &.{.string_literal} },
        .{ .input = "\"\\x\u{E0000}\"", .expected = &.{.string_literal} },
        .{ .input = "\"\\u\u{E0000}\"", .expected = &.{.string_literal} },
        // Exponent forms.
        .{ .input = "1e\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
        .{ .input = "1e1\u{E0000}", .expected = &.{ .pp_num, .extended_identifier } },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.expected);
    }
}

test "digraphs" {
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    const cases = [_]Case{
        // Each digraph is expected as the token of the punctuator it stands for.
        .{
            .input = "%:<::><%%>%:%:",
            .expected = &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash },
        },
        // The same characters inside a string literal are left alone.
        .{
            .input = "\"%:<::><%%>%:%:\"",
            .expected = &.{.string_literal},
        },
        // `%:%` not followed by `:` is `#` and then a separately tokenized `%`.
        .{
            .input = "%:%42 %:%",
            .expected = &.{ .hash, .percent, .pp_num, .hash, .percent },
        },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.expected);
    }
}

test "C23 keywords" {
    // Tokenized with the language standard set to C23.
    const input = "true false alignas alignof bool static_assert thread_local nullptr typeof_unqual";
    const expected = [_]Token.Id{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
        .keyword_typeof_unqual,
    };
    const c23_opts: LangOpts = .{ .standard = .c23 };
    try expectTokensExtra(input, &expected, c23_opts);
}

test "Universal character names" {
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    // Cases whose input ends in a space list `.eof` explicitly: `expectTokensExtra` skips
    // whitespace only while matching expected tokens, not in its final end-of-input check.
    const cases = [_]Case{
        // Backslash at the start of the input.
        .{ .input = "\\", .expected = &.{.invalid} },
        .{ .input = "\\g", .expected = &.{ .invalid, .identifier } },
        .{ .input = "\\u", .expected = &.{ .incomplete_ucn, .identifier } },
        .{ .input = "\\ua", .expected = &.{ .incomplete_ucn, .identifier } },
        .{ .input = "\\U9", .expected = &.{ .incomplete_ucn, .identifier } },
        .{ .input = "\\ug", .expected = &.{ .incomplete_ucn, .identifier } },
        .{ .input = "\\uag", .expected = &.{ .incomplete_ucn, .identifier } },

        // Same inputs followed by a space.
        .{ .input = "\\ ", .expected = &.{ .invalid, .eof } },
        .{ .input = "\\g ", .expected = &.{ .invalid, .identifier, .eof } },
        .{ .input = "\\u ", .expected = &.{ .incomplete_ucn, .identifier, .eof } },
        .{ .input = "\\ua ", .expected = &.{ .incomplete_ucn, .identifier, .eof } },
        .{ .input = "\\U9 ", .expected = &.{ .incomplete_ucn, .identifier, .eof } },
        .{ .input = "\\ug ", .expected = &.{ .incomplete_ucn, .identifier, .eof } },
        .{ .input = "\\uag ", .expected = &.{ .incomplete_ucn, .identifier, .eof } },

        // Backslash directly after an identifier.
        .{ .input = "a\\", .expected = &.{ .identifier, .invalid } },
        .{ .input = "a\\g", .expected = &.{ .identifier, .invalid, .identifier } },
        .{ .input = "a\\u", .expected = &.{ .identifier, .incomplete_ucn, .identifier } },
        .{ .input = "a\\ua", .expected = &.{ .identifier, .incomplete_ucn, .identifier } },
        .{ .input = "a\\U9", .expected = &.{ .identifier, .incomplete_ucn, .identifier } },
        .{ .input = "a\\ug", .expected = &.{ .identifier, .incomplete_ucn, .identifier } },
        .{ .input = "a\\uag", .expected = &.{ .identifier, .incomplete_ucn, .identifier } },

        // After an identifier and followed by a space.
        .{ .input = "a\\ ", .expected = &.{ .identifier, .invalid, .eof } },
        .{ .input = "a\\g ", .expected = &.{ .identifier, .invalid, .identifier, .eof } },
        .{ .input = "a\\u ", .expected = &.{ .identifier, .incomplete_ucn, .identifier, .eof } },
        .{ .input = "a\\ua ", .expected = &.{ .identifier, .incomplete_ucn, .identifier, .eof } },
        .{ .input = "a\\U9 ", .expected = &.{ .identifier, .incomplete_ucn, .identifier, .eof } },
        .{ .input = "a\\ug ", .expected = &.{ .identifier, .incomplete_ucn, .identifier, .eof } },
        .{ .input = "a\\uag ", .expected = &.{ .identifier, .incomplete_ucn, .identifier, .eof } },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.expected);
    }
}

test "Tokenizer fuzz test" {
    const Harness = struct {
        /// Tokenizes `input_bytes` until `.eof`, failing if any non-EOF token is produced
        /// without `tokenizer.index` strictly increasing.
        fn testOne(_: @This(), input_bytes: []const u8) anyerror!void {
            var arena: std.heap.ArenaAllocator = .init(std.testing.allocator);
            defer arena.deinit();
            var comp = Compilation.init(std.testing.allocator, arena.allocator(), undefined, std.fs.cwd());
            defer comp.deinit();

            const fuzz_source = try comp.addSourceFromBuffer("fuzz.c", input_bytes);
            var tokenizer: Tokenizer = .{
                .buf = fuzz_source.buf,
                .source = fuzz_source.id,
                .langopts = comp.langopts,
            };

            // `index_before` always holds the position prior to the most recent `next` call.
            var index_before = tokenizer.index;
            while (tokenizer.next().id != .eof) : (index_before = tokenizer.index) {
                // ensure that the tokenizer always makes progress
                try std.testing.expect(index_before < tokenizer.index);
            }
        }
    };
    return std.testing.fuzz(Harness{}, Harness.testOne, .{});
}

/// Tokenizes `contents` and checks that the resulting token ids match `expected_tokens`.
///
/// `.whitespace` tokens are skipped while matching. After the last expected id has been
/// matched, the very next token must be `.eof`. When `langopts` is non-null it replaces
/// the compilation's default language options before tokenizing.
///
/// Returns `error.TokensDoNotEqual` (after printing the mismatch) on the first id that differs.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, langopts: ?LangOpts) !void {
    var arena: std.heap.ArenaAllocator = .init(std.testing.allocator);
    defer arena.deinit();
    var comp = Compilation.init(std.testing.allocator, arena.allocator(), undefined, std.fs.cwd());
    defer comp.deinit();
    if (langopts) |provided| comp.langopts = provided;

    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer: Tokenizer = .{
        .buf = source.buf,
        .source = source.id,
        .langopts = comp.langopts,
    };

    for (expected_tokens) |want| {
        // Advance to the next token that is not whitespace.
        var got = tokenizer.next();
        while (got.id == .whitespace) got = tokenizer.next();

        if (got.id != want) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(want), @tagName(got.id) });
            return error.TokensDoNotEqual;
        }
    }

    // Whitespace is not skipped here: the token right after the last match must be EOF.
    try std.testing.expect(tokenizer.next().id == .eof);
}

/// Same as `expectTokensExtra`, using the compilation's default language options.
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    try expectTokensExtra(contents, expected_tokens, null);
}