const std = @import("std");
const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const Source = @import("Source.zig");
const LangOpts = @import("LangOpts.zig");

const Tokenizer = @This();

/// A lexed token: its kind (`id`), the source it belongs to, the half-open
/// range `start..end` it occupies and the line recorded when it was produced.
pub const Token = struct {
    id: Id,
    source: Source.Id,
    start: u32 = 0,
    end: u32 = 0,
    line: u32 = 0,

    /// Every kind of token produced by the tokenizer or synthesized by the
    /// preprocessor. The declaration order fixes the `u8` tag values, so new
    /// variants must not be inserted carelessly if the values are persisted.
    pub const Id = enum(u8) {
        invalid,
        nl,
        whitespace,
        eof,
        /// identifier containing solely basic character set characters
        identifier,
        /// identifier with at least one extended character
        extended_identifier,

        // string literals with prefixes
        string_literal,
        string_literal_utf_16,
        string_literal_utf_8,
        string_literal_utf_32,
        string_literal_wide,

        // only generated by preprocessor
        macro_string,

        // char literals with prefixes
        char_literal,
        char_literal_utf_8,
        char_literal_utf_16,
        char_literal_utf_32,
        char_literal_wide,

        /// Integer literal tokens generated by preprocessor.
        one,
        zero,

        bang,
        bang_equal,
        pipe,
        pipe_pipe,
        pipe_equal,
        equal,
        equal_equal,
        l_paren,
        r_paren,
        l_brace,
        r_brace,
        l_bracket,
        r_bracket,
        period,
        ellipsis,
        caret,
        caret_equal,
        plus,
        plus_plus,
        plus_equal,
        minus,
        minus_minus,
        minus_equal,
        asterisk,
        asterisk_equal,
        percent,
        percent_equal,
        arrow,
        colon,
        colon_colon,
        semicolon,
        slash,
        slash_equal,
        comma,
        ampersand,
        ampersand_ampersand,
        ampersand_equal,
        question_mark,
        angle_bracket_left,
        angle_bracket_left_equal,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_equal,
        angle_bracket_right,
        angle_bracket_right_equal,
        angle_bracket_angle_bracket_right,
        angle_bracket_angle_bracket_right_equal,
        tilde,
        hash,
        hash_hash,

        /// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
        macro_param,
        /// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
        macro_param_no_expand,
        /// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
        stringify_param,
        /// Same as stringify_param, but for var args
        stringify_va_args,
        /// Special macro whitespace, always equal to a single space
        macro_ws,
        /// Special token for implementing __has_attribute
        macro_param_has_attribute,
        /// Special token for implementing __has_declspec_attribute
        macro_param_has_declspec_attribute,
        /// Special token for implementing __has_warning
        macro_param_has_warning,
        /// Special token for implementing __has_feature
        macro_param_has_feature,
        /// Special token for implementing __has_extension
        macro_param_has_extension,
        /// Special token for implementing __has_builtin
        macro_param_has_builtin,
        /// Special token for implementing __has_include
        macro_param_has_include,
        /// Special token for implementing __has_include_next
        macro_param_has_include_next,
        /// Special token for implementing __is_identifier
        macro_param_is_identifier,
        /// Special token for implementing __FILE__
        macro_file,
        /// Special token for implementing __LINE__
        macro_line,
        /// Special token for implementing __COUNTER__
        macro_counter,
        /// Special token for implementing _Pragma
        macro_param_pragma_operator,

        /// Special identifier for implementing __func__
        macro_func,
        /// Special identifier for implementing __FUNCTION__
        macro_function,
        /// Special identifier for implementing __PRETTY_FUNCTION__
        macro_pretty_func,

        keyword_auto,
        keyword_auto_type,
        keyword_break,
        keyword_case,
        keyword_char,
        keyword_const,
        keyword_continue,
        keyword_default,
        keyword_do,
        keyword_double,
        keyword_else,
        keyword_enum,
        keyword_extern,
        keyword_float,
        keyword_for,
        keyword_goto,
        keyword_if,
        keyword_int,
        keyword_long,
        keyword_register,
        keyword_return,
        keyword_short,
        keyword_signed,
        keyword_sizeof,
        keyword_static,
        keyword_struct,
        keyword_switch,
        keyword_typedef,
        keyword_typeof1,
        keyword_typeof2,
        keyword_union,
        keyword_unsigned,
        keyword_void,
        keyword_volatile,
        keyword_while,

        // ISO C99
        keyword_bool,
        keyword_complex,
        keyword_imaginary,
        keyword_inline,
        keyword_restrict,

        // ISO C11
        keyword_alignas,
        keyword_alignof,
        keyword_atomic,
        keyword_generic,
        keyword_noreturn,
        keyword_static_assert,
        keyword_thread_local,

        // ISO C23
        keyword_bit_int,
        keyword_c23_alignas,
        keyword_c23_alignof,
        keyword_c23_bool,
        keyword_c23_static_assert,
        keyword_c23_thread_local,
        keyword_constexpr,
        keyword_true,
        keyword_false,
        keyword_nullptr,

        // Preprocessor directives
        keyword_include,
        keyword_include_next,
        keyword_embed,
        keyword_define,
        keyword_defined,
        keyword_undef,
        keyword_ifdef,
        keyword_ifndef,
        keyword_elif,
        keyword_elifdef,
        keyword_elifndef,
        keyword_endif,
        keyword_error,
        keyword_warning,
        keyword_pragma,
        keyword_line,
        keyword_va_args,

        // gcc keywords
        keyword_const1,
        keyword_const2,
        keyword_inline1,
        keyword_inline2,
        keyword_volatile1,
        keyword_volatile2,
        keyword_restrict1,
        keyword_restrict2,
        keyword_alignof1,
        keyword_alignof2,
        keyword_typeof,
        keyword_attribute1,
        keyword_attribute2,
        keyword_extension,
        keyword_asm,
        keyword_asm1,
        keyword_asm2,
        keyword_float80,
        keyword_float128,
        keyword_int128,
        keyword_imag1,
        keyword_imag2,
        keyword_real1,
        keyword_real2,
        keyword_float16,

        // clang keywords
        keyword_fp16,

        // ms keywords
        keyword_declspec,
        keyword_int64,
        keyword_int64_2,
        keyword_int32,
        keyword_int32_2,
        keyword_int16,
        keyword_int16_2,
        keyword_int8,
        keyword_int8_2,
        keyword_stdcall,
        keyword_stdcall2,
        keyword_thiscall,
        keyword_thiscall2,
        keyword_vectorcall,
        keyword_vectorcall2,

        // builtins that require special parsing
        builtin_choose_expr,
        builtin_va_arg,
        builtin_offsetof,
        builtin_bitoffsetof,
        builtin_types_compatible_p,

        /// Generated by #embed directive
        /// Decimal value with no prefix or suffix
        embed_byte,

        /// preprocessor number
        /// An optional period, followed by a digit 0-9, followed by any number of letters
        /// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
        pp_num,

        /// preprocessor placemarker token
        /// generated if `##` is used with a zero-token argument
        /// removed after substitution, so the parser should never see this
        /// See C99 6.10.3.3.2
        placemarker,

        /// Virtual linemarker token output from preprocessor to indicate start of a new include
        include_start,

        /// Virtual linemarker token output from preprocessor to indicate resuming a file after
        /// completion of the preceding #include
        include_resume,

        /// A comment token if asked to preserve comments.
        comment,

        /// Return true if token is identifier or keyword.
        /// The `__func__`-style identifiers and the `__builtin_*` tokens that
        /// need special parsing are included as well.
        pub fn isMacroIdentifier(id: Id) bool {
            switch (id) {
                .keyword_include,
                .keyword_include_next,
                .keyword_embed,
                .keyword_define,
                .keyword_defined,
                .keyword_undef,
                .keyword_ifdef,
                .keyword_ifndef,
                .keyword_elif,
                .keyword_elifdef,
                .keyword_elifndef,
                .keyword_endif,
                .keyword_error,
                .keyword_warning,
                .keyword_pragma,
                .keyword_line,
                .keyword_va_args,
                .macro_func,
                .macro_function,
                .macro_pretty_func,
                .keyword_auto,
                .keyword_auto_type,
                .keyword_break,
                .keyword_case,
                .keyword_char,
                .keyword_const,
                .keyword_continue,
                .keyword_default,
                .keyword_do,
                .keyword_double,
                .keyword_else,
                .keyword_enum,
                .keyword_extern,
                .keyword_float,
                .keyword_for,
                .keyword_goto,
                .keyword_if,
                .keyword_int,
                .keyword_long,
                .keyword_register,
                .keyword_return,
                .keyword_short,
                .keyword_signed,
                .keyword_sizeof,
                .keyword_static,
                .keyword_struct,
                .keyword_switch,
                .keyword_typedef,
                .keyword_union,
                .keyword_unsigned,
                .keyword_void,
                .keyword_volatile,
                .keyword_while,
                .keyword_bool,
                .keyword_complex,
                .keyword_imaginary,
                .keyword_inline,
                .keyword_restrict,
                .keyword_alignas,
                .keyword_alignof,
                .keyword_atomic,
                .keyword_generic,
                .keyword_noreturn,
                .keyword_static_assert,
                .keyword_thread_local,
                .identifier,
                .extended_identifier,
                .keyword_typeof,
                .keyword_typeof1,
                .keyword_typeof2,
                .keyword_const1,
                .keyword_const2,
                .keyword_inline1,
                .keyword_inline2,
                .keyword_volatile1,
                .keyword_volatile2,
                .keyword_restrict1,
                .keyword_restrict2,
                .keyword_alignof1,
                .keyword_alignof2,
                .builtin_choose_expr,
                .builtin_va_arg,
                .builtin_offsetof,
                .builtin_bitoffsetof,
                .builtin_types_compatible_p,
                .keyword_attribute1,
                .keyword_attribute2,
                .keyword_extension,
                .keyword_asm,
                .keyword_asm1,
                .keyword_asm2,
                .keyword_float80,
                .keyword_float128,
                .keyword_int128,
                .keyword_imag1,
                .keyword_imag2,
                .keyword_real1,
                .keyword_real2,
                .keyword_float16,
                .keyword_fp16,
                .keyword_declspec,
                .keyword_int64,
                .keyword_int64_2,
                .keyword_int32,
                .keyword_int32_2,
                .keyword_int16,
                .keyword_int16_2,
                .keyword_int8,
                .keyword_int8_2,
                .keyword_stdcall,
                .keyword_stdcall2,
                .keyword_thiscall,
                .keyword_thiscall2,
                .keyword_vectorcall,
                .keyword_vectorcall2,
                .keyword_bit_int,
                .keyword_c23_alignas,
                .keyword_c23_alignof,
                .keyword_c23_bool,
                .keyword_c23_static_assert,
                .keyword_c23_thread_local,
                .keyword_constexpr,
                .keyword_true,
                .keyword_false,
                .keyword_nullptr,
                => return true,
                else => return false,
            }
        }

        /// Turn macro keywords into identifiers.
        /// `keyword_defined` is special since it should only turn into an identifier if
        /// we are *not* in an #if or #elif expression
        pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
            switch (id.*) {
                .keyword_include,
                .keyword_include_next,
                .keyword_embed,
                .keyword_define,
                .keyword_undef,
                .keyword_ifdef,
                .keyword_ifndef,
                .keyword_elif,
                .keyword_elifdef,
                .keyword_elifndef,
                .keyword_endif,
                .keyword_error,
                .keyword_warning,
                .keyword_pragma,
                .keyword_line,
                .keyword_va_args,
                => id.* = .identifier,
                .keyword_defined => if (defined_to_identifier) {
                    id.* = .identifier;
                },
                else => {},
            }
        }

        /// Same as `simplifyMacroKeywordExtra`, but leaves `keyword_defined`
        /// untouched.
        pub fn simplifyMacroKeyword(id: *Id) void {
            simplifyMacroKeywordExtra(id, false);
        }

        /// Returns the fixed spelling of `id`.
        /// Yields `null` for tokens whose text is not determined by the id
        /// alone (identifiers, literals, numbers, whitespace, comments,
        /// `.invalid`) and an empty string for `.nl`, `.eof` and the
        /// preprocessor-internal marker tokens.
        /// Must not be called with `.include_start` or `.include_resume`.
        pub fn lexeme(id: Id) ?[]const u8 {
            return switch (id) {
                .include_start,
                .include_resume,
                => unreachable,

                .invalid,
                .identifier,
                .extended_identifier,
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                .macro_string,
                .whitespace,
                .pp_num,
                .embed_byte,
                .comment,
                => null,

                .zero => "0",
                .one => "1",

                .nl,
                .eof,
                .macro_param,
                .macro_param_no_expand,
                .stringify_param,
                .stringify_va_args,
                .macro_param_has_attribute,
                .macro_param_has_declspec_attribute,
                .macro_param_has_warning,
                .macro_param_has_feature,
                .macro_param_has_extension,
                .macro_param_has_builtin,
                .macro_param_has_include,
                .macro_param_has_include_next,
                .macro_param_is_identifier,
                .macro_file,
                .macro_line,
                .macro_counter,
                .macro_param_pragma_operator,
                .placemarker,
                => "",
                .macro_ws => " ",

                .macro_func => "__func__",
                .macro_function => "__FUNCTION__",
                .macro_pretty_func => "__PRETTY_FUNCTION__",

                .bang => "!",
                .bang_equal => "!=",
                .pipe => "|",
                .pipe_pipe => "||",
                .pipe_equal => "|=",
                .equal => "=",
                .equal_equal => "==",
                .l_paren => "(",
                .r_paren => ")",
                .l_brace => "{",
                .r_brace => "}",
                .l_bracket => "[",
                .r_bracket => "]",
                .period => ".",
                .ellipsis => "...",
                .caret => "^",
                .caret_equal => "^=",
                .plus => "+",
                .plus_plus => "++",
                .plus_equal => "+=",
                .minus => "-",
                .minus_minus => "--",
                .minus_equal => "-=",
                .asterisk => "*",
                .asterisk_equal => "*=",
                .percent => "%",
                .percent_equal => "%=",
                .arrow => "->",
                .colon => ":",
                .colon_colon => "::",
                .semicolon => ";",
                .slash => "/",
                .slash_equal => "/=",
                .comma => ",",
                .ampersand => "&",
                .ampersand_ampersand => "&&",
                .ampersand_equal => "&=",
                .question_mark => "?",
                .angle_bracket_left => "<",
                .angle_bracket_left_equal => "<=",
                .angle_bracket_angle_bracket_left => "<<",
                .angle_bracket_angle_bracket_left_equal => "<<=",
                .angle_bracket_right => ">",
                .angle_bracket_right_equal => ">=",
                .angle_bracket_angle_bracket_right => ">>",
                .angle_bracket_angle_bracket_right_equal => ">>=",
                .tilde => "~",
                .hash => "#",
                .hash_hash => "##",

                .keyword_auto => "auto",
                .keyword_auto_type => "__auto_type",
                .keyword_break => "break",
                .keyword_case => "case",
                .keyword_char => "char",
                .keyword_const => "const",
                .keyword_continue => "continue",
                .keyword_default => "default",
                .keyword_do => "do",
                .keyword_double => "double",
                .keyword_else => "else",
                .keyword_enum => "enum",
                .keyword_extern => "extern",
                .keyword_float => "float",
                .keyword_for => "for",
                .keyword_goto => "goto",
                .keyword_if => "if",
                .keyword_int => "int",
                .keyword_long => "long",
                .keyword_register => "register",
                .keyword_return => "return",
                .keyword_short => "short",
                .keyword_signed => "signed",
                .keyword_sizeof => "sizeof",
                .keyword_static => "static",
                .keyword_struct => "struct",
                .keyword_switch => "switch",
                .keyword_typedef => "typedef",
                .keyword_typeof => "typeof",
                .keyword_union => "union",
                .keyword_unsigned => "unsigned",
                .keyword_void => "void",
                .keyword_volatile => "volatile",
                .keyword_while => "while",
                .keyword_bool => "_Bool",
                .keyword_complex => "_Complex",
                .keyword_imaginary => "_Imaginary",
                .keyword_inline => "inline",
                .keyword_restrict => "restrict",
                .keyword_alignas => "_Alignas",
                .keyword_alignof => "_Alignof",
                .keyword_atomic => "_Atomic",
                .keyword_generic => "_Generic",
                .keyword_noreturn => "_Noreturn",
                .keyword_static_assert => "_Static_assert",
                .keyword_thread_local => "_Thread_local",
                .keyword_bit_int => "_BitInt",
                .keyword_c23_alignas => "alignas",
                .keyword_c23_alignof => "alignof",
                .keyword_c23_bool => "bool",
                .keyword_c23_static_assert => "static_assert",
                .keyword_c23_thread_local => "thread_local",
                .keyword_constexpr => "constexpr",
                .keyword_true => "true",
                .keyword_false => "false",
                .keyword_nullptr => "nullptr",
                .keyword_include => "include",
                .keyword_include_next => "include_next",
                .keyword_embed => "embed",
                .keyword_define => "define",
                .keyword_defined => "defined",
                .keyword_undef => "undef",
                .keyword_ifdef => "ifdef",
                .keyword_ifndef => "ifndef",
                .keyword_elif => "elif",
                .keyword_elifdef => "elifdef",
                .keyword_elifndef => "elifndef",
                .keyword_endif => "endif",
                .keyword_error => "error",
                .keyword_warning => "warning",
                .keyword_pragma => "pragma",
                .keyword_line => "line",
                .keyword_va_args => "__VA_ARGS__",
                .keyword_const1 => "__const",
                .keyword_const2 => "__const__",
                .keyword_inline1 => "__inline",
                .keyword_inline2 => "__inline__",
                .keyword_volatile1 => "__volatile",
                .keyword_volatile2 => "__volatile__",
                .keyword_restrict1 => "__restrict",
                .keyword_restrict2 => "__restrict__",
                .keyword_alignof1 => "__alignof",
                .keyword_alignof2 => "__alignof__",
                .keyword_typeof1 => "__typeof",
                .keyword_typeof2 => "__typeof__",
                .builtin_choose_expr => "__builtin_choose_expr",
                .builtin_va_arg => "__builtin_va_arg",
                .builtin_offsetof => "__builtin_offsetof",
                .builtin_bitoffsetof => "__builtin_bitoffsetof",
                .builtin_types_compatible_p => "__builtin_types_compatible_p",
                .keyword_attribute1 => "__attribute",
                .keyword_attribute2 => "__attribute__",
                .keyword_extension => "__extension__",
                .keyword_asm => "asm",
                .keyword_asm1 => "__asm",
                .keyword_asm2 => "__asm__",
                .keyword_float80 => "__float80",
                // Must match the "__float128" entry in `all_kws`.
                .keyword_float128 => "__float128",
                .keyword_int128 => "__int128",
                .keyword_imag1 => "__imag",
                .keyword_imag2 => "__imag__",
                .keyword_real1 => "__real",
                .keyword_real2 => "__real__",
                .keyword_float16 => "_Float16",

                // clang keywords
                .keyword_fp16 => "__fp16",

                // ms keywords
                .keyword_declspec => "__declspec",
                .keyword_int64 => "__int64",
                .keyword_int64_2 => "_int64",
                .keyword_int32 => "__int32",
                .keyword_int32_2 => "_int32",
                .keyword_int16 => "__int16",
                .keyword_int16_2 => "_int16",
                .keyword_int8 => "__int8",
                .keyword_int8_2 => "_int8",
                .keyword_stdcall => "__stdcall",
                .keyword_stdcall2 => "_stdcall",
                .keyword_thiscall => "__thiscall",
                .keyword_thiscall2 => "_thiscall",
                .keyword_vectorcall => "__vectorcall",
                .keyword_vectorcall2 => "_vectorcall",
            };
        }

        /// Returns a human-readable description of `id`: a generic phrase for
        /// identifiers, string/character literals and numbers, and the exact
        /// lexeme for everything else.
        /// Must not be called with `.macro_string` or `.invalid`. Ids that are
        /// not listed here and for which `lexeme` returns `null` (`.whitespace`,
        /// `.comment`) fail the `.?` unwrap.
        pub fn symbol(id: Id) []const u8 {
            return switch (id) {
                .macro_string, .invalid => unreachable,
                .identifier,
                .extended_identifier,
                .macro_func,
                .macro_function,
                .macro_pretty_func,
                .builtin_choose_expr,
                .builtin_va_arg,
                .builtin_offsetof,
                .builtin_bitoffsetof,
                .builtin_types_compatible_p,
                => "an identifier",
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,
                => "a string literal",
                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                => "a character literal",
                // Lower-case like the other descriptions in this switch.
                .pp_num, .embed_byte => "a number",
                else => id.lexeme().?,
            };
        }

        /// tokens that can start an expression parsed by Preprocessor.expr
        /// Note that eof, r_paren, and string literals cannot actually start a
        /// preprocessor expression, but we include them here so that a nicer
        /// error message can be generated by the parser.
        pub fn validPreprocessorExprStart(id: Id) bool {
            return switch (id) {
                .eof,
                .r_paren,
                .string_literal,
                .string_literal_utf_16,
                .string_literal_utf_8,
                .string_literal_utf_32,
                .string_literal_wide,

                .char_literal,
                .char_literal_utf_8,
                .char_literal_utf_16,
                .char_literal_utf_32,
                .char_literal_wide,
                .l_paren,
                .plus,
                .minus,
                .tilde,
                .bang,
                .identifier,
                .extended_identifier,
                .keyword_defined,
                .one,
                .zero,
                .pp_num,
                .keyword_true,
                .keyword_false,
                => true,
                else => false,
            };
        }

        /// Returns true when `id` is one of the punctuators that has a digraph
        /// spelling (`[`, `]`, `{`, `}`, `#`, `##`) and the language options of
        /// `comp` report digraphs as enabled.
        pub fn allowsDigraphs(id: Id, comp: *const Compilation) bool {
            return switch (id) {
                .l_bracket,
                .r_bracket,
                .l_brace,
                .r_brace,
                .hash,
                .hash_hash,
                => comp.langopts.hasDigraphs(),
                else => false,
            };
        }

        /// Returns true for the tokens accepted right after the asm keyword of
        /// a GCC-style asm statement: the `volatile`/`inline` spellings, `goto`
        /// and the opening parenthesis.
        pub fn canOpenGCCAsmStmt(id: Id) bool {
            return switch (id) {
                .keyword_volatile, .keyword_volatile1, .keyword_volatile2, .keyword_inline, .keyword_inline1, .keyword_inline2, .keyword_goto, .l_paren => true,
                else => false,
            };
        }

        /// Returns true for string literals of any prefix. `.macro_string` is
        /// not considered a string literal here.
        pub fn isStringLiteral(id: Id) bool {
            return switch (id) {
                .string_literal, .string_literal_utf_16, .string_literal_utf_8, .string_literal_utf_32, .string_literal_wide => true,
                else => false,
            };
        }
    };

    /// Classify the identifier spelling `str` as a keyword or `.identifier`.
    /// Spellings that are only keywords in some language modes are returned
    /// as `.identifier` unless the options of `comp` enable them.
    ///
    /// double underscore and underscore + capital letter identifiers
    /// belong to the implementation namespace, so we always convert them
    /// to keywords.
    pub fn getTokenId(comp: *const Compilation, str: []const u8) Token.Id {
        const kw = all_kws.get(str) orelse return .identifier;
        const standard = comp.langopts.standard;
        return switch (kw) {
            .keyword_inline => if (standard.isGNU() or standard.atLeast(.c99)) kw else .identifier,
            .keyword_restrict => if (standard.atLeast(.c99)) kw else .identifier,
            .keyword_typeof => if (standard.isGNU() or standard.atLeast(.c2x)) kw else .identifier,
            .keyword_asm => if (standard.isGNU()) kw else .identifier,
            .keyword_declspec => if (comp.langopts.declspec_attrs) kw else .identifier,

            .keyword_c23_alignas,
            .keyword_c23_alignof,
            .keyword_c23_bool,
            .keyword_c23_static_assert,
            .keyword_c23_thread_local,
            .keyword_constexpr,
            .keyword_true,
            .keyword_false,
            .keyword_nullptr,
            .keyword_elifdef,
            .keyword_elifndef,
            => if (standard.atLeast(.c2x)) kw else .identifier,

            .keyword_int64,
            .keyword_int64_2,
            .keyword_int32,
            .keyword_int32_2,
            .keyword_int16,
            .keyword_int16_2,
            .keyword_int8,
            .keyword_int8_2,
            .keyword_stdcall2,
            .keyword_thiscall2,
            .keyword_vectorcall2,
            => if (comp.langopts.ms_extensions) kw else .identifier,
            else => kw,
        };
    }

    /// Compile-time table from keyword spelling to token id, consulted by
    /// `getTokenId`. The labeled block in the first entry exists only to raise
    /// the comptime branch quota before the table is built.
    const all_kws = std.ComptimeStringMap(Id, .{
        .{ "auto", auto: {
            @setEvalBranchQuota(3000);
            break :auto .keyword_auto;
        } },
        .{ "break", .keyword_break },
        .{ "case", .keyword_case },
        .{ "char", .keyword_char },
        .{ "const", .keyword_const },
        .{ "continue", .keyword_continue },
        .{ "default", .keyword_default },
        .{ "do", .keyword_do },
        .{ "double", .keyword_double },
        .{ "else", .keyword_else },
        .{ "enum", .keyword_enum },
        .{ "extern", .keyword_extern },
        .{ "float", .keyword_float },
        .{ "for", .keyword_for },
        .{ "goto", .keyword_goto },
        .{ "if", .keyword_if },
        .{ "int", .keyword_int },
        .{ "long", .keyword_long },
        .{ "register", .keyword_register },
        .{ "return", .keyword_return },
        .{ "short", .keyword_short },
        .{ "signed", .keyword_signed },
        .{ "sizeof", .keyword_sizeof },
        .{ "static", .keyword_static },
        .{ "struct", .keyword_struct },
        .{ "switch", .keyword_switch },
        .{ "typedef", .keyword_typedef },
        .{ "union", .keyword_union },
        .{ "unsigned", .keyword_unsigned },
        .{ "void", .keyword_void },
        .{ "volatile", .keyword_volatile },
        .{ "while", .keyword_while },
        .{ "__typeof__", .keyword_typeof2 },
        .{ "__typeof", .keyword_typeof1 },

        // ISO C99
        .{ "_Bool", .keyword_bool },
        .{ "_Complex", .keyword_complex },
        .{ "_Imaginary", .keyword_imaginary },
        .{ "inline", .keyword_inline },
        .{ "restrict", .keyword_restrict },

        // ISO C11
        .{ "_Alignas", .keyword_alignas },
        .{ "_Alignof", .keyword_alignof },
        .{ "_Atomic", .keyword_atomic },
        .{ "_Generic", .keyword_generic },
        .{ "_Noreturn", .keyword_noreturn },
        .{ "_Static_assert", .keyword_static_assert },
        .{ "_Thread_local", .keyword_thread_local },

        // ISO C23
        .{ "_BitInt", .keyword_bit_int },
        .{ "alignas", .keyword_c23_alignas },
        .{ "alignof", .keyword_c23_alignof },
        .{ "bool", .keyword_c23_bool },
        .{ "static_assert", .keyword_c23_static_assert },
        .{ "thread_local", .keyword_c23_thread_local },
        .{ "constexpr", .keyword_constexpr },
        .{ "true", .keyword_true },
        .{ "false", .keyword_false },
        .{ "nullptr", .keyword_nullptr },

        // Preprocessor directives
        .{ "include", .keyword_include },
        .{ "include_next", .keyword_include_next },
        .{ "embed", .keyword_embed },
        .{ "define", .keyword_define },
        .{ "defined", .keyword_defined },
        .{ "undef", .keyword_undef },
        .{ "ifdef", .keyword_ifdef },
        .{ "ifndef", .keyword_ifndef },
        .{ "elif", .keyword_elif },
        .{ "elifdef", .keyword_elifdef },
        .{ "elifndef", .keyword_elifndef },
        .{ "endif", .keyword_endif },
        .{ "error", .keyword_error },
        .{ "warning", .keyword_warning },
        .{ "pragma", .keyword_pragma },
        .{ "line", .keyword_line },
        .{ "__VA_ARGS__", .keyword_va_args },
        .{ "__func__", .macro_func },
        .{ "__FUNCTION__", .macro_function },
        .{ "__PRETTY_FUNCTION__", .macro_pretty_func },

        // gcc keywords
        .{ "__auto_type", .keyword_auto_type },
        .{ "__const", .keyword_const1 },
        .{ "__const__", .keyword_const2 },
        .{ "__inline", .keyword_inline1 },
        .{ "__inline__", .keyword_inline2 },
        .{ "__volatile", .keyword_volatile1 },
        .{ "__volatile__", .keyword_volatile2 },
        .{ "__restrict", .keyword_restrict1 },
        .{ "__restrict__", .keyword_restrict2 },
        .{ "__alignof", .keyword_alignof1 },
        .{ "__alignof__", .keyword_alignof2 },
        .{ "typeof", .keyword_typeof },
        .{ "__attribute", .keyword_attribute1 },
        .{ "__attribute__", .keyword_attribute2 },
        .{ "__extension__", .keyword_extension },
        .{ "asm", .keyword_asm },
        .{ "__asm", .keyword_asm1 },
        .{ "__asm__", .keyword_asm2 },
        .{ "__float80", .keyword_float80 },
        .{ "__float128", .keyword_float128 },
        .{ "__int128", .keyword_int128 },
        .{ "__imag", .keyword_imag1 },
        .{ "__imag__", .keyword_imag2 },
        .{ "__real", .keyword_real1 },
        .{ "__real__", .keyword_real2 },
        .{ "_Float16", .keyword_float16 },

        // clang keywords
        .{ "__fp16", .keyword_fp16 },

        // ms keywords
        .{ "__declspec", .keyword_declspec },
        .{ "__int64", .keyword_int64 },
        .{ "_int64", .keyword_int64_2 },
        .{ "__int32", .keyword_int32 },
        .{ "_int32", .keyword_int32_2 },
        .{ "__int16", .keyword_int16 },
        .{ "_int16", .keyword_int16_2 },
        .{ "__int8", .keyword_int8 },
        .{ "_int8", .keyword_int8_2 },
        .{ "__stdcall", .keyword_stdcall },
        .{ "_stdcall", .keyword_stdcall2 },
        .{ "__thiscall", .keyword_thiscall },
        .{ "_thiscall", .keyword_thiscall2 },
        .{ "__vectorcall", .keyword_vectorcall },
        .{ "_vectorcall", .keyword_vectorcall2 },

        // builtins that require special parsing
        .{ "__builtin_choose_expr", .builtin_choose_expr },
        .{ "__builtin_va_arg", .builtin_va_arg },
        .{ "__builtin_offsetof", .builtin_offsetof },
        .{ "__builtin_bitoffsetof", .builtin_bitoffsetof },
        .{ "__builtin_types_compatible_p", .builtin_types_compatible_p },
    });
};

buf: []const u8,
index: u32 = 0,
source: Source.Id,
comp: *const Compilation,
line: u32 = 1,

/// Used to parse include strings with Windows style paths.
path_escapes: bool = false, pub fn next(self: *Tokenizer) Token { var state: enum { start, whitespace, u, u8, U, L, string_literal, path_escape, char_literal_start, char_literal, char_escape_sequence, escape_sequence, octal_escape, hex_escape, unicode_escape, identifier, extended_identifier, equal, bang, pipe, colon, percent, asterisk, plus, angle_bracket_left, angle_bracket_angle_bracket_left, angle_bracket_right, angle_bracket_angle_bracket_right, caret, period, period2, minus, slash, ampersand, hash, hash_digraph, hash_hash_digraph_partial, line_comment, multi_line_comment, multi_line_comment_asterisk, multi_line_comment_done, pp_num, pp_num_exponent, pp_num_digit_separator, } = .start; var start = self.index; var id: Token.Id = .eof; var return_state = state; var counter: u32 = 0; while (self.index < self.buf.len) : (self.index += 1) { const c = self.buf[self.index]; switch (state) { .start => switch (c) { '\n' => { id = .nl; self.index += 1; self.line += 1; break; }, '"' => { id = .string_literal; state = .string_literal; }, '\'' => { id = .char_literal; state = .char_literal_start; }, 'u' => state = .u, 'U' => state = .U, 'L' => state = .L, 'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier, '=' => state = .equal, '!' => state = .bang, '|' => state = .pipe, '(' => { id = .l_paren; self.index += 1; break; }, ')' => { id = .r_paren; self.index += 1; break; }, '[' => { id = .l_bracket; self.index += 1; break; }, ']' => { id = .r_bracket; self.index += 1; break; }, ';' => { id = .semicolon; self.index += 1; break; }, ',' => { id = .comma; self.index += 1; break; }, '?' 
=> { id = .question_mark; self.index += 1; break; }, ':' => state = .colon, '%' => state = .percent, '*' => state = .asterisk, '+' => state = .plus, '<' => state = .angle_bracket_left, '>' => state = .angle_bracket_right, '^' => state = .caret, '{' => { id = .l_brace; self.index += 1; break; }, '}' => { id = .r_brace; self.index += 1; break; }, '~' => { id = .tilde; self.index += 1; break; }, '.' => state = .period, '-' => state = .minus, '/' => state = .slash, '&' => state = .ampersand, '#' => state = .hash, '0'...'9' => state = .pp_num, '\t', '\x0B', '\x0C', ' ' => state = .whitespace, '$' => if (self.comp.langopts.dollars_in_identifiers) { state = .extended_identifier; } else { id = .invalid; self.index += 1; break; }, 0x1A => if (self.comp.langopts.ms_extensions) { id = .eof; break; } else { id = .invalid; self.index += 1; break; }, 0x80...0xFF => state = .extended_identifier, else => { id = .invalid; self.index += 1; break; }, }, .whitespace => switch (c) { '\t', '\x0B', '\x0C', ' ' => {}, else => { id = .whitespace; break; }, }, .u => switch (c) { '8' => { state = .u8; }, '\'' => { id = .char_literal_utf_16; state = .char_literal_start; }, '\"' => { id = .string_literal_utf_16; state = .string_literal; }, else => { self.index -= 1; state = .identifier; }, }, .u8 => switch (c) { '\"' => { id = .string_literal_utf_8; state = .string_literal; }, '\'' => { id = .char_literal_utf_8; state = .char_literal_start; }, else => { self.index -= 1; state = .identifier; }, }, .U => switch (c) { '\'' => { id = .char_literal_utf_32; state = .char_literal_start; }, '\"' => { id = .string_literal_utf_32; state = .string_literal; }, else => { self.index -= 1; state = .identifier; }, }, .L => switch (c) { '\'' => { id = .char_literal_wide; state = .char_literal_start; }, '\"' => { id = .string_literal_wide; state = .string_literal; }, else => { self.index -= 1; state = .identifier; }, }, .string_literal => switch (c) { '\\' => { return_state = .string_literal; state = if 
(self.path_escapes) .path_escape else .escape_sequence; }, '"' => { self.index += 1; break; }, '\n' => { id = .invalid; break; }, '\r' => unreachable, else => {}, }, .path_escape => { state = .string_literal; }, .char_literal_start => switch (c) { '\\' => { state = .char_escape_sequence; }, '\'', '\n' => { id = .invalid; break; }, else => { state = .char_literal; }, }, .char_literal => switch (c) { '\\' => { state = .char_escape_sequence; }, '\'' => { self.index += 1; break; }, '\n' => { id = .invalid; break; }, else => {}, }, .char_escape_sequence => switch (c) { '\r', '\n' => unreachable, // removed by line splicing else => state = .char_literal, }, .escape_sequence => switch (c) { '\'', '"', '?', '\\', 'a', 'b', 'e', 'f', 'n', 'r', 't', 'v' => { state = return_state; }, '\r', '\n' => unreachable, // removed by line splicing '0'...'7' => { counter = 1; state = .octal_escape; }, 'x' => state = .hex_escape, 'u' => { counter = 4; state = .unicode_escape; }, 'U' => { counter = 8; state = .unicode_escape; }, else => { id = .invalid; break; }, }, .octal_escape => switch (c) { '0'...'7' => { counter += 1; if (counter == 3) state = return_state; }, else => { self.index -= 1; state = return_state; }, }, .hex_escape => switch (c) { '0'...'9', 'a'...'f', 'A'...'F' => {}, else => { self.index -= 1; state = return_state; }, }, .unicode_escape => switch (c) { '0'...'9', 'a'...'f', 'A'...'F' => { counter -= 1; if (counter == 0) state = return_state; }, else => { id = .invalid; break; }, }, .identifier, .extended_identifier => switch (c) { 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, '$' => if (self.comp.langopts.dollars_in_identifiers) { state = .extended_identifier; } else { id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier; break; }, 0x80...0xFF => state = .extended_identifier, else => { id = if (state == .identifier) Token.getTokenId(self.comp, self.buf[start..self.index]) else .extended_identifier; break; }, }, 
.equal => switch (c) { '=' => { id = .equal_equal; self.index += 1; break; }, else => { id = .equal; break; }, }, .bang => switch (c) { '=' => { id = .bang_equal; self.index += 1; break; }, else => { id = .bang; break; }, }, .pipe => switch (c) { '=' => { id = .pipe_equal; self.index += 1; break; }, '|' => { id = .pipe_pipe; self.index += 1; break; }, else => { id = .pipe; break; }, }, .colon => switch (c) { '>' => { if (self.comp.langopts.hasDigraphs()) { id = .r_bracket; self.index += 1; } else { id = .colon; } break; }, ':' => { if (self.comp.langopts.standard.atLeast(.c2x)) { id = .colon_colon; self.index += 1; break; } else { id = .colon; break; } }, else => { id = .colon; break; }, }, .percent => switch (c) { '=' => { id = .percent_equal; self.index += 1; break; }, '>' => { if (self.comp.langopts.hasDigraphs()) { id = .r_brace; self.index += 1; } else { id = .percent; } break; }, ':' => { if (self.comp.langopts.hasDigraphs()) { state = .hash_digraph; } else { id = .percent; break; } }, else => { id = .percent; break; }, }, .asterisk => switch (c) { '=' => { id = .asterisk_equal; self.index += 1; break; }, else => { id = .asterisk; break; }, }, .plus => switch (c) { '=' => { id = .plus_equal; self.index += 1; break; }, '+' => { id = .plus_plus; self.index += 1; break; }, else => { id = .plus; break; }, }, .angle_bracket_left => switch (c) { '<' => state = .angle_bracket_angle_bracket_left, '=' => { id = .angle_bracket_left_equal; self.index += 1; break; }, ':' => { if (self.comp.langopts.hasDigraphs()) { id = .l_bracket; self.index += 1; } else { id = .angle_bracket_left; } break; }, '%' => { if (self.comp.langopts.hasDigraphs()) { id = .l_brace; self.index += 1; } else { id = .angle_bracket_left; } break; }, else => { id = .angle_bracket_left; break; }, }, .angle_bracket_angle_bracket_left => switch (c) { '=' => { id = .angle_bracket_angle_bracket_left_equal; self.index += 1; break; }, else => { id = .angle_bracket_angle_bracket_left; break; }, }, 
.angle_bracket_right => switch (c) { '>' => state = .angle_bracket_angle_bracket_right, '=' => { id = .angle_bracket_right_equal; self.index += 1; break; }, else => { id = .angle_bracket_right; break; }, }, .angle_bracket_angle_bracket_right => switch (c) { '=' => { id = .angle_bracket_angle_bracket_right_equal; self.index += 1; break; }, else => { id = .angle_bracket_angle_bracket_right; break; }, }, .caret => switch (c) { '=' => { id = .caret_equal; self.index += 1; break; }, else => { id = .caret; break; }, }, .period => switch (c) { '.' => state = .period2, '0'...'9' => state = .pp_num, else => { id = .period; break; }, }, .period2 => switch (c) { '.' => { id = .ellipsis; self.index += 1; break; }, else => { id = .period; self.index -= 1; break; }, }, .minus => switch (c) { '>' => { id = .arrow; self.index += 1; break; }, '=' => { id = .minus_equal; self.index += 1; break; }, '-' => { id = .minus_minus; self.index += 1; break; }, else => { id = .minus; break; }, }, .ampersand => switch (c) { '&' => { id = .ampersand_ampersand; self.index += 1; break; }, '=' => { id = .ampersand_equal; self.index += 1; break; }, else => { id = .ampersand; break; }, }, .hash => switch (c) { '#' => { id = .hash_hash; self.index += 1; break; }, else => { id = .hash; break; }, }, .hash_digraph => switch (c) { '%' => state = .hash_hash_digraph_partial, else => { id = .hash; break; }, }, .hash_hash_digraph_partial => switch (c) { ':' => { id = .hash_hash; self.index += 1; break; }, else => { id = .hash; self.index -= 1; // re-tokenize the percent break; }, }, .slash => switch (c) { '/' => state = .line_comment, '*' => state = .multi_line_comment, '=' => { id = .slash_equal; self.index += 1; break; }, else => { id = .slash; break; }, }, .line_comment => switch (c) { '\n' => { if (self.comp.langopts.preserve_comments) { id = .comment; break; } self.index -= 1; state = .start; }, else => {}, }, .multi_line_comment => switch (c) { '*' => state = .multi_line_comment_asterisk, '\n' => 
self.line += 1, else => {}, }, .multi_line_comment_asterisk => switch (c) { '/' => { if (self.comp.langopts.preserve_comments) { self.index += 1; id = .comment; break; } state = .multi_line_comment_done; }, '\n' => { self.line += 1; state = .multi_line_comment; }, '*' => {}, else => state = .multi_line_comment, }, .multi_line_comment_done => switch (c) { '\n' => { start = self.index; id = .nl; self.index += 1; self.line += 1; break; }, '\r' => unreachable, '\t', '\x0B', '\x0C', ' ' => { start = self.index; state = .whitespace; }, else => { id = .whitespace; break; }, }, .pp_num => switch (c) { 'a'...'d', 'A'...'D', 'f'...'o', 'F'...'O', 'q'...'z', 'Q'...'Z', '0'...'9', '_', '.', => {}, 'e', 'E', 'p', 'P' => state = .pp_num_exponent, '\'' => if (self.comp.langopts.standard.atLeast(.c2x)) { state = .pp_num_digit_separator; } else { id = .pp_num; break; }, else => { id = .pp_num; break; }, }, .pp_num_digit_separator => switch (c) { 'a'...'d', 'A'...'D', 'f'...'o', 'F'...'O', 'q'...'z', 'Q'...'Z', '0'...'9', '_', => state = .pp_num, else => { self.index -= 1; id = .pp_num; break; }, }, .pp_num_exponent => switch (c) { 'a'...'z', 'A'...'Z', '0'...'9', '_', '.', '+', '-', => state = .pp_num, else => { id = .pp_num; break; }, }, } } else if (self.index == self.buf.len) { switch (state) { .start, .line_comment => {}, .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.comp, self.buf[start..self.index]), .extended_identifier => id = .extended_identifier, .period2, .string_literal, .path_escape, .char_literal_start, .char_literal, .escape_sequence, .char_escape_sequence, .octal_escape, .hex_escape, .unicode_escape, .multi_line_comment, .multi_line_comment_asterisk, => id = .invalid, .whitespace => id = .whitespace, .multi_line_comment_done => id = .whitespace, .equal => id = .equal, .bang => id = .bang, .minus => id = .minus, .slash => id = .slash, .ampersand => id = .ampersand, .hash => id = .hash, .period => id = .period, .pipe => id = .pipe, 
.angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
            .angle_bracket_right => id = .angle_bracket_right,
            .angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
            .angle_bracket_left => id = .angle_bracket_left,
            .plus => id = .plus,
            .colon => id = .colon,
            .percent => id = .percent,
            .caret => id = .caret,
            .asterisk => id = .asterisk,
            .hash_digraph => id = .hash,
            .hash_hash_digraph_partial => {
                id = .hash;
                self.index -= 1; // re-tokenize the percent
            },
            .pp_num, .pp_num_exponent, .pp_num_digit_separator => id = .pp_num,
        }
    }

    return .{
        .id = id,
        .start = start,
        .end = self.index,
        .line = self.line,
        .source = self.source,
    };
}

/// Returns the next token that is neither `.whitespace` nor `.comment`.
/// Newline (`.nl`) tokens are not skipped.
pub fn nextNoWS(self: *Tokenizer) Token {
    var tok = self.next();
    while (tok.id == .whitespace or tok.id == .comment) tok = self.next();
    return tok;
}

/// Returns the next token that is not `.whitespace`.
/// Unlike `nextNoWS`, `.comment` tokens are NOT skipped; they are handed back
/// to the caller. Newline (`.nl`) tokens are not skipped either.
pub fn nextNoWSComments(self: *Tokenizer) Token {
    var tok = self.next();
    while (tok.id == .whitespace) tok = self.next();
    return tok;
}

// Every punctuator, including the maximal-munch cases (`..` lexes as two
// periods, `...` as one ellipsis).
test "operators" {
    try expectTokens(
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\ <<= > >= >> >>= ~ # ##
        \\
    , &.{
        .bang,
        .bang_equal,
        .pipe,
        .pipe_pipe,
        .pipe_equal,
        .equal,
        .equal_equal,
        .nl,
        .l_paren,
        .r_paren,
        .l_brace,
        .r_brace,
        .l_bracket,
        .r_bracket,
        .period,
        .period,
        .period,
        .ellipsis,
        .nl,
        .caret,
        .caret_equal,
        .plus,
        .plus_plus,
        .plus_equal,
        .minus,
        .minus_minus,
        .minus_equal,
        .nl,
        .asterisk,
        .asterisk_equal,
        .percent,
        .percent_equal,
        .arrow,
        .colon,
        .semicolon,
        .slash,
        .slash_equal,
        .nl,
        .comma,
        .ampersand,
        .ampersand_ampersand,
        .ampersand_equal,
        .question_mark,
        .angle_bracket_left,
        .angle_bracket_left_equal,
        .angle_bracket_angle_bracket_left,
        .nl,
        .angle_bracket_angle_bracket_left_equal,
        .angle_bracket_right,
        .angle_bracket_right_equal,
        .angle_bracket_angle_bracket_right,
        .angle_bracket_angle_bracket_right_equal,
        .tilde,
        .hash,
        .hash_hash,
        .nl,
    });
}

// Keywords recognised with the default language options.
test "keywords" {
    try expectTokens(
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    , &.{
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .nl,
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .nl,
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_sizeof,
        .keyword_static,
        .nl,
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .nl,
        .keyword_while,
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        .keyword_alignas,
        .nl,
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        .nl,
        .keyword_attribute1,
        .keyword_attribute2,
        .nl,
    });
}

// Directive names following `#` are expected to lex as dedicated keyword tokens.
test "preprocessor keywords" {
    try expectTokens(
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    , &.{
        .hash,
        .keyword_include,
        .nl,
        .hash,
        .keyword_include_next,
        .nl,
        .hash,
        .keyword_embed,
        .nl,
        .hash,
        .keyword_define,
        .nl,
        .hash,
        .keyword_ifdef,
        .nl,
        .hash,
        .keyword_ifndef,
        .nl,
        .hash,
        .keyword_error,
        .nl,
        .hash,
        .keyword_pragma,
        .nl,
    });
}

// A backslash immediately followed by a newline must not produce an `.nl`
// token, neither between tokens nor inside a string literal. The backslashes
// below must stay the last character on their line.
test "line continuation" {
    try expectTokens(
        \\#define foo \
        \\ bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    , &.{
        .hash,
        .keyword_define,
        .identifier,
        .identifier,
        .nl,
        .string_literal,
        .nl,
        .hash,
        .keyword_define,
        .string_literal,
        .nl,
        .string_literal,
        .nl,
        .hash,
        .keyword_define,
        .string_literal,
        .string_literal,
    });
}

// Encoding prefixes select the string/char literal token kind.
test "string prefix" {
    try expectTokens(
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    , &.{
        .string_literal,
        .nl,
        .string_literal_utf_16,
        .nl,
        .string_literal_utf_8,
        .nl,
        .string_literal_utf_32,
        .nl,
        .string_literal_wide,
        .nl,
        .char_literal,
        .nl,
        .char_literal_utf_8,
        .nl,
        .char_literal_utf_16,
        .nl,
        .char_literal_utf_32,
        .nl,
        .char_literal_wide,
        .nl,
    });
}

// Every numeric spelling, whatever its suffix, lexes as a single `.pp_num`.
test "num suffixes" {
    try expectTokens(
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    , &.{
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
        .pp_num,
        .pp_num,
        .pp_num,
        .pp_num,
        .nl,
    });
}

// With default options a line comment yields no token of its own; only the
// terminating newline is reported.
test "comments" {
    try expectTokens(
        \\//foo
        \\#foo
    , &.{
        .nl,
        .hash,
        .identifier,
    });
}

// Non-ASCII characters are written as `\u{...}` escapes where the literal
// glyph had been damaged by a bad re-encoding: U+1D4EA (first letter of the
// script "arocc") and U+2122 (trade mark sign).
test "extended identifiers" {
    try expectTokens("\u{1D4EA}𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("u\u{1D4EA}𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("u8\u{1D4EA}𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("U\u{1D4EA}𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("L\u{1D4EA}𝓻𝓸𝓬𝓬", &.{.extended_identifier});
    try expectTokens("1\u{2122}", &.{ .pp_num, .extended_identifier });
    try expectTokens("1.\u{2122}", &.{ .pp_num, .extended_identifier });
    try expectTokens("..\u{2122}", &.{ .period, .period, .extended_identifier });
    try expectTokens("0\u{2122}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0b\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0b0\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("01\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("010\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0x\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("0x0\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("\"\\0\u{E0000}\"", &.{.string_literal});
    try expectTokens("\"\\x\u{E0000}\"", &.{.string_literal});
    try expectTokens("\"\\u\u{E0000}\"", &.{ .invalid, .extended_identifier, .invalid });
    try expectTokens("1e\u{E0000}", &.{ .pp_num, .extended_identifier });
    try expectTokens("1e1\u{E0000}", &.{ .pp_num, .extended_identifier });
}

// Digraph spellings map to their punctuator tokens, are inert inside string
// literals, and a dangling `%:%` falls back to `#` followed by `%`.
test "digraphs" {
    try expectTokens("%:<::><%%>%:%:", &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash });
    try expectTokens("\"%:<::><%%>%:%:\"", &.{.string_literal});
    try expectTokens("%:%42 %:%", &.{ .hash, .percent, .pp_num, .hash, .percent });
}

// Spellings that are only keywords when the standard is set to `.c2x`.
test "C23 keywords" {
    try expectTokensExtra("true false alignas alignof bool static_assert thread_local nullptr", &.{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
    }, .c2x);
}

/// Tokenizes `contents` and checks that the ids of its non-whitespace tokens
/// are exactly `expected_tokens`, followed by end of input.
///
/// `standard`, when non-null, overrides the language standard of the
/// temporary `Compilation` before tokenizing.
///
/// Returns `error.TokensDoNotEqual` (after printing both tag names) on the
/// first mismatching token, a test-expectation error if input remains after
/// the last expected token, or any error from `addSourceFromBuffer`.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, standard: ?LangOpts.Standard) !void {
    var comp = Compilation.init(std.testing.allocator);
    defer comp.deinit();
    if (standard) |provided| {
        comp.langopts.standard = provided;
    }
    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer = Tokenizer{
        .buf = source.buf,
        .source = source.id,
        .comp = &comp,
    };
    for (expected_tokens) |expected_token_id| {
        // `.whitespace` tokens are never part of the expectation list.
        const token = tokenizer.nextNoWSComments();
        if (token.id != expected_token_id) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(expected_token_id), @tagName(token.id) });
            return error.TokensDoNotEqual;
        }
    }
    // Skip trailing whitespace here as well, so that input ending in blanks is
    // judged the same way as whitespace between the expected tokens.
    const last_token = tokenizer.nextNoWSComments();
    try std.testing.expectEqual(Token.Id.eof, last_token.id);
}

/// Same as `expectTokensExtra`, using the default language standard.
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    return expectTokensExtra(contents, expected_tokens, null);
}