zig/lib/compiler/aro/aro/Tokenizer.zig

2387 lines
76 KiB
Zig
Vendored
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

const std = @import("std");
const assert = std.debug.assert;
const Compilation = @import("Compilation.zig");
const LangOpts = @import("LangOpts.zig");
const Source = @import("Source.zig");
/// Classification of the text that follows a backslash as a universal character name (UCN).
/// For the complete kinds (`hex4`, `hex8`) the integer value is the number of bytes to
/// consume after the leading backslash.
const UCNKind = enum(u8) {
    /// A backslash that does not introduce `\u` or `\U`
    none,
    /// `\u` or `\U` without the full run of hex digits after it
    incomplete,
    /// `\uxxxx`: `u` plus four hex digits
    hex4 = 5,
    /// `\Uxxxxxxxx`: `U` plus eight hex digits
    hex8 = 9,

    /// Classifies `buf`, whose first byte must be a backslash.
    /// Only the shape of the escape is examined; whether the digits name a valid
    /// universal character is not checked here, so e.g. \UFFFFFFFF is accepted.
    fn classify(buf: []const u8) UCNKind {
        assert(buf[0] == '\\');
        if (buf.len == 1) return .none;

        const complete: UCNKind = switch (buf[1]) {
            'u' => .hex4,
            'U' => .hex8,
            else => return .none,
        };

        // Full escape length including the backslash: 6 for `\u`, 10 for `\U`.
        const escape_len: usize = @as(usize, @intFromEnum(complete)) + 1;
        if (buf.len < escape_len) return .incomplete;

        for (buf[2..escape_len]) |digit| {
            if (!std.ascii.isHex(digit)) return .incomplete;
        }
        return complete;
    }
};
pub const Token = struct {
id: Id,
source: Source.Id,
start: u32 = 0,
end: u32 = 0,
line: u32 = 0,
pub const Id = enum(u8) {
invalid,
nl,
whitespace,
eof,
/// identifier containing solely basic character set characters
identifier,
/// identifier with at least one extended character or UCN escape sequence
extended_identifier,
// string literals with prefixes
string_literal,
string_literal_utf_16,
string_literal_utf_8,
string_literal_utf_32,
string_literal_wide,
/// Any string literal with an embedded newline or EOF
/// Always a parser error; by default just a warning from preprocessor
unterminated_string_literal,
// <foobar> only generated by preprocessor
macro_string,
// char literals with prefixes
char_literal,
char_literal_utf_8,
char_literal_utf_16,
char_literal_utf_32,
char_literal_wide,
/// Any character literal with nothing inside the quotes
/// Always a parser error; by default just a warning from preprocessor
empty_char_literal,
/// Any character literal with an embedded newline or EOF
/// Always a parser error; by default just a warning from preprocessor
unterminated_char_literal,
/// `/* */` style comment without a closing `*/` before EOF
unterminated_comment,
/// Integer literal tokens generated by preprocessor.
one,
zero,
bang,
bang_equal,
pipe,
pipe_pipe,
pipe_equal,
equal,
equal_equal,
l_paren,
r_paren,
l_brace,
r_brace,
l_bracket,
r_bracket,
period,
ellipsis,
caret,
caret_equal,
plus,
plus_plus,
plus_equal,
minus,
minus_minus,
minus_equal,
asterisk,
asterisk_equal,
percent,
percent_equal,
arrow,
colon,
colon_colon,
semicolon,
slash,
slash_equal,
comma,
ampersand,
ampersand_ampersand,
ampersand_equal,
question_mark,
angle_bracket_left,
angle_bracket_left_equal,
angle_bracket_angle_bracket_left,
angle_bracket_angle_bracket_left_equal,
angle_bracket_right,
angle_bracket_right_equal,
angle_bracket_angle_bracket_right,
angle_bracket_angle_bracket_right_equal,
tilde,
hash,
hash_hash,
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
macro_param,
/// Special token to signal that the argument must be replaced without expansion (e.g. in concatenation)
macro_param_no_expand,
/// Special token to speed up preprocessing, `loc.end` will be an index to the param list.
stringify_param,
/// Same as stringify_param, but for var args
stringify_va_args,
/// Special macro whitespace, always equal to a single space
macro_ws,
/// Special token for implementing __has_attribute
macro_param_has_attribute,
/// Special token for implementing __has_c_attribute
macro_param_has_c_attribute,
/// Special token for implementing __has_declspec_attribute
macro_param_has_declspec_attribute,
/// Special token for implementing __has_warning
macro_param_has_warning,
/// Special token for implementing __has_feature
macro_param_has_feature,
/// Special token for implementing __has_extension
macro_param_has_extension,
/// Special token for implementing __has_builtin
macro_param_has_builtin,
/// Special token for implementing __has_include
macro_param_has_include,
/// Special token for implementing __has_include_next
macro_param_has_include_next,
/// Special token for implementing __has_embed
macro_param_has_embed,
/// Special token for implementing __is_identifier
macro_param_is_identifier,
/// Special token for implementing __FILE__
macro_file,
/// Special token for implementing __LINE__
macro_line,
/// Special token for implementing __COUNTER__
macro_counter,
/// Special token for implementing _Pragma
macro_param_pragma_operator,
/// Special token for implementing __identifier (MS extension)
macro_param_ms_identifier,
/// Special token for implementing __pragma (MS extension)
macro_param_ms_pragma,
/// Special identifier for implementing __func__
macro_func,
/// Special identifier for implementing __FUNCTION__
macro_function,
/// Special identifier for implementing __PRETTY_FUNCTION__
macro_pretty_func,
/// Special identifier for implementing __DATE__
macro_date,
/// Special identifier for implementing __TIME__
macro_time,
/// Special identifier for implementing __TIMESTAMP__
macro_timestamp,
keyword_auto,
keyword_auto_type,
keyword_break,
keyword_case,
keyword_char,
keyword_const,
keyword_continue,
keyword_default,
keyword_do,
keyword_double,
keyword_else,
keyword_enum,
keyword_extern,
keyword_float,
keyword_for,
keyword_goto,
keyword_if,
keyword_int,
keyword_long,
keyword_register,
keyword_return,
keyword_short,
keyword_signed,
keyword_signed1,
keyword_signed2,
keyword_sizeof,
keyword_static,
keyword_struct,
keyword_switch,
keyword_typedef,
keyword_typeof1,
keyword_typeof2,
keyword_union,
keyword_unsigned,
keyword_void,
keyword_volatile,
keyword_while,
// ISO C99
keyword_bool,
keyword_complex,
keyword_imaginary,
keyword_inline,
keyword_restrict,
// ISO C11
keyword_alignas,
keyword_alignof,
keyword_atomic,
keyword_generic,
keyword_noreturn,
keyword_static_assert,
keyword_thread_local,
// ISO C23
keyword_bit_int,
keyword_c23_alignas,
keyword_c23_alignof,
keyword_c23_bool,
keyword_c23_static_assert,
keyword_c23_thread_local,
keyword_constexpr,
keyword_true,
keyword_false,
keyword_nullptr,
keyword_typeof_unqual,
// Preprocessor directives
keyword_include,
keyword_include_next,
keyword_embed,
keyword_define,
keyword_defined,
keyword_undef,
keyword_ifdef,
keyword_ifndef,
keyword_elif,
keyword_elifdef,
keyword_elifndef,
keyword_endif,
keyword_error,
keyword_warning,
keyword_pragma,
keyword_line,
keyword_va_args,
keyword_va_opt,
// gcc keywords
keyword_const1,
keyword_const2,
keyword_inline1,
keyword_inline2,
keyword_volatile1,
keyword_volatile2,
keyword_restrict1,
keyword_restrict2,
keyword_alignof1,
keyword_alignof2,
keyword_typeof,
keyword_attribute1,
keyword_attribute2,
keyword_extension,
keyword_asm,
keyword_asm1,
keyword_asm2,
/// _Float128
keyword_float128_1,
/// __float128
keyword_float128_2,
keyword_int128,
keyword_imag1,
keyword_imag2,
keyword_real1,
keyword_real2,
keyword_float16,
// clang keywords
keyword_fp16,
// ms keywords
keyword_declspec,
keyword_int64,
keyword_int64_2,
keyword_int32,
keyword_int32_2,
keyword_int16,
keyword_int16_2,
keyword_int8,
keyword_int8_2,
keyword_stdcall,
keyword_stdcall2,
keyword_thiscall,
keyword_thiscall2,
keyword_vectorcall,
keyword_vectorcall2,
keyword_fastcall,
keyword_fastcall2,
keyword_regcall,
keyword_cdecl,
keyword_cdecl2,
keyword_forceinline,
keyword_forceinline2,
keyword_unaligned,
keyword_unaligned2,
// Type nullability
keyword_nonnull,
keyword_nullable,
keyword_nullable_result,
keyword_null_unspecified,
/// Generated by #embed directive
/// Decimal value with no prefix or suffix
embed_byte,
/// preprocessor number
/// An optional period, followed by a digit 0-9, followed by any number of letters
/// digits, underscores, periods, and exponents (e+, e-, E+, E-, p+, p-, P+, P-)
pp_num,
/// preprocessor placemarker token
/// generated if `##` is used with a zero-token argument
/// removed after substitution, so the parser should never see this
/// See C99 6.10.3.3.2
placemarker,
/// Virtual linemarker token output from preprocessor to indicate start of a new include
include_start,
/// Virtual linemarker token output from preprocessor to indicate resuming a file after
/// completion of the preceding #include
include_resume,
/// A comment token if asked to preserve comments.
comment,
/// Incomplete universal character name
/// This happens if the source text contains `\u` or `\U` followed by an insufficient number of hex
/// digits. This token id represents just the backslash; the subsequent `u` or `U` will be treated as the
/// leading character of the following identifier token.
incomplete_ucn,
/// Reports whether `id` is an identifier or a keyword-like token: a language keyword,
/// a preprocessor directive name, or one of the function-name/date/time builtin tokens.
pub fn isMacroIdentifier(id: Id) bool {
    return switch (id) {
        // identifiers
        .identifier,
        .extended_identifier,
        // preprocessor directive names
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_defined,
        .keyword_undef,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_elif,
        .keyword_elifdef,
        .keyword_elifndef,
        .keyword_endif,
        .keyword_error,
        .keyword_warning,
        .keyword_pragma,
        .keyword_line,
        .keyword_va_args,
        .keyword_va_opt,
        // builtin function-name / date / time tokens
        .macro_func,
        .macro_function,
        .macro_pretty_func,
        .macro_date,
        .macro_time,
        .macro_timestamp,
        // base language keywords
        .keyword_auto,
        .keyword_auto_type,
        .keyword_break,
        .keyword_case,
        .keyword_char,
        .keyword_const,
        .keyword_continue,
        .keyword_default,
        .keyword_do,
        .keyword_double,
        .keyword_else,
        .keyword_enum,
        .keyword_extern,
        .keyword_float,
        .keyword_for,
        .keyword_goto,
        .keyword_if,
        .keyword_int,
        .keyword_long,
        .keyword_register,
        .keyword_return,
        .keyword_short,
        .keyword_signed,
        .keyword_signed1,
        .keyword_signed2,
        .keyword_sizeof,
        .keyword_static,
        .keyword_struct,
        .keyword_switch,
        .keyword_typedef,
        .keyword_union,
        .keyword_unsigned,
        .keyword_void,
        .keyword_volatile,
        .keyword_while,
        // ISO C99
        .keyword_bool,
        .keyword_complex,
        .keyword_imaginary,
        .keyword_inline,
        .keyword_restrict,
        // ISO C11
        .keyword_alignas,
        .keyword_alignof,
        .keyword_atomic,
        .keyword_generic,
        .keyword_noreturn,
        .keyword_static_assert,
        .keyword_thread_local,
        // ISO C23
        .keyword_bit_int,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_constexpr,
        .keyword_true,
        .keyword_false,
        .keyword_nullptr,
        .keyword_typeof_unqual,
        // typeof spellings
        .keyword_typeof,
        .keyword_typeof1,
        .keyword_typeof2,
        // gcc keywords
        .keyword_const1,
        .keyword_const2,
        .keyword_inline1,
        .keyword_inline2,
        .keyword_volatile1,
        .keyword_volatile2,
        .keyword_restrict1,
        .keyword_restrict2,
        .keyword_alignof1,
        .keyword_alignof2,
        .keyword_attribute1,
        .keyword_attribute2,
        .keyword_extension,
        .keyword_asm,
        .keyword_asm1,
        .keyword_asm2,
        .keyword_float128_1,
        .keyword_float128_2,
        .keyword_int128,
        .keyword_imag1,
        .keyword_imag2,
        .keyword_real1,
        .keyword_real2,
        .keyword_float16,
        // clang keywords
        .keyword_fp16,
        // ms keywords
        .keyword_declspec,
        .keyword_int64,
        .keyword_int64_2,
        .keyword_int32,
        .keyword_int32_2,
        .keyword_int16,
        .keyword_int16_2,
        .keyword_int8,
        .keyword_int8_2,
        .keyword_stdcall,
        .keyword_stdcall2,
        .keyword_thiscall,
        .keyword_thiscall2,
        .keyword_vectorcall,
        .keyword_vectorcall2,
        .keyword_fastcall,
        .keyword_fastcall2,
        .keyword_regcall,
        .keyword_cdecl,
        .keyword_cdecl2,
        .keyword_forceinline,
        .keyword_forceinline2,
        .keyword_unaligned,
        .keyword_unaligned2,
        // type nullability
        .keyword_nonnull,
        .keyword_nullable,
        .keyword_nullable_result,
        .keyword_null_unspecified,
        => true,
        else => false,
    };
}
/// Rewrites preprocessor-only keywords in place to `.identifier`.
/// `keyword_defined` is rewritten only when `defined_to_identifier` is true, i.e. when
/// we are *not* inside an #if or #elif expression. Every other id is left untouched.
pub fn simplifyMacroKeywordExtra(id: *Id, defined_to_identifier: bool) void {
    const becomes_identifier = switch (id.*) {
        .keyword_include,
        .keyword_include_next,
        .keyword_embed,
        .keyword_define,
        .keyword_undef,
        .keyword_ifdef,
        .keyword_ifndef,
        .keyword_elif,
        .keyword_elifdef,
        .keyword_elifndef,
        .keyword_endif,
        .keyword_error,
        .keyword_warning,
        .keyword_pragma,
        .keyword_line,
        .keyword_va_args,
        .keyword_va_opt,
        => true,
        .keyword_defined => defined_to_identifier,
        else => false,
    };
    if (becomes_identifier) id.* = .identifier;
}
/// Same as `simplifyMacroKeywordExtra` with `defined_to_identifier` set to false,
/// so `keyword_defined` is kept as a keyword.
pub fn simplifyMacroKeyword(id: *Id) void {
    id.simplifyMacroKeywordExtra(false);
}
/// Returns the fixed spelling associated with `id`.
/// The result is null for token kinds whose text varies from token to token
/// (identifiers, literals, whitespace, comments, preprocessor numbers, ...),
/// and the empty string for tokens listed below that carry no text of their own.
/// Must not be called with `include_start` or `include_resume` (those arms are `unreachable`).
pub fn lexeme(id: Id) ?[]const u8 {
return switch (id) {
// Virtual linemarker tokens: asking for their lexeme is a programming error.
.include_start,
.include_resume,
=> unreachable,
// No single spelling exists for these kinds.
.unterminated_comment,
.invalid,
.identifier,
.extended_identifier,
.string_literal,
.string_literal_utf_16,
.string_literal_utf_8,
.string_literal_utf_32,
.string_literal_wide,
.unterminated_string_literal,
.unterminated_char_literal,
.empty_char_literal,
.char_literal,
.char_literal_utf_8,
.char_literal_utf_16,
.char_literal_utf_32,
.char_literal_wide,
.macro_string,
.whitespace,
.pp_num,
.embed_byte,
.comment,
=> null,
// Integer literal tokens generated by the preprocessor.
.zero => "0",
.one => "1",
// Newline, end of file, and the special preprocessor tokens all map to the empty string.
.nl,
.eof,
.macro_param,
.macro_param_no_expand,
.stringify_param,
.stringify_va_args,
.macro_param_has_attribute,
.macro_param_has_c_attribute,
.macro_param_has_declspec_attribute,
.macro_param_has_warning,
.macro_param_has_feature,
.macro_param_has_extension,
.macro_param_has_builtin,
.macro_param_has_include,
.macro_param_has_include_next,
.macro_param_has_embed,
.macro_param_is_identifier,
.macro_file,
.macro_line,
.macro_counter,
.macro_time,
.macro_date,
.macro_timestamp,
.macro_param_pragma_operator,
.macro_param_ms_identifier,
.macro_param_ms_pragma,
.placemarker,
=> "",
// Macro whitespace is always a single space.
.macro_ws => " ",
// Only the backslash belongs to an incomplete UCN token.
.incomplete_ucn => "\\",
.macro_func => "__func__",
.macro_function => "__FUNCTION__",
.macro_pretty_func => "__PRETTY_FUNCTION__",
// Punctuators
.bang => "!",
.bang_equal => "!=",
.pipe => "|",
.pipe_pipe => "||",
.pipe_equal => "|=",
.equal => "=",
.equal_equal => "==",
.l_paren => "(",
.r_paren => ")",
.l_brace => "{",
.r_brace => "}",
.l_bracket => "[",
.r_bracket => "]",
.period => ".",
.ellipsis => "...",
.caret => "^",
.caret_equal => "^=",
.plus => "+",
.plus_plus => "++",
.plus_equal => "+=",
.minus => "-",
.minus_minus => "--",
.minus_equal => "-=",
.asterisk => "*",
.asterisk_equal => "*=",
.percent => "%",
.percent_equal => "%=",
.arrow => "->",
.colon => ":",
.colon_colon => "::",
.semicolon => ";",
.slash => "/",
.slash_equal => "/=",
.comma => ",",
.ampersand => "&",
.ampersand_ampersand => "&&",
.ampersand_equal => "&=",
.question_mark => "?",
.angle_bracket_left => "<",
.angle_bracket_left_equal => "<=",
.angle_bracket_angle_bracket_left => "<<",
.angle_bracket_angle_bracket_left_equal => "<<=",
.angle_bracket_right => ">",
.angle_bracket_right_equal => ">=",
.angle_bracket_angle_bracket_right => ">>",
.angle_bracket_angle_bracket_right_equal => ">>=",
.tilde => "~",
.hash => "#",
.hash_hash => "##",
// Language keywords
.keyword_auto => "auto",
.keyword_auto_type => "__auto_type",
.keyword_break => "break",
.keyword_case => "case",
.keyword_char => "char",
.keyword_const => "const",
.keyword_continue => "continue",
.keyword_default => "default",
.keyword_do => "do",
.keyword_double => "double",
.keyword_else => "else",
.keyword_enum => "enum",
.keyword_extern => "extern",
.keyword_float => "float",
.keyword_for => "for",
.keyword_goto => "goto",
.keyword_if => "if",
.keyword_int => "int",
.keyword_long => "long",
.keyword_register => "register",
.keyword_return => "return",
.keyword_short => "short",
.keyword_signed => "signed",
.keyword_signed1 => "__signed",
.keyword_signed2 => "__signed__",
.keyword_sizeof => "sizeof",
.keyword_static => "static",
.keyword_struct => "struct",
.keyword_switch => "switch",
.keyword_typedef => "typedef",
.keyword_typeof => "typeof",
.keyword_union => "union",
.keyword_unsigned => "unsigned",
.keyword_void => "void",
.keyword_volatile => "volatile",
.keyword_while => "while",
// ISO C99
.keyword_bool => "_Bool",
.keyword_complex => "_Complex",
.keyword_imaginary => "_Imaginary",
.keyword_inline => "inline",
.keyword_restrict => "restrict",
// ISO C11
.keyword_alignas => "_Alignas",
.keyword_alignof => "_Alignof",
.keyword_atomic => "_Atomic",
.keyword_generic => "_Generic",
.keyword_noreturn => "_Noreturn",
.keyword_static_assert => "_Static_assert",
.keyword_thread_local => "_Thread_local",
// ISO C23
.keyword_bit_int => "_BitInt",
.keyword_c23_alignas => "alignas",
.keyword_c23_alignof => "alignof",
.keyword_c23_bool => "bool",
.keyword_c23_static_assert => "static_assert",
.keyword_c23_thread_local => "thread_local",
.keyword_constexpr => "constexpr",
.keyword_true => "true",
.keyword_false => "false",
.keyword_nullptr => "nullptr",
.keyword_typeof_unqual => "typeof_unqual",
// Preprocessor directives
.keyword_include => "include",
.keyword_include_next => "include_next",
.keyword_embed => "embed",
.keyword_define => "define",
.keyword_defined => "defined",
.keyword_undef => "undef",
.keyword_ifdef => "ifdef",
.keyword_ifndef => "ifndef",
.keyword_elif => "elif",
.keyword_elifdef => "elifdef",
.keyword_elifndef => "elifndef",
.keyword_endif => "endif",
.keyword_error => "error",
.keyword_warning => "warning",
.keyword_pragma => "pragma",
.keyword_line => "line",
.keyword_va_args => "__VA_ARGS__",
.keyword_va_opt => "__VA_OPT__",
// gcc keywords
.keyword_const1 => "__const",
.keyword_const2 => "__const__",
.keyword_inline1 => "__inline",
.keyword_inline2 => "__inline__",
.keyword_volatile1 => "__volatile",
.keyword_volatile2 => "__volatile__",
.keyword_restrict1 => "__restrict",
.keyword_restrict2 => "__restrict__",
.keyword_alignof1 => "__alignof",
.keyword_alignof2 => "__alignof__",
.keyword_typeof1 => "__typeof",
.keyword_typeof2 => "__typeof__",
.keyword_attribute1 => "__attribute",
.keyword_attribute2 => "__attribute__",
.keyword_extension => "__extension__",
.keyword_asm => "asm",
.keyword_asm1 => "__asm",
.keyword_asm2 => "__asm__",
.keyword_float128_1 => "_Float128",
.keyword_float128_2 => "__float128",
.keyword_int128 => "__int128",
.keyword_imag1 => "__imag",
.keyword_imag2 => "__imag__",
.keyword_real1 => "__real",
.keyword_real2 => "__real__",
.keyword_float16 => "_Float16",
// clang keywords
.keyword_fp16 => "__fp16",
// ms keywords
.keyword_declspec => "__declspec",
.keyword_int64 => "__int64",
.keyword_int64_2 => "_int64",
.keyword_int32 => "__int32",
.keyword_int32_2 => "_int32",
.keyword_int16 => "__int16",
.keyword_int16_2 => "_int16",
.keyword_int8 => "__int8",
.keyword_int8_2 => "_int8",
.keyword_stdcall => "__stdcall",
.keyword_stdcall2 => "_stdcall",
.keyword_thiscall => "__thiscall",
.keyword_thiscall2 => "_thiscall",
.keyword_vectorcall => "__vectorcall",
.keyword_vectorcall2 => "_vectorcall",
.keyword_fastcall => "__fastcall",
.keyword_fastcall2 => "_fastcall",
.keyword_regcall => "__regcall",
.keyword_cdecl => "__cdecl",
.keyword_cdecl2 => "_cdecl",
.keyword_forceinline => "__forceinline",
.keyword_forceinline2 => "_forceinline",
.keyword_unaligned => "__unaligned",
.keyword_unaligned2 => "_unaligned",
// Type nullability
.keyword_nonnull => "_Nonnull",
.keyword_nullable => "_Nullable",
.keyword_nullable_result => "_Nullable_result",
.keyword_null_unspecified => "_Null_unspecified",
};
}
/// Returns a human-readable description of `id`: a generic phrase for identifiers,
/// literals, numbers and invalid bytes, and the fixed lexeme for everything else.
/// Must not be called with `macro_string`. For any id not listed here, `lexeme` must
/// return a non-null value, because the result is unwrapped with `.?`.
pub fn symbol(id: Id) []const u8 {
    switch (id) {
        .macro_string => unreachable,
        .invalid => return "invalid bytes",
        .identifier,
        .extended_identifier,
        .macro_func,
        .macro_function,
        .macro_pretty_func,
        => return "an identifier",
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        .unterminated_string_literal,
        => return "a string literal",
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        .unterminated_char_literal,
        .empty_char_literal,
        => return "a character literal",
        .pp_num, .embed_byte => return "a number",
        else => {},
    }
    // Everything else is described by its fixed spelling.
    return id.lexeme().?;
}
/// Reports whether `id` is accepted as the first token of an expression parsed by
/// Preprocessor.expr.
/// eof, r_paren, and string literals cannot actually start a preprocessor expression;
/// they are accepted anyway so that the parser can produce a nicer error message.
pub fn validPreprocessorExprStart(id: Id) bool {
    switch (id) {
        // Accepted only for the sake of better diagnostics.
        .eof,
        .r_paren,
        .string_literal,
        .string_literal_utf_16,
        .string_literal_utf_8,
        .string_literal_utf_32,
        .string_literal_wide,
        => return true,
        // Operands
        .char_literal,
        .char_literal_utf_8,
        .char_literal_utf_16,
        .char_literal_utf_32,
        .char_literal_wide,
        .identifier,
        .extended_identifier,
        .keyword_defined,
        .one,
        .zero,
        .pp_num,
        .keyword_true,
        .keyword_false,
        => return true,
        // Grouping and unary operators
        .l_paren,
        .plus,
        .minus,
        .tilde,
        .bang,
        => return true,
        else => return false,
    }
}
/// Reports whether `id` is one of the punctuators handled here as having a digraph form
/// (`[`, `]`, `{`, `}`, `#`, `##`) and `langopts.hasDigraphs()` is true.
pub fn allowsDigraphs(id: Id, langopts: LangOpts) bool {
    switch (id) {
        .l_bracket, .r_bracket, .l_brace, .r_brace, .hash, .hash_hash => {},
        else => return false,
    }
    return langopts.hasDigraphs();
}
/// Reports whether `id` is a `volatile`, `inline` or `goto` keyword (in any of the
/// spellings listed) or an opening parenthesis.
pub fn canOpenGCCAsmStmt(id: Id) bool {
    switch (id) {
        .l_paren,
        .keyword_goto,
        .keyword_volatile,
        .keyword_volatile1,
        .keyword_volatile2,
        .keyword_inline,
        .keyword_inline1,
        .keyword_inline2,
        => return true,
        else => return false,
    }
}
/// Reports whether `id` is a terminated string literal with any encoding prefix.
/// `unterminated_string_literal` and `macro_string` are not included.
pub fn isStringLiteral(id: Id) bool {
    switch (id) {
        .string_literal,
        .string_literal_utf_8,
        .string_literal_utf_16,
        .string_literal_utf_32,
        .string_literal_wide,
        => return true,
        else => return false,
    }
}
};
/// Looks `str` up in the keyword table and returns the matching keyword id, or
/// `.identifier` when `str` is not a keyword or the keyword is disabled by `langopts`.
/// Double underscore and underscore + capital letter identifiers belong to the
/// implementation namespace, so they are always converted to keywords.
pub fn getTokenId(langopts: LangOpts, str: []const u8) Token.Id {
    const keyword = all_kws.get(str) orelse return .identifier;
    const c_standard = langopts.standard;

    const enabled = switch (keyword) {
        .keyword_inline => c_standard.isGNU() or c_standard.atLeast(.c99),
        .keyword_restrict => c_standard.atLeast(.c99),
        .keyword_typeof => c_standard.isGNU() or c_standard.atLeast(.c23),
        .keyword_asm => c_standard.isGNU(),
        .keyword_declspec => langopts.declspec_attrs,

        // Spellings that only became keywords in C23.
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_constexpr,
        .keyword_true,
        .keyword_false,
        .keyword_nullptr,
        .keyword_typeof_unqual,
        .keyword_elifdef,
        .keyword_elifndef,
        => c_standard.atLeast(.c23),

        // Keywords that require Microsoft extensions.
        .keyword_int64,
        .keyword_int64_2,
        .keyword_int32,
        .keyword_int32_2,
        .keyword_int16,
        .keyword_int16_2,
        .keyword_int8,
        .keyword_int8_2,
        .keyword_stdcall2,
        .keyword_thiscall2,
        .keyword_vectorcall2,
        .keyword_fastcall2,
        .keyword_cdecl2,
        .keyword_forceinline,
        .keyword_forceinline2,
        .keyword_unaligned,
        .keyword_unaligned2,
        => langopts.ms_extensions,

        else => true,
    };
    return if (enabled) keyword else .identifier;
}
/// Compile-time table from every keyword spelling to its token id.
/// Whether a spelling is actually treated as a keyword for the active language
/// options is decided by `getTokenId`; this table is unconditional.
/// Each spelling here is expected to match what `Id.lexeme` returns for the same id.
const all_kws = std.StaticStringMap(Id).initComptime(.{
    .{ "auto", .keyword_auto },
    .{ "break", .keyword_break },
    .{ "case", .keyword_case },
    .{ "char", .keyword_char },
    .{ "const", .keyword_const },
    .{ "continue", .keyword_continue },
    .{ "default", .keyword_default },
    .{ "do", .keyword_do },
    .{ "double", .keyword_double },
    .{ "else", .keyword_else },
    .{ "enum", .keyword_enum },
    .{ "extern", .keyword_extern },
    .{ "float", .keyword_float },
    .{ "for", .keyword_for },
    .{ "goto", .keyword_goto },
    .{ "if", .keyword_if },
    .{ "int", .keyword_int },
    .{ "long", .keyword_long },
    .{ "register", .keyword_register },
    .{ "return", .keyword_return },
    .{ "short", .keyword_short },
    .{ "signed", .keyword_signed },
    .{ "__signed", .keyword_signed1 },
    .{ "__signed__", .keyword_signed2 },
    .{ "sizeof", .keyword_sizeof },
    .{ "static", .keyword_static },
    .{ "struct", .keyword_struct },
    .{ "switch", .keyword_switch },
    .{ "typedef", .keyword_typedef },
    .{ "union", .keyword_union },
    .{ "unsigned", .keyword_unsigned },
    .{ "void", .keyword_void },
    .{ "volatile", .keyword_volatile },
    .{ "while", .keyword_while },
    .{ "__typeof__", .keyword_typeof2 },
    .{ "__typeof", .keyword_typeof1 },
    // ISO C99
    .{ "_Bool", .keyword_bool },
    .{ "_Complex", .keyword_complex },
    .{ "_Imaginary", .keyword_imaginary },
    .{ "inline", .keyword_inline },
    .{ "restrict", .keyword_restrict },
    // ISO C11
    .{ "_Alignas", .keyword_alignas },
    .{ "_Alignof", .keyword_alignof },
    .{ "_Atomic", .keyword_atomic },
    .{ "_Generic", .keyword_generic },
    .{ "_Noreturn", .keyword_noreturn },
    .{ "_Static_assert", .keyword_static_assert },
    .{ "_Thread_local", .keyword_thread_local },
    // ISO C23
    .{ "_BitInt", .keyword_bit_int },
    .{ "alignas", .keyword_c23_alignas },
    .{ "alignof", .keyword_c23_alignof },
    .{ "bool", .keyword_c23_bool },
    .{ "static_assert", .keyword_c23_static_assert },
    .{ "thread_local", .keyword_c23_thread_local },
    .{ "constexpr", .keyword_constexpr },
    .{ "true", .keyword_true },
    .{ "false", .keyword_false },
    .{ "nullptr", .keyword_nullptr },
    .{ "typeof_unqual", .keyword_typeof_unqual },
    // Preprocessor directives
    .{ "include", .keyword_include },
    .{ "include_next", .keyword_include_next },
    .{ "embed", .keyword_embed },
    .{ "define", .keyword_define },
    .{ "defined", .keyword_defined },
    .{ "undef", .keyword_undef },
    .{ "ifdef", .keyword_ifdef },
    .{ "ifndef", .keyword_ifndef },
    .{ "elif", .keyword_elif },
    .{ "elifdef", .keyword_elifdef },
    .{ "elifndef", .keyword_elifndef },
    .{ "endif", .keyword_endif },
    .{ "error", .keyword_error },
    .{ "warning", .keyword_warning },
    .{ "pragma", .keyword_pragma },
    .{ "line", .keyword_line },
    .{ "__VA_ARGS__", .keyword_va_args },
    .{ "__VA_OPT__", .keyword_va_opt },
    .{ "__func__", .macro_func },
    .{ "__FUNCTION__", .macro_function },
    .{ "__PRETTY_FUNCTION__", .macro_pretty_func },
    // gcc keywords
    .{ "__auto_type", .keyword_auto_type },
    .{ "__const", .keyword_const1 },
    .{ "__const__", .keyword_const2 },
    .{ "__inline", .keyword_inline1 },
    .{ "__inline__", .keyword_inline2 },
    .{ "__volatile", .keyword_volatile1 },
    .{ "__volatile__", .keyword_volatile2 },
    .{ "__restrict", .keyword_restrict1 },
    .{ "__restrict__", .keyword_restrict2 },
    .{ "__alignof", .keyword_alignof1 },
    .{ "__alignof__", .keyword_alignof2 },
    .{ "typeof", .keyword_typeof },
    .{ "__attribute", .keyword_attribute1 },
    .{ "__attribute__", .keyword_attribute2 },
    .{ "__extension__", .keyword_extension },
    .{ "asm", .keyword_asm },
    .{ "__asm", .keyword_asm1 },
    .{ "__asm__", .keyword_asm2 },
    .{ "_Float128", .keyword_float128_1 },
    .{ "__float128", .keyword_float128_2 },
    .{ "__int128", .keyword_int128 },
    .{ "__imag", .keyword_imag1 },
    .{ "__imag__", .keyword_imag2 },
    .{ "__real", .keyword_real1 },
    .{ "__real__", .keyword_real2 },
    .{ "_Float16", .keyword_float16 },
    // clang keywords
    .{ "__fp16", .keyword_fp16 },
    // ms keywords
    .{ "__declspec", .keyword_declspec },
    .{ "__int64", .keyword_int64 },
    .{ "_int64", .keyword_int64_2 },
    .{ "__int32", .keyword_int32 },
    .{ "_int32", .keyword_int32_2 },
    .{ "__int16", .keyword_int16 },
    .{ "_int16", .keyword_int16_2 },
    .{ "__int8", .keyword_int8 },
    .{ "_int8", .keyword_int8_2 },
    .{ "__stdcall", .keyword_stdcall },
    .{ "_stdcall", .keyword_stdcall2 },
    .{ "__thiscall", .keyword_thiscall },
    .{ "_thiscall", .keyword_thiscall2 },
    .{ "__vectorcall", .keyword_vectorcall },
    .{ "_vectorcall", .keyword_vectorcall2 },
    .{ "__fastcall", .keyword_fastcall },
    .{ "_fastcall", .keyword_fastcall2 },
    // Only the double-underscore spelling exists for regcall (there is no
    // `keyword_regcall2`); it must agree with `Id.lexeme(.keyword_regcall)`.
    .{ "__regcall", .keyword_regcall },
    .{ "__cdecl", .keyword_cdecl },
    .{ "_cdecl", .keyword_cdecl2 },
    .{ "__forceinline", .keyword_forceinline },
    .{ "_forceinline", .keyword_forceinline2 },
    .{ "__unaligned", .keyword_unaligned },
    .{ "_unaligned", .keyword_unaligned2 },
    // Type nullability
    .{ "_Nonnull", .keyword_nonnull },
    .{ "_Nullable", .keyword_nullable },
    .{ "_Nullable_result", .keyword_nullable_result },
    .{ "_Null_unspecified", .keyword_null_unspecified },
});
};
const Tokenizer = @This();
/// Bytes being tokenized; `next` reads `buf[index]`.
buf: []const u8,
/// Offset into `buf` of the next byte `next` will examine.
index: u32 = 0,
/// Id of the source being tokenized.
/// NOTE(review): presumably copied into the tokens `next` returns — not visible in this part of the file; confirm.
source: Source.Id,
/// Language options consulted while scanning, e.g. `dollars_in_identifiers`,
/// `ms_extensions`, `hasDigraphs()` and `standard`.
langopts: LangOpts,
/// Line counter; starts at 1 and is incremented by `next` each time it produces a newline token.
line: u32 = 1,
pub fn next(self: *Tokenizer) Token {
var state: enum {
start,
whitespace,
u,
u8,
U,
L,
string_literal,
char_literal_start,
char_literal,
char_escape_sequence,
string_escape_sequence,
identifier,
extended_identifier,
equal,
bang,
pipe,
colon,
percent,
asterisk,
plus,
angle_bracket_left,
angle_bracket_angle_bracket_left,
angle_bracket_right,
angle_bracket_angle_bracket_right,
caret,
period,
period2,
minus,
slash,
ampersand,
hash,
hash_digraph,
hash_hash_digraph_partial,
line_comment,
multi_line_comment,
multi_line_comment_asterisk,
multi_line_comment_done,
pp_num,
pp_num_exponent,
pp_num_digit_separator,
} = .start;
var start = self.index;
var id: Token.Id = .eof;
while (self.index < self.buf.len) : (self.index += 1) {
const c = self.buf[self.index];
switch (state) {
.start => switch (c) {
'\n' => {
id = .nl;
self.index += 1;
self.line += 1;
break;
},
'"' => {
id = .string_literal;
state = .string_literal;
},
'\'' => {
id = .char_literal;
state = .char_literal_start;
},
'u' => state = .u,
'U' => state = .U,
'L' => state = .L,
'\\' => {
const ucn_kind = UCNKind.classify(self.buf[self.index..]);
switch (ucn_kind) {
.none => {
self.index += 1;
id = .invalid;
break;
},
.incomplete => {
self.index += 1;
id = .incomplete_ucn;
break;
},
.hex4, .hex8 => {
self.index += @intFromEnum(ucn_kind);
id = .extended_identifier;
state = .extended_identifier;
},
}
},
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
'=' => state = .equal,
'!' => state = .bang,
'|' => state = .pipe,
'(' => {
id = .l_paren;
self.index += 1;
break;
},
')' => {
id = .r_paren;
self.index += 1;
break;
},
'[' => {
id = .l_bracket;
self.index += 1;
break;
},
']' => {
id = .r_bracket;
self.index += 1;
break;
},
';' => {
id = .semicolon;
self.index += 1;
break;
},
',' => {
id = .comma;
self.index += 1;
break;
},
'?' => {
id = .question_mark;
self.index += 1;
break;
},
':' => state = .colon,
'%' => state = .percent,
'*' => state = .asterisk,
'+' => state = .plus,
'<' => state = .angle_bracket_left,
'>' => state = .angle_bracket_right,
'^' => state = .caret,
'{' => {
id = .l_brace;
self.index += 1;
break;
},
'}' => {
id = .r_brace;
self.index += 1;
break;
},
'~' => {
id = .tilde;
self.index += 1;
break;
},
'.' => state = .period,
'-' => state = .minus,
'/' => state = .slash,
'&' => state = .ampersand,
'#' => state = .hash,
'0'...'9' => state = .pp_num,
'\t', '\x0B', '\x0C', ' ' => state = .whitespace,
'$' => if (self.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = .invalid;
self.index += 1;
break;
},
0x1A => if (self.langopts.ms_extensions) {
id = .eof;
break;
} else {
id = .invalid;
self.index += 1;
break;
},
0x80...0xFF => state = .extended_identifier,
else => {
id = .invalid;
self.index += 1;
break;
},
},
.whitespace => switch (c) {
'\t', '\x0B', '\x0C', ' ' => {},
else => {
id = .whitespace;
break;
},
},
.u => switch (c) {
'8' => {
state = .u8;
},
'\'' => {
id = .char_literal_utf_16;
state = .char_literal_start;
},
'\"' => {
id = .string_literal_utf_16;
state = .string_literal;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.u8 => switch (c) {
'\"' => {
id = .string_literal_utf_8;
state = .string_literal;
},
'\'' => {
id = .char_literal_utf_8;
state = .char_literal_start;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.U => switch (c) {
'\'' => {
id = .char_literal_utf_32;
state = .char_literal_start;
},
'\"' => {
id = .string_literal_utf_32;
state = .string_literal;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.L => switch (c) {
'\'' => {
id = .char_literal_wide;
state = .char_literal_start;
},
'\"' => {
id = .string_literal_wide;
state = .string_literal;
},
else => {
self.index -= 1;
state = .identifier;
},
},
.string_literal => switch (c) {
'\\' => {
state = .string_escape_sequence;
},
'"' => {
self.index += 1;
break;
},
'\n' => {
id = .unterminated_string_literal;
break;
},
'\r' => unreachable,
else => {},
},
.char_literal_start => switch (c) {
'\\' => {
state = .char_escape_sequence;
},
'\'' => {
id = .empty_char_literal;
self.index += 1;
break;
},
'\n' => {
id = .unterminated_char_literal;
break;
},
else => {
state = .char_literal;
},
},
.char_literal => switch (c) {
'\\' => {
state = .char_escape_sequence;
},
'\'' => {
self.index += 1;
break;
},
'\n' => {
id = .unterminated_char_literal;
break;
},
else => {},
},
.char_escape_sequence => switch (c) {
'\r', '\n' => {
id = .unterminated_char_literal;
break;
},
else => state = .char_literal,
},
.string_escape_sequence => switch (c) {
'\r', '\n' => {
id = .unterminated_string_literal;
break;
},
else => state = .string_literal,
},
.identifier, .extended_identifier => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
'$' => if (self.langopts.dollars_in_identifiers) {
state = .extended_identifier;
} else {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
},
0x80...0xFF => state = .extended_identifier,
'\\' => {
const ucn_kind = UCNKind.classify(self.buf[self.index..]);
switch (ucn_kind) {
.none, .incomplete => {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
},
.hex4, .hex8 => {
state = .extended_identifier;
self.index += @intFromEnum(ucn_kind);
},
}
},
else => {
id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
break;
},
},
.equal => switch (c) {
'=' => {
id = .equal_equal;
self.index += 1;
break;
},
else => {
id = .equal;
break;
},
},
.bang => switch (c) {
'=' => {
id = .bang_equal;
self.index += 1;
break;
},
else => {
id = .bang;
break;
},
},
.pipe => switch (c) {
'=' => {
id = .pipe_equal;
self.index += 1;
break;
},
'|' => {
id = .pipe_pipe;
self.index += 1;
break;
},
else => {
id = .pipe;
break;
},
},
.colon => switch (c) {
'>' => {
if (self.langopts.hasDigraphs()) {
id = .r_bracket;
self.index += 1;
} else {
id = .colon;
}
break;
},
':' => {
if (self.langopts.standard.atLeast(.c23)) {
id = .colon_colon;
self.index += 1;
break;
} else {
id = .colon;
break;
}
},
else => {
id = .colon;
break;
},
},
.percent => switch (c) {
'=' => {
id = .percent_equal;
self.index += 1;
break;
},
'>' => {
if (self.langopts.hasDigraphs()) {
id = .r_brace;
self.index += 1;
} else {
id = .percent;
}
break;
},
':' => {
if (self.langopts.hasDigraphs()) {
state = .hash_digraph;
} else {
id = .percent;
break;
}
},
else => {
id = .percent;
break;
},
},
.asterisk => switch (c) {
'=' => {
id = .asterisk_equal;
self.index += 1;
break;
},
else => {
id = .asterisk;
break;
},
},
.plus => switch (c) {
'=' => {
id = .plus_equal;
self.index += 1;
break;
},
'+' => {
id = .plus_plus;
self.index += 1;
break;
},
else => {
id = .plus;
break;
},
},
.angle_bracket_left => switch (c) {
'<' => state = .angle_bracket_angle_bracket_left,
'=' => {
id = .angle_bracket_left_equal;
self.index += 1;
break;
},
':' => {
if (self.langopts.hasDigraphs()) {
id = .l_bracket;
self.index += 1;
} else {
id = .angle_bracket_left;
}
break;
},
'%' => {
if (self.langopts.hasDigraphs()) {
id = .l_brace;
self.index += 1;
} else {
id = .angle_bracket_left;
}
break;
},
else => {
id = .angle_bracket_left;
break;
},
},
.angle_bracket_angle_bracket_left => switch (c) {
'=' => {
id = .angle_bracket_angle_bracket_left_equal;
self.index += 1;
break;
},
else => {
id = .angle_bracket_angle_bracket_left;
break;
},
},
.angle_bracket_right => switch (c) {
'>' => state = .angle_bracket_angle_bracket_right,
'=' => {
id = .angle_bracket_right_equal;
self.index += 1;
break;
},
else => {
id = .angle_bracket_right;
break;
},
},
.angle_bracket_angle_bracket_right => switch (c) {
'=' => {
id = .angle_bracket_angle_bracket_right_equal;
self.index += 1;
break;
},
else => {
id = .angle_bracket_angle_bracket_right;
break;
},
},
.caret => switch (c) {
'=' => {
id = .caret_equal;
self.index += 1;
break;
},
else => {
id = .caret;
break;
},
},
.period => switch (c) {
'.' => state = .period2,
'0'...'9' => state = .pp_num,
else => {
id = .period;
break;
},
},
.period2 => switch (c) {
'.' => {
id = .ellipsis;
self.index += 1;
break;
},
else => {
id = .period;
self.index -= 1;
break;
},
},
.minus => switch (c) {
'>' => {
id = .arrow;
self.index += 1;
break;
},
'=' => {
id = .minus_equal;
self.index += 1;
break;
},
'-' => {
id = .minus_minus;
self.index += 1;
break;
},
else => {
id = .minus;
break;
},
},
.ampersand => switch (c) {
'&' => {
id = .ampersand_ampersand;
self.index += 1;
break;
},
'=' => {
id = .ampersand_equal;
self.index += 1;
break;
},
else => {
id = .ampersand;
break;
},
},
.hash => switch (c) {
'#' => {
id = .hash_hash;
self.index += 1;
break;
},
else => {
id = .hash;
break;
},
},
.hash_digraph => switch (c) {
'%' => state = .hash_hash_digraph_partial,
else => {
id = .hash;
break;
},
},
.hash_hash_digraph_partial => switch (c) {
':' => {
id = .hash_hash;
self.index += 1;
break;
},
else => {
id = .hash;
self.index -= 1; // re-tokenize the percent
break;
},
},
.slash => switch (c) {
'/' => state = .line_comment,
'*' => state = .multi_line_comment,
'=' => {
id = .slash_equal;
self.index += 1;
break;
},
else => {
id = .slash;
break;
},
},
.line_comment => switch (c) {
'\n' => {
if (self.langopts.preserve_comments) {
id = .comment;
break;
}
self.index -= 1;
state = .start;
},
else => {},
},
.multi_line_comment => switch (c) {
'*' => state = .multi_line_comment_asterisk,
'\n' => self.line += 1,
else => {},
},
.multi_line_comment_asterisk => switch (c) {
'/' => {
if (self.langopts.preserve_comments) {
self.index += 1;
id = .comment;
break;
}
state = .multi_line_comment_done;
},
'\n' => {
self.line += 1;
state = .multi_line_comment;
},
'*' => {},
else => state = .multi_line_comment,
},
.multi_line_comment_done => switch (c) {
'\n' => {
start = self.index;
id = .nl;
self.index += 1;
self.line += 1;
break;
},
'\r' => unreachable,
'\t', '\x0B', '\x0C', ' ' => {
start = self.index;
state = .whitespace;
},
else => {
id = .whitespace;
break;
},
},
.pp_num => switch (c) {
'a'...'d',
'A'...'D',
'f'...'o',
'F'...'O',
'q'...'z',
'Q'...'Z',
'0'...'9',
'_',
'.',
=> {},
'e', 'E', 'p', 'P' => state = .pp_num_exponent,
'\'' => if (self.langopts.standard.atLeast(.c23)) {
state = .pp_num_digit_separator;
} else {
id = .pp_num;
break;
},
else => {
id = .pp_num;
break;
},
},
.pp_num_digit_separator => switch (c) {
'a'...'d',
'A'...'D',
'f'...'o',
'F'...'O',
'q'...'z',
'Q'...'Z',
'0'...'9',
'_',
=> state = .pp_num,
else => {
self.index -= 1;
id = .pp_num;
break;
},
},
.pp_num_exponent => switch (c) {
'a'...'o',
'q'...'z',
'A'...'O',
'Q'...'Z',
'0'...'9',
'_',
'.',
'+',
'-',
=> state = .pp_num,
'p', 'P' => {},
else => {
id = .pp_num;
break;
},
},
}
} else if (self.index == self.buf.len) {
switch (state) {
.start => {},
.line_comment => if (self.langopts.preserve_comments) {
id = .comment;
},
.u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
.extended_identifier => id = .extended_identifier,
.period2 => {
self.index -= 1;
id = .period;
},
.multi_line_comment,
.multi_line_comment_asterisk,
=> id = .unterminated_comment,
.char_escape_sequence, .char_literal, .char_literal_start => id = .unterminated_char_literal,
.string_escape_sequence, .string_literal => id = .unterminated_string_literal,
.whitespace => id = .whitespace,
.multi_line_comment_done => id = .whitespace,
.equal => id = .equal,
.bang => id = .bang,
.minus => id = .minus,
.slash => id = .slash,
.ampersand => id = .ampersand,
.hash => id = .hash,
.period => id = .period,
.pipe => id = .pipe,
.angle_bracket_angle_bracket_right => id = .angle_bracket_angle_bracket_right,
.angle_bracket_right => id = .angle_bracket_right,
.angle_bracket_angle_bracket_left => id = .angle_bracket_angle_bracket_left,
.angle_bracket_left => id = .angle_bracket_left,
.plus => id = .plus,
.colon => id = .colon,
.percent => id = .percent,
.caret => id = .caret,
.asterisk => id = .asterisk,
.hash_digraph => id = .hash,
.hash_hash_digraph_partial => {
id = .hash;
self.index -= 1; // re-tokenize the percent
},
.pp_num, .pp_num_exponent, .pp_num_digit_separator => id = .pp_num,
}
}
return .{
.id = id,
.start = start,
.end = self.index,
.line = self.line,
.source = self.source,
};
}
/// Returns the next token that is neither whitespace nor a comment.
pub fn nextNoWS(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        switch (candidate.id) {
            // Keep pulling tokens until something significant shows up.
            .whitespace, .comment => continue,
            else => return candidate,
        }
    }
}
/// Returns the next token that is not whitespace.
/// Only `.whitespace` tokens are skipped; a `.comment` token is returned to the caller.
pub fn nextNoWSComments(self: *Tokenizer) Token {
    while (true) {
        const candidate = self.next();
        if (candidate.id != .whitespace) return candidate;
    }
}
/// Try to tokenize a '::' even if not supported by the current language standard.
///
/// Fetches the next token with `nextNoWS` (so whitespace and comments are skipped).
/// If that token is a `.colon` and the byte immediately after it in the buffer is
/// another ':', that byte is consumed and the token is returned as `.colon_colon`.
/// Any other token is returned unchanged.
pub fn colonColon(self: *Tokenizer) Token {
    var tok = self.nextNoWS();
    if (tok.id != .colon) return tok;

    if (self.index < self.buf.len and self.buf[self.index] == ':') {
        self.index += 1;
        tok.id = .colon_colon;
        // Extend the span over the second colon so that `end` agrees with the
        // tokenizer position, as it does for every token produced by `next`.
        tok.end = self.index;
    }
    return tok;
}
test "operators" {
    // One row of expected ids per input line; each input line ends with a newline token.
    const input =
        \\ ! != | || |= = ==
        \\ ( ) { } [ ] . .. ...
        \\ ^ ^= + ++ += - -- -=
        \\ * *= % %= -> : ; / /=
        \\ , & && &= ? < <= <<
        \\ <<= > >= >> >>= ~ # ##
        \\
    ;
    const expected = [_]Token.Id{
        // ! != | || |= = ==
        .bang,                                   .bang_equal,          .pipe,                     .pipe_pipe,                        .pipe_equal,                              .equal,  .equal_equal, .nl,
        // ( ) { } [ ] . .. ...   (".." lexes as two periods)
        .l_paren,                                .r_paren,             .l_brace,                  .r_brace,                          .l_bracket,                               .r_bracket,
        .period,                                 .period,              .period,                   .ellipsis,                         .nl,
        // ^ ^= + ++ += - -- -=
        .caret,                                  .caret_equal,         .plus,                     .plus_plus,                        .plus_equal,                              .minus,  .minus_minus, .minus_equal,
        .nl,
        // * *= % %= -> : ; / /=
        .asterisk,                               .asterisk_equal,      .percent,                  .percent_equal,                    .arrow,                                   .colon,  .semicolon,   .slash,
        .slash_equal,                            .nl,
        // , & && &= ? < <= <<
        .comma,                                  .ampersand,           .ampersand_ampersand,      .ampersand_equal,                  .question_mark,
        .angle_bracket_left,                     .angle_bracket_left_equal, .angle_bracket_angle_bracket_left, .nl,
        // <<= > >= >> >>= ~ # ##
        .angle_bracket_angle_bracket_left_equal, .angle_bracket_right, .angle_bracket_right_equal, .angle_bracket_angle_bracket_right,
        .angle_bracket_angle_bracket_right_equal, .tilde,              .hash,                     .hash_hash,                        .nl,
    };
    try expectTokens(input, &expected);
}
test "keywords" {
    // Each row of `expected` corresponds to one line of `input`, terminated by `.nl`.
    const input =
        \\auto __auto_type break case char const continue default do
        \\double else enum extern float for goto if int
        \\long register return short signed sizeof static
        \\struct switch typedef union unsigned void volatile
        \\while _Bool _Complex _Imaginary inline restrict _Alignas
        \\_Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        \\__attribute __attribute__
        \\
    ;
    const expected = [_]Token.Id{
        // auto __auto_type break case char const continue default do
        .keyword_auto,       .keyword_auto_type, .keyword_break,   .keyword_case,      .keyword_char,
        .keyword_const,      .keyword_continue,  .keyword_default, .keyword_do,        .nl,
        // double else enum extern float for goto if int
        .keyword_double,     .keyword_else,      .keyword_enum,    .keyword_extern,    .keyword_float,
        .keyword_for,        .keyword_goto,      .keyword_if,      .keyword_int,       .nl,
        // long register return short signed sizeof static
        .keyword_long,       .keyword_register,  .keyword_return,  .keyword_short,     .keyword_signed,
        .keyword_sizeof,     .keyword_static,    .nl,
        // struct switch typedef union unsigned void volatile
        .keyword_struct,     .keyword_switch,    .keyword_typedef, .keyword_union,     .keyword_unsigned,
        .keyword_void,       .keyword_volatile,  .nl,
        // while _Bool _Complex _Imaginary inline restrict _Alignas
        .keyword_while,      .keyword_bool,      .keyword_complex, .keyword_imaginary, .keyword_inline,
        .keyword_restrict,   .keyword_alignas,   .nl,
        // _Alignof _Atomic _Generic _Noreturn _Static_assert _Thread_local
        .keyword_alignof,    .keyword_atomic,    .keyword_generic, .keyword_noreturn,  .keyword_static_assert,
        .keyword_thread_local, .nl,
        // __attribute __attribute__
        .keyword_attribute1, .keyword_attribute2, .nl,
    };
    try expectTokens(input, &expected);
}
test "preprocessor keywords" {
    // Every directive line is expected to lex as `#`, the directive keyword, then a newline.
    const input =
        \\#include
        \\#include_next
        \\#embed
        \\#define
        \\#ifdef
        \\#ifndef
        \\#error
        \\#pragma
        \\
    ;
    const expected = [_]Token.Id{
        .hash, .keyword_include,      .nl,
        .hash, .keyword_include_next, .nl,
        .hash, .keyword_embed,        .nl,
        .hash, .keyword_define,       .nl,
        .hash, .keyword_ifdef,        .nl,
        .hash, .keyword_ifndef,       .nl,
        .hash, .keyword_error,        .nl,
        .hash, .keyword_pragma,       .nl,
    };
    try expectTokens(input, &expected);
}
test "line continuation" {
    // A backslash directly before a newline joins the two physical lines, so no
    // `.nl` token is expected at those positions.
    const input =
        \\#define foo \
        \\ bar
        \\"foo\
        \\ bar"
        \\#define "foo"
        \\ "bar"
        \\#define "foo" \
        \\ "bar"
    ;
    const expected = [_]Token.Id{
        // #define foo \ <newline> bar
        .hash,           .keyword_define, .identifier,     .identifier, .nl,
        // "foo\ <newline> bar"
        .string_literal, .nl,
        // #define "foo" <newline> "bar"
        .hash,           .keyword_define, .string_literal, .nl,
        .string_literal, .nl,
        // #define "foo" \ <newline> "bar"   (input ends without a trailing newline)
        .hash,           .keyword_define, .string_literal, .string_literal,
    };
    try expectTokens(input, &expected);
}
test "string prefix" {
    // One literal per line; each expected row is the literal's id followed by `.nl`.
    const input =
        \\"foo"
        \\u"foo"
        \\u8"foo"
        \\U"foo"
        \\L"foo"
        \\'foo'
        \\u8'A'
        \\u'foo'
        \\U'foo'
        \\L'foo'
        \\
    ;
    const expected = [_]Token.Id{
        .string_literal,        .nl,
        .string_literal_utf_16, .nl,
        .string_literal_utf_8,  .nl,
        .string_literal_utf_32, .nl,
        .string_literal_wide,   .nl,
        .char_literal,          .nl,
        .char_literal_utf_8,    .nl,
        .char_literal_utf_16,   .nl,
        .char_literal_utf_32,   .nl,
        .char_literal_wide,     .nl,
    };
    try expectTokens(input, &expected);
}
test "num suffixes" {
    // Every number, whatever its suffix, is expected to come out as a single `.pp_num`.
    const input =
        \\ 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        \\ 0l 0lu 0ll 0llu 0
        \\ 1u 1ul 1ull 1
        \\ 1.0i 1.0I
        \\ 1.0if 1.0If 1.0fi 1.0fI
        \\ 1.0il 1.0Il 1.0li 1.0lI
        \\
    ;
    const expected = [_]Token.Id{
        // 1.0f 1.0L 1.0 .0 1. 0x1p0f 0X1p0
        .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        // 0l 0lu 0ll 0llu 0
        .pp_num, .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        // 1u 1ul 1ull 1
        .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        // 1.0i 1.0I
        .pp_num, .pp_num, .nl,
        // 1.0if 1.0If 1.0fi 1.0fI
        .pp_num, .pp_num, .pp_num, .pp_num, .nl,
        // 1.0il 1.0Il 1.0li 1.0lI
        .pp_num, .pp_num, .pp_num, .pp_num, .nl,
    };
    try expectTokens(input, &expected);
}
test "comments" {
    // With the language options left untouched, only the newline after the
    // line comment is expected as a token.
    const plain_input =
        \\//foo
        \\#foo
    ;
    const plain_expected = [_]Token.Id{ .nl, .hash, .identifier };
    try expectTokens(plain_input, &plain_expected);

    // With `preserve_comments` set, each line comment is expected as a `.comment` token.
    const preserved_input =
        \\//foo
        \\void
        \\//bar
    ;
    const preserved_expected = [_]Token.Id{
        .comment,      .nl,
        .keyword_void, .nl,
        .comment,
    };
    const keep_comments: LangOpts = .{ .preserve_comments = true };
    try expectTokensExtra(preserved_input, &preserved_expected, keep_comments);
}
test "extended identifiers" {
    // Table of (input, expected ids); cases run in the listed order.
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    const ext_only: []const Token.Id = &.{.extended_identifier};
    const num_then_ext: []const Token.Id = &.{ .pp_num, .extended_identifier };
    const string_only: []const Token.Id = &.{.string_literal};

    const cases = [_]Case{
        .{ .input = "𝓪𝓻𝓸𝓬𝓬", .expected = ext_only },
        .{ .input = "u𝓪𝓻𝓸𝓬𝓬", .expected = ext_only },
        .{ .input = "u8𝓪𝓻𝓸𝓬𝓬", .expected = ext_only },
        .{ .input = "U𝓪𝓻𝓸𝓬𝓬", .expected = ext_only },
        .{ .input = "L𝓪𝓻𝓸𝓬𝓬", .expected = ext_only },
        .{ .input = "1™", .expected = num_then_ext },
        .{ .input = "1.™", .expected = num_then_ext },
        .{ .input = "..™", .expected = &.{ .period, .period, .extended_identifier } },
        .{ .input = "0™", .expected = num_then_ext },
        .{ .input = "0b\u{E0000}", .expected = num_then_ext },
        .{ .input = "0b0\u{E0000}", .expected = num_then_ext },
        .{ .input = "01\u{E0000}", .expected = num_then_ext },
        .{ .input = "010\u{E0000}", .expected = num_then_ext },
        .{ .input = "0x\u{E0000}", .expected = num_then_ext },
        .{ .input = "0x0\u{E0000}", .expected = num_then_ext },
        .{ .input = "\"\\0\u{E0000}\"", .expected = string_only },
        .{ .input = "\"\\x\u{E0000}\"", .expected = string_only },
        .{ .input = "\"\\u\u{E0000}\"", .expected = string_only },
        .{ .input = "1e\u{E0000}", .expected = num_then_ext },
        .{ .input = "1e1\u{E0000}", .expected = num_then_ext },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.expected);
    }
}
test "digraphs" {
    const Case = struct {
        input: []const u8,
        expected: []const Token.Id,
    };
    const cases = [_]Case{
        // Each digraph spelling maps to the id of the punctuator it stands for.
        .{
            .input = "%:<::><%%>%:%:",
            .expected = &.{ .hash, .l_bracket, .r_bracket, .l_brace, .r_brace, .hash_hash },
        },
        // The same characters inside a string literal stay part of the literal.
        .{
            .input = "\"%:<::><%%>%:%:\"",
            .expected = &.{.string_literal},
        },
        // "%:%" not followed by ':' yields a hash and then a separate percent.
        .{
            .input = "%:%42 %:%",
            .expected = &.{ .hash, .percent, .pp_num, .hash, .percent },
        },
    };
    for (cases) |case| {
        try expectTokens(case.input, case.expected);
    }
}
test "C23 keywords" {
    // Tokenized with the language standard set to C23.
    const input = "true false alignas alignof bool static_assert thread_local nullptr typeof_unqual";
    const expected = [_]Token.Id{
        .keyword_true,
        .keyword_false,
        .keyword_c23_alignas,
        .keyword_c23_alignof,
        .keyword_c23_bool,
        .keyword_c23_static_assert,
        .keyword_c23_thread_local,
        .keyword_nullptr,
        .keyword_typeof_unqual,
    };
    const c23_opts: LangOpts = .{ .standard = .c23 };
    try expectTokensExtra(input, &expected, c23_opts);
}
test "Universal character names" {
    // Inputs are grouped by the token sequence they are expected to produce.
    // Groups and the inputs inside them are checked in the listed order.
    const Group = struct {
        expected: []const Token.Id,
        inputs: []const []const u8,
    };
    const groups = [_]Group{
        // Backslash at the very start of the input.
        .{ .expected = &.{.invalid}, .inputs = &.{"\\"} },
        .{ .expected = &.{ .invalid, .identifier }, .inputs = &.{"\\g"} },
        .{
            .expected = &.{ .incomplete_ucn, .identifier },
            .inputs = &.{ "\\u", "\\ua", "\\U9", "\\ug", "\\uag" },
        },
        // Same inputs followed by a trailing space.
        .{ .expected = &.{ .invalid, .eof }, .inputs = &.{"\\ "} },
        .{ .expected = &.{ .invalid, .identifier, .eof }, .inputs = &.{"\\g "} },
        .{
            .expected = &.{ .incomplete_ucn, .identifier, .eof },
            .inputs = &.{ "\\u ", "\\ua ", "\\U9 ", "\\ug ", "\\uag " },
        },
        // Backslash directly after an identifier.
        .{ .expected = &.{ .identifier, .invalid }, .inputs = &.{"a\\"} },
        .{ .expected = &.{ .identifier, .invalid, .identifier }, .inputs = &.{"a\\g"} },
        .{
            .expected = &.{ .identifier, .incomplete_ucn, .identifier },
            .inputs = &.{ "a\\u", "a\\ua", "a\\U9", "a\\ug", "a\\uag" },
        },
        // After an identifier, followed by a trailing space.
        .{ .expected = &.{ .identifier, .invalid, .eof }, .inputs = &.{"a\\ "} },
        .{ .expected = &.{ .identifier, .invalid, .identifier, .eof }, .inputs = &.{"a\\g "} },
        .{
            .expected = &.{ .identifier, .incomplete_ucn, .identifier, .eof },
            .inputs = &.{ "a\\u ", "a\\ua ", "a\\U9 ", "a\\ug ", "a\\uag " },
        },
    };
    for (groups) |group| {
        for (group.inputs) |input| {
            try expectTokens(input, group.expected);
        }
    }
}
test "Tokenizer fuzz test" {
    const Context = struct {
        // Tokenizes the fuzz input to the end and checks that every call to
        // `next` that does not return `.eof` moved the tokenizer index forward.
        fn testOne(_: @This(), input_bytes: []const u8) anyerror!void {
            var arena = std.heap.ArenaAllocator.init(std.testing.allocator);
            defer arena.deinit();

            var comp = Compilation.init(std.testing.allocator, arena.allocator(), undefined, std.fs.cwd());
            defer comp.deinit();

            const source = try comp.addSourceFromBuffer("fuzz.c", input_bytes);
            var tokenizer = Tokenizer{
                .buf = source.buf,
                .source = source.id,
                .langopts = comp.langopts,
            };

            // `index_before` always holds the index as it was just before the
            // most recent call to `next` in the loop condition.
            var index_before = tokenizer.index;
            while (tokenizer.next().id != .eof) : (index_before = tokenizer.index) {
                try std.testing.expect(index_before < tokenizer.index); // ensure that the tokenizer always makes progress
            }
        }
    };
    return std.testing.fuzz(Context{}, Context.testOne, .{});
}
/// Tokenizes `contents` and checks that its non-whitespace token ids equal
/// `expected_tokens`, in order, and that nothing but whitespace follows them
/// before `.eof`.
///
/// `langopts`, when non-null, replaces the language options of the
/// `Compilation` created for the check.
///
/// Returns `error.TokensDoNotEqual` on the first mismatching id. A token other
/// than `.eof` after the expected ones fails through `std.testing.expect`.
fn expectTokensExtra(contents: []const u8, expected_tokens: []const Token.Id, langopts: ?LangOpts) !void {
    var arena: std.heap.ArenaAllocator = .init(std.testing.allocator);
    defer arena.deinit();
    var comp = Compilation.init(std.testing.allocator, arena.allocator(), undefined, std.fs.cwd());
    defer comp.deinit();
    if (langopts) |provided| {
        comp.langopts = provided;
    }

    const source = try comp.addSourceFromBuffer("path", contents);
    var tokenizer: Tokenizer = .{
        .buf = source.buf,
        .source = source.id,
        .langopts = comp.langopts,
    };

    for (expected_tokens) |expected_token_id| {
        // Whitespace is never part of the expectation; skip over it.
        var token = tokenizer.next();
        while (token.id == .whitespace) token = tokenizer.next();

        if (token.id != expected_token_id) {
            std.debug.print("expected {s}, found {s}\n", .{ @tagName(expected_token_id), @tagName(token.id) });
            return error.TokensDoNotEqual;
        }
    }

    // Treat trailing whitespace like whitespace between tokens, so input that
    // ends in blanks does not need an explicit `.eof` in `expected_tokens`.
    var last_token = tokenizer.next();
    while (last_token.id == .whitespace) last_token = tokenizer.next();
    if (last_token.id != .eof) {
        std.debug.print("expected eof, found {s}\n", .{@tagName(last_token.id)});
    }
    try std.testing.expect(last_token.id == .eof);
}
/// Shorthand for `expectTokensExtra` with no language-option override (`null`).
fn expectTokens(contents: []const u8, expected_tokens: []const Token.Id) !void {
    try expectTokensExtra(contents, expected_tokens, null);
}