From d75697a6a3e2c8d96819c365dcb5690d4d8028e9 Mon Sep 17 00:00:00 2001 From: Vexu Date: Sat, 4 Jan 2020 02:44:23 +0200 Subject: [PATCH] std-c tokenizer keywords --- lib/std/c/tokenizer.zig | 194 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 184 insertions(+), 10 deletions(-) diff --git a/lib/std/c/tokenizer.zig b/lib/std/c/tokenizer.zig index a5f2ad770d..0a4a217551 100644 --- a/lib/std/c/tokenizer.zig +++ b/lib/std/c/tokenizer.zig @@ -1,5 +1,5 @@ const std = @import("std"); -const expect = std.testing.expect; +const mem = std.mem; pub const Source = struct { buffer: []const u8, @@ -7,11 +7,19 @@ pub const Source = struct { }; pub const Token = struct { - id: union(enum) { + id: Id, + start: usize, + end: usize, + source: *Source, + + pub const Id = union(enum) { Invalid, Eof, Nl, Identifier, + + /// special case for #include <...> + MacroString, StringLiteral: StrKind, CharLiteral: StrKind, IntegerLiteral: NumSuffix, @@ -68,10 +76,160 @@ pub const Token = struct { MultiLineComment, Hash, HashHash, - }, - start: usize, - end: usize, - source: *Source, + + Keyword_auto, + Keyword_break, + Keyword_case, + Keyword_char, + Keyword_const, + Keyword_continue, + Keyword_default, + Keyword_do, + Keyword_double, + Keyword_else, + Keyword_enum, + Keyword_extern, + Keyword_float, + Keyword_for, + Keyword_goto, + Keyword_if, + Keyword_int, + Keyword_long, + Keyword_register, + Keyword_return, + Keyword_short, + Keyword_signed, + Keyword_sizeof, + Keyword_static, + Keyword_struct, + Keyword_switch, + Keyword_typedef, + Keyword_union, + Keyword_unsigned, + Keyword_void, + Keyword_volatile, + Keyword_while, + + // ISO C99 + Keyword_bool, + Keyword_complex, + Keyword_imaginary, + Keyword_inline, + Keyword_restrict, + + // ISO C11 + Keyword_alignas, + Keyword_alignof, + Keyword_atomic, + Keyword_generic, + Keyword_noreturn, + Keyword_static_assert, + Keyword_thread_local, + + // Preprocessor + Keyword_include, + Keyword_define, + Keyword_ifdef, + Keyword_ifndef, + Keyword_error, + Keyword_pragma, + }; + + pub const Keyword = struct { + bytes: []const u8, + id: Id, + hash: u32, + + fn init(bytes: []const u8, id: Id) Keyword { + @setEvalBranchQuota(2000); + return .{ + .bytes = bytes, + .id = id, + .hash = std.hash_map.hashString(bytes), + }; + } + }; + + // TODO extensions + pub const keywords = [_]Keyword{ + Keyword.init("auto", .Keyword_auto), + Keyword.init("break", .Keyword_break), + Keyword.init("case", .Keyword_case), + Keyword.init("char", .Keyword_char), + Keyword.init("const", .Keyword_const), + Keyword.init("continue", .Keyword_continue), + Keyword.init("default", .Keyword_default), + Keyword.init("do", .Keyword_do), + Keyword.init("double", .Keyword_double), + Keyword.init("else", .Keyword_else), + Keyword.init("enum", .Keyword_enum), + Keyword.init("extern", .Keyword_extern), + Keyword.init("float", .Keyword_float), + Keyword.init("for", .Keyword_for), + Keyword.init("goto", .Keyword_goto), + Keyword.init("if", .Keyword_if), + Keyword.init("int", .Keyword_int), + Keyword.init("long", .Keyword_long), + Keyword.init("register", .Keyword_register), + Keyword.init("return", .Keyword_return), + Keyword.init("short", .Keyword_short), + Keyword.init("signed", .Keyword_signed), + Keyword.init("sizeof", .Keyword_sizeof), + Keyword.init("static", .Keyword_static), + Keyword.init("struct", .Keyword_struct), + Keyword.init("switch", .Keyword_switch), + Keyword.init("typedef", .Keyword_typedef), + Keyword.init("union", .Keyword_union), + Keyword.init("unsigned", .Keyword_unsigned), + Keyword.init("void", .Keyword_void), + Keyword.init("volatile", .Keyword_volatile), + Keyword.init("while", .Keyword_while), + + // ISO C99 + Keyword.init("_Bool", .Keyword_bool), + Keyword.init("_Complex", .Keyword_complex), + Keyword.init("_Imaginary", .Keyword_imaginary), + Keyword.init("inline", .Keyword_inline), + Keyword.init("restrict", .Keyword_restrict), + + // ISO C11 + Keyword.init("_Alignas", .Keyword_alignas), + Keyword.init("_Alignof", .Keyword_alignof), + Keyword.init("_Atomic", .Keyword_atomic), + Keyword.init("_Generic", .Keyword_generic), + Keyword.init("_Noreturn", .Keyword_noreturn), + Keyword.init("_Static_assert", .Keyword_static_assert), + Keyword.init("_Thread_local", .Keyword_thread_local), + + // Preprocessor + Keyword.init("include", .Keyword_include), + Keyword.init("define", .Keyword_define), + Keyword.init("ifdef", .Keyword_ifdef), + Keyword.init("ifndef", .Keyword_ifndef), + Keyword.init("error", .Keyword_error), + Keyword.init("pragma", .Keyword_pragma), + }; + + // TODO perfect hash at comptime + pub fn getKeyword(bytes: []const u8, macro: bool) ?Id { + var hash = std.hash_map.hashString(bytes); + for (keywords) |kw| { + if (kw.hash == hash and mem.eql(u8, kw.bytes, bytes)) { + switch (kw.id) { + .Keyword_include, + .Keyword_define, + .Keyword_ifdef, + .Keyword_ifndef, + .Keyword_error, + .Keyword_pragma, + => if (!macro) return null, + else => {}, + } + return kw.id; + } + } + return null; + } pub const NumSuffix = enum { None, @@ -95,6 +253,7 @@ pub const Token = struct { pub const Tokenizer = struct { source: *Source, index: usize = 0, + prev_tok_id: @TagType(Token.Id), pub fn next(self: *Tokenizer) Token { const start_index = self.index; @@ -124,6 +283,9 @@ pub const Tokenizer = struct { Percent, Asterisk, Plus, + + /// special case for #include <...> + MacroString, AngleBracketLeft, AngleBracketAngleBracketLeft, AngleBracketRight, @@ -189,7 +351,6 @@ pub const Tokenizer = struct { }, 'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => { state = .Identifier; - result.id = .Identifier; }, '=' => { state = .Equal; @@ -250,7 +411,10 @@ pub const Tokenizer = struct { state = .Plus; }, '<' => { - state = .AngleBracketLeft; + if (self.prev_tok_id == .Keyword_include) + state = .MacroString + else + state = .AngleBracketLeft; }, '>' => { state = .AngleBracketRight; @@ -442,7 +606,7 @@ pub const Tokenizer = struct { .Identifier => switch (c) { 'a'...'z', 'A'...'Z', '_', '0'...'9' => {}, else => { - result.id = .Identifier; + result.id = Token.getKeyword(self.source.buffer[result.start..self.index], self.prev_tok_id == .Hash) orelse .Identifier; break; }, }, @@ -522,6 +686,14 @@ pub const Tokenizer = struct { break; }, }, + .MacroString => switch (c) { + '>' => { + result.id = .MacroString; + self.index += 1; + break; + }, + else => {}, + }, .AngleBracketLeft => switch (c) { '<' => { state = .AngleBracketAngleBracketLeft; @@ -859,7 +1031,7 @@ pub const Tokenizer = struct { switch (state) { .Start => {}, .u, .u8, .U, .L, .Identifier => { - result.id = .Identifier; + result.id = Token.getKeyword(self.source.buffer[result.start..self.index], self.prev_tok_id == .Hash) orelse .Identifier; }, .Cr, @@ -876,6 +1048,7 @@ pub const Tokenizer = struct { .FloatFractionHex, .FloatExponent, .FloatExponentDigits, + .MacroString, => result.id = .Invalid, .IntegerLiteralOct, @@ -910,6 +1083,7 @@ pub const Tokenizer = struct { } } + self.prev_tok_id = result.id; result.end = self.index; return result; }