std-c tokenizer keywords

This commit is contained in:
Vexu 2020-01-04 02:44:23 +02:00
parent 26bf410b06
commit d75697a6a3
No known key found for this signature in database
GPG Key ID: 59AEB8936E16A6AC

View File

@ -1,5 +1,5 @@
const std = @import("std");
const expect = std.testing.expect;
const mem = std.mem;
pub const Source = struct {
buffer: []const u8,
@ -7,11 +7,19 @@ pub const Source = struct {
};
pub const Token = struct {
id: union(enum) {
id: Id,
start: usize,
end: usize,
source: *Source,
pub const Id = union(enum) {
Invalid,
Eof,
Nl,
Identifier,
/// special case for #include <...>
MacroString,
StringLiteral: StrKind,
CharLiteral: StrKind,
IntegerLiteral: NumSuffix,
@ -68,10 +76,160 @@ pub const Token = struct {
MultiLineComment,
Hash,
HashHash,
},
start: usize,
end: usize,
source: *Source,
Keyword_auto,
Keyword_break,
Keyword_case,
Keyword_char,
Keyword_const,
Keyword_continue,
Keyword_default,
Keyword_do,
Keyword_double,
Keyword_else,
Keyword_enum,
Keyword_extern,
Keyword_float,
Keyword_for,
Keyword_goto,
Keyword_if,
Keyword_int,
Keyword_long,
Keyword_register,
Keyword_return,
Keyword_short,
Keyword_signed,
Keyword_sizeof,
Keyword_static,
Keyword_struct,
Keyword_switch,
Keyword_typedef,
Keyword_union,
Keyword_unsigned,
Keyword_void,
Keyword_volatile,
Keyword_while,
// ISO C99
Keyword_bool,
Keyword_complex,
Keyword_imaginary,
Keyword_inline,
Keyword_restrict,
// ISO C11
Keyword_alignas,
Keyword_alignof,
Keyword_atomic,
Keyword_generic,
Keyword_noreturn,
Keyword_static_assert,
Keyword_thread_local,
// Preprocessor
Keyword_include,
Keyword_define,
Keyword_ifdef,
Keyword_ifndef,
Keyword_error,
Keyword_pragma,
};
pub const Keyword = struct {
bytes: []const u8,
id: Id,
hash: u32,
fn init(bytes: []const u8, id: Id) Keyword {
@setEvalBranchQuota(2000);
return .{
.bytes = bytes,
.id = id,
.hash = std.hash_map.hashString(bytes),
};
}
};
// TODO extensions
pub const keywords = [_]Keyword{
Keyword.init("auto", .Keyword_auto),
Keyword.init("break", .Keyword_break),
Keyword.init("case", .Keyword_case),
Keyword.init("char", .Keyword_char),
Keyword.init("const", .Keyword_const),
Keyword.init("continue", .Keyword_continue),
Keyword.init("default", .Keyword_default),
Keyword.init("do", .Keyword_do),
Keyword.init("double", .Keyword_double),
Keyword.init("else", .Keyword_else),
Keyword.init("enum", .Keyword_enum),
Keyword.init("extern", .Keyword_extern),
Keyword.init("float", .Keyword_float),
Keyword.init("for", .Keyword_for),
Keyword.init("goto", .Keyword_goto),
Keyword.init("if", .Keyword_if),
Keyword.init("int", .Keyword_int),
Keyword.init("long", .Keyword_long),
Keyword.init("register", .Keyword_register),
Keyword.init("return", .Keyword_return),
Keyword.init("short", .Keyword_short),
Keyword.init("signed", .Keyword_signed),
Keyword.init("sizeof", .Keyword_sizeof),
Keyword.init("static", .Keyword_static),
Keyword.init("struct", .Keyword_struct),
Keyword.init("switch", .Keyword_switch),
Keyword.init("typedef", .Keyword_typedef),
Keyword.init("union", .Keyword_union),
Keyword.init("unsigned", .Keyword_unsigned),
Keyword.init("void", .Keyword_void),
Keyword.init("volatile", .Keyword_volatile),
Keyword.init("while", .Keyword_while),
// ISO C99
Keyword.init("_Bool", .Keyword_bool),
Keyword.init("_Complex", .Keyword_complex),
Keyword.init("_Imaginary", .Keyword_imaginary),
Keyword.init("inline", .Keyword_inline),
Keyword.init("restrict", .Keyword_restrict),
// ISO C11
Keyword.init("_Alignas", .Keyword_alignas),
Keyword.init("_Alignof", .Keyword_alignof),
Keyword.init("_Atomic", .Keyword_atomic),
Keyword.init("_Generic", .Keyword_generic),
Keyword.init("_Noreturn", .Keyword_noreturn),
Keyword.init("_Static_assert", .Keyword_static_assert),
Keyword.init("_Thread_local", .Keyword_thread_local),
// Preprocessor
Keyword.init("include", .Keyword_include),
Keyword.init("define", .Keyword_define),
Keyword.init("ifdef", .Keyword_ifdef),
Keyword.init("ifndef", .Keyword_ifndef),
Keyword.init("error", .Keyword_error),
Keyword.init("pragma", .Keyword_pragma),
};
// TODO perfect hash at comptime
pub fn getKeyword(bytes: []const u8, macro: bool) ?Id {
var hash = std.hash_map.hashString(bytes);
for (keywords) |kw| {
if (kw.hash == hash and mem.eql(u8, kw.bytes, bytes)) {
switch (kw.id) {
.Keyword_include,
.Keyword_define,
.Keyword_ifdef,
.Keyword_ifndef,
.Keyword_error,
.Keyword_pragma,
=> if (!macro) return null,
else => {},
}
return kw.id;
}
}
return null;
}
pub const NumSuffix = enum {
None,
@ -95,6 +253,7 @@ pub const Token = struct {
pub const Tokenizer = struct {
source: *Source,
index: usize = 0,
prev_tok_id: @TagType(Token.Id),
pub fn next(self: *Tokenizer) Token {
const start_index = self.index;
@ -124,6 +283,9 @@ pub const Tokenizer = struct {
Percent,
Asterisk,
Plus,
/// special case for #include <...>
MacroString,
AngleBracketLeft,
AngleBracketAngleBracketLeft,
AngleBracketRight,
@ -189,7 +351,6 @@ pub const Tokenizer = struct {
},
'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => {
state = .Identifier;
result.id = .Identifier;
},
'=' => {
state = .Equal;
@ -250,7 +411,10 @@ pub const Tokenizer = struct {
state = .Plus;
},
'<' => {
state = .AngleBracketLeft;
if (self.prev_tok_id == .Keyword_include)
state = .MacroString
else
state = .AngleBracketLeft;
},
'>' => {
state = .AngleBracketRight;
@ -442,7 +606,7 @@ pub const Tokenizer = struct {
.Identifier => switch (c) {
'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
else => {
result.id = .Identifier;
result.id = Token.getKeyword(self.source.buffer[result.start..self.index], self.prev_tok_id == .Hash) orelse .Identifier;
break;
},
},
@ -522,6 +686,14 @@ pub const Tokenizer = struct {
break;
},
},
.MacroString => switch (c) {
'>' => {
result.id = .MacroString;
self.index += 1;
break;
},
else => {},
},
.AngleBracketLeft => switch (c) {
'<' => {
state = .AngleBracketAngleBracketLeft;
@ -859,7 +1031,7 @@ pub const Tokenizer = struct {
switch (state) {
.Start => {},
.u, .u8, .U, .L, .Identifier => {
result.id = .Identifier;
result.id = Token.getKeyword(self.source.buffer[result.start..self.index], self.prev_tok_id == .Hash) orelse .Identifier;
},
.Cr,
@ -876,6 +1048,7 @@ pub const Tokenizer = struct {
.FloatFractionHex,
.FloatExponent,
.FloatExponentDigits,
.MacroString,
=> result.id = .Invalid,
.IntegerLiteralOct,
@ -910,6 +1083,7 @@ pub const Tokenizer = struct {
}
}
self.prev_tok_id = result.id;
result.end = self.index;
return result;
}