// From https://github.com/ziglang/zig/blob/master/lib/std/zig/tokenizer.zig

const std = @import("std");
const Loc = @import("../dataStructure/loc.zig");

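/// One lexical unit of the input: a syntactic tag plus the byte range
/// (loc) of its text in the source buffer.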
pub const Token = struct {
    tag: Tag,
    loc: Loc,

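    /// Comptime-built map from keyword spelling to tag; anything absent
    /// from this map lexes as a plain identifier.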
    pub const keywords = std.StaticStringMap(Tag).initComptime(.{
        .{ "run", .keyword_run },
        .{ "help", .keyword_help },
        .{ "describe", .keyword_describe },
        .{ "init", .keyword_init },
        .{ "schema", .keyword_schema },
        .{ "quit", .keyword_quit },
        .{ "db", .keyword_db },
        .{ "new", .keyword_new },
        .{ "metrics", .keyword_metrics },
        .{ "use", .keyword_use },
        .{ "state", .keyword_state },
        .{ "dump", .keyword_dump },
        .{ "csv", .keyword_csv },
        .{ "json", .keyword_json },
        .{ "zid", .keyword_zid },
    });

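    /// Returns the keyword tag for bytes, or null when it is not a keyword.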
    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }

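    /// Every kind of token this tokenizer can emit.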
    pub const Tag = enum {
        eof,
        invalid,

        keyword_run,
        keyword_help,
        keyword_describe,
        keyword_schema,
        keyword_init,
        keyword_quit,
        keyword_db,
        keyword_new,
        keyword_metrics,
        keyword_use,
        keyword_state,
        keyword_dump,
        keyword_csv,
        keyword_json,
        keyword_zid,

        string_literal,
        identifier,
    };
};

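/// Streaming tokenizer over a 0-terminated buffer: call next() repeatedly
/// until it returns an .eof token.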
pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,

    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present.
        return .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

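    /// States of the scanning state machine driven by next().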
    const State = enum {
        start,
        invalid,
        identifier,
        string_literal,
        string_literal_backslash,
    };

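    /// Returns the source bytes a token covers.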
    pub fn getTokenSlice(self: *Tokenizer, token: Token) []const u8 {
        return self.buffer[token.loc.start..token.loc.end];
    }

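    /// Scans and returns the next token, advancing index past it.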
    pub fn next(self: *Tokenizer) Token {
        var state: State = .start;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
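        // Walk the buffer one byte per iteration; the 0 sentinel guarantees
        // the scan stops without a separate bounds check.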
        while (true) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    0 => {
                        if (self.index == self.buffer.len) return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                        state = .invalid;
                    },
                    ' ', '\n', '\t', '\r' => {
                        result.loc.start = self.index + 1;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .identifier;
                        result.tag = .identifier;
                    },
                    '"' => {
                        state = .string_literal;
                        result.tag = .string_literal;
                    },
                    else => {
                        state = .invalid;
                    },
                },

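                // Entered after an unexpected byte; the token is finalized
                // as .invalid on the very next iteration.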
                .invalid => {
                    result.tag = .invalid;
                    break;
                },

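                // '.' and '/' are also accepted mid-identifier, presumably so
                // that values such as file paths stay a single token.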
                .identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9', '.', '/' => continue,
                    else => {
                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
                            result.tag = tag;
                        }
                        break;
                    },
                },

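                // The closing quote is consumed, so loc covers both quotes.
                // A raw newline, control byte, or end of input inside the
                // literal turns the token into .invalid.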
                .string_literal => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            state = .invalid;
                            continue;
                        }
                        result.tag = .invalid;
                        break;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .string_literal_backslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },

                .string_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => {
                        state = .string_literal;
                    },
                },
            }
        }

        result.loc.end = self.index;
        return result;
    }
};

test "Basics" {
|
|
try testTokenize("help", &.{.keyword_help});
|
|
try testTokenize("run \"Hello world\"", &.{ .keyword_run, .string_literal });
|
|
}
|
|
|
|
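// A few extra cases sketched from the states handled above: BOM skipping in
// init, backslash escapes inside string literals, and an unterminated
// literal coming back as .invalid. These are illustrative additions, not
// part of the original suite.
test "BOM, escapes and unterminated literals" {
    try testTokenize("\xEF\xBB\xBFhelp", &.{.keyword_help});
    try testTokenize("run \"say \\\"hi\\\"\"", &.{ .keyword_run, .string_literal });
    try testTokenize("run \"unterminated", &.{ .keyword_run, .invalid });
}
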
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_token_tags) |expected_token_tag| {
        const token = tokenizer.next();
        try std.testing.expectEqual(expected_token_tag, token.tag);
    }
    // Last token should always be eof, even when the last token was invalid,
    // in which case the tokenizer is in an invalid state, which can only be
    // recovered by opinionated means outside the scope of this implementation.
    const last_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
    try std.testing.expectEqual(source.len, last_token.loc.end);
}
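
// Illustrative check of getTokenSlice and getKeyword; the inputs here are
// arbitrary examples chosen for this sketch.
test "getTokenSlice and getKeyword" {
    var tokenizer = Tokenizer.init("describe \"a b\"");
    const first = tokenizer.next();
    try std.testing.expectEqualStrings("describe", tokenizer.getTokenSlice(first));
    const second = tokenizer.next();
    try std.testing.expectEqualStrings("\"a b\"", tokenizer.getTokenSlice(second));
    try std.testing.expectEqual(Token.Tag.keyword_describe, Token.getKeyword("describe").?);
    try std.testing.expect(Token.getKeyword("not_a_keyword") == null);
}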