// ZipponDB/src/tokenizers/ziqlTokenizer.zig

// From https://github.com/ziglang/zig/blob/master/lib/std/zig/tokenizer.zig
const std = @import("std");
/// One lexical token of a ZiQL query, identified by `tag` and locating its
/// source bytes via `loc` (half-open byte range `[start, end)` into the
/// tokenizer's buffer).
pub const Token = struct {
    tag: Tag,
    loc: Loc,

    /// Byte offsets delimiting the token within the source buffer.
    pub const Loc = struct {
        start: usize,
        end: usize,
    };

    /// Case-sensitive keyword table, consulted once a full identifier has
    /// been scanned.
    pub const keywords = std.StaticStringMap(Tag).initComptime(.{
        .{ "GRAB", .keyword_grab },
        .{ "UPDATE", .keyword_update },
        .{ "DELETE", .keyword_delete },
        .{ "ADD", .keyword_add },
        .{ "IN", .keyword_in },
        .{ "null", .keyword_null },
        .{ "__DESCRIBE__", .keyword__describe__ },
    });

    /// Returns the keyword tag for `bytes`, or null when `bytes` is an
    /// ordinary identifier.
    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }

    /// Every kind of token the tokenizer can produce.
    pub const Tag = enum {
        eof,
        invalid,
        keyword_grab,
        keyword_update,
        keyword_delete,
        keyword_add,
        keyword_in,
        keyword_null,
        keyword__describe__,
        string_literal,
        number_literal,
        identifier,
        equal,
        bang,
        pipe,
        l_paren,
        r_paren,
        l_bracket,
        r_bracket,
        l_brace,
        r_brace,
        semicolon,
        comma,
        angle_bracket_left,
        angle_bracket_right,
        angle_bracket_left_equal,
        angle_bracket_right_equal,
        equal_angle_bracket_right,
        period,
        bang_equal,
    };
};
/// Streaming tokenizer for ZiQL queries.
///
/// The input must be a null-terminated slice (`[:0]const u8`); the sentinel
/// byte doubles as the end-of-input marker. Call `next` repeatedly; after the
/// input is exhausted it keeps returning an `.eof` token whose `loc` is the
/// empty range at `buffer.len`.
pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,

    /// Returns the raw source bytes covered by `token`.
    pub fn getTokenSlice(self: *Tokenizer, token: Token) []const u8 {
        return self.buffer[token.loc.start..token.loc.end];
    }

    /// Initializes a tokenizer at the start of `buffer`.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present.
        return .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    /// States of the hand-written scanning automaton in `next`.
    const State = enum {
        start,
        invalid,
        string_literal,
        identifier,
        equal,
        bang,
        angle_bracket_left,
        angle_bracket_right,
        string_literal_backslash,
        int_exponent,
        int_period,
        float,
        float_exponent,
        int,
    };

    /// Scans and returns the next token.
    ///
    /// Unrecognized or control bytes no longer panic: the tokenizer consumes
    /// the bad run up to the next newline (or end of input) and returns a
    /// single `.invalid` token covering it, so callers can report the bad
    /// span and continue.
    pub fn next(self: *Tokenizer) Token {
        var state: State = .start;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        while (true) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    0 => {
                        // A real 0 byte only terminates input at buffer.len;
                        // an embedded NUL is invalid input.
                        if (self.index == self.buffer.len) return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                        state = .invalid;
                    },
                    ' ', '\n', '\t', '\r' => {
                        // Skip whitespace; the token starts after it.
                        result.loc.start = self.index + 1;
                    },
                    '\'' => {
                        state = .string_literal;
                        result.tag = .string_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .identifier;
                        result.tag = .identifier;
                    },
                    '=' => {
                        state = .equal;
                    },
                    '!' => {
                        state = .bang;
                    },
                    '|' => {
                        result.tag = .pipe;
                        self.index += 1;
                        break;
                    },
                    '(' => {
                        result.tag = .l_paren;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.tag = .r_paren;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        result.tag = .l_bracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.tag = .r_bracket;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.tag = .semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.tag = .comma;
                        self.index += 1;
                        break;
                    },
                    '<' => {
                        state = .angle_bracket_left;
                    },
                    '>' => {
                        state = .angle_bracket_right;
                    },
                    '{' => {
                        result.tag = .l_brace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.tag = .r_brace;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        result.tag = .period;
                        self.index += 1;
                        break;
                    },
                    '0'...'9' => {
                        state = .int;
                        result.tag = .number_literal;
                    },
                    else => {
                        state = .invalid;
                    },
                },
                .invalid => switch (c) {
                    // Recover instead of panicking: swallow bytes until end
                    // of line or end of input, then report the whole run as
                    // one `.invalid` token (same strategy as the upstream
                    // std.zig tokenizer this file is derived from).
                    0 => {
                        if (self.index == self.buffer.len) {
                            result.tag = .invalid;
                            break;
                        }
                        continue;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => continue,
                },
                .identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
                    else => {
                        // A completed identifier may actually be a keyword.
                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
                            result.tag = tag;
                        }
                        break;
                    },
                },
                .string_literal => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            state = .invalid;
                            continue;
                        }
                        // Unterminated string at end of input.
                        result.tag = .invalid;
                        break;
                    },
                    '\n' => {
                        // Strings may not span lines.
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .string_literal_backslash;
                    },
                    '\'' => {
                        self.index += 1;
                        break;
                    },
                    // Raw control characters are not allowed inside strings.
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },
                .string_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => {
                        // Any escaped character resumes normal string scanning.
                        state = .string_literal;
                    },
                },
                .bang => switch (c) {
                    '=' => {
                        result.tag = .bang_equal;
                        self.index += 1;
                        break;
                    },
                    //TODO Add the !IN
                    else => {
                        result.tag = .bang;
                        break;
                    },
                },
                .equal => switch (c) {
                    '>' => {
                        result.tag = .equal_angle_bracket_right;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .equal;
                        break;
                    },
                },
                .angle_bracket_left => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_left_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_left;
                        break;
                    },
                },
                .angle_bracket_right => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_right_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_right;
                        break;
                    },
                },
                .int => switch (c) {
                    '.' => state = .int_period,
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
                    'e', 'E', 'p', 'P' => state = .int_exponent,
                    else => break,
                },
                .int_exponent => switch (c) {
                    '-', '+' => {
                        state = .float;
                    },
                    else => {
                        // Not a signed exponent: re-scan this char as part of the int.
                        self.index -= 1;
                        state = .int;
                    },
                },
                .int_period => switch (c) {
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => {
                        state = .float;
                    },
                    'e', 'E', 'p', 'P' => state = .float_exponent,
                    else => {
                        // Trailing '.' belongs to the next token (e.g. method access).
                        self.index -= 1;
                        break;
                    },
                },
                .float => switch (c) {
                    '_', 'a'...'d', 'f'...'o', 'q'...'z', 'A'...'D', 'F'...'O', 'Q'...'Z', '0'...'9' => continue,
                    'e', 'E', 'p', 'P' => state = .float_exponent,
                    else => break,
                },
                .float_exponent => switch (c) {
                    '-', '+' => state = .float,
                    else => {
                        // Not a signed exponent: re-scan this char in float state.
                        self.index -= 1;
                        state = .float;
                    },
                },
            }
        }
        result.loc.end = self.index;
        return result;
    }
};
test "keywords" {
try testTokenize("GRAB UPDATE ADD DELETE IN", &.{ .keyword_grab, .keyword_update, .keyword_add, .keyword_delete, .keyword_in });
std.debug.print("ZiQL keywords OK\n", .{});
}
test "basic query" {
try testTokenize("GRAB User {}", &.{ .keyword_grab, .identifier, .l_brace, .r_brace });
try testTokenize("GRAB User { name = 'Adrien'}", &.{ .keyword_grab, .identifier, .l_brace, .identifier, .equal, .string_literal, .r_brace });
try testTokenize("GRAB User [1; name] {}", &.{ .keyword_grab, .identifier, .l_bracket, .number_literal, .semicolon, .identifier, .r_bracket, .l_brace, .r_brace });
try testTokenize("GRAB User{}|ASCENDING name|", &.{ .keyword_grab, .identifier, .l_brace, .r_brace, .pipe, .identifier, .identifier, .pipe });
try testTokenize("DELETE User[1]{name='Adrien'}|ASCENDING name, age|", &.{ .keyword_delete, .identifier, .l_bracket, .number_literal, .r_bracket, .l_brace, .identifier, .equal, .string_literal, .r_brace, .pipe, .identifier, .identifier, .comma, .identifier, .pipe });
std.debug.print("ZiQL query OK\n", .{});
}
/// Tokenizes `source` and asserts the produced tag sequence matches
/// `expected_token_tags`, then checks the stream terminates with an `.eof`
/// token whose empty location sits exactly at `source.len`.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_token_tags) |expected| {
        try std.testing.expectEqual(expected, tokenizer.next().tag);
    }
    // Last token should always be eof, even when the last token was invalid,
    // in which case the tokenizer is in an invalid state, which can only be
    // recovered by opinionated means outside the scope of this implementation.
    const last = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last.tag);
    try std.testing.expectEqual(source.len, last.loc.start);
    try std.testing.expectEqual(source.len, last.loc.end);
}