// From https://github.com/ziglang/zig/blob/master/lib/std/zig/tokenizer.zig

const std = @import("std");
const Loc = @import("../dataStructure/loc.zig");

pub const Token = struct {
    tag: Tag,
    loc: Loc,

    pub const keywords = std.StaticStringMap(Tag).initComptime(.{
        .{ "GRAB", .keyword_grab },
        .{ "UPDATE", .keyword_update },
        .{ "DELETE", .keyword_delete },
        .{ "ADD", .keyword_add },
        .{ "IN", .keyword_in },
        .{ "AND", .keyword_and },
        .{ "OR", .keyword_or },
        .{ "TO", .keyword_to },
        .{ "NONE", .keyword_none },
        .{ "NOW", .keyword_now },
        .{ "APPEND", .keyword_append },
        .{ "POP", .keyword_pop },
        .{ "REMOVE", .keyword_remove },
        .{ "REMOVEAT", .keyword_remove_at },
        .{ "grab", .keyword_grab },
        .{ "update", .keyword_update },
        .{ "delete", .keyword_delete },
        .{ "add", .keyword_add },
        .{ "in", .keyword_in },
        .{ "and", .keyword_and },
        .{ "or", .keyword_or },
        .{ "to", .keyword_to },
        .{ "none", .keyword_none },
        .{ "true", .bool_literal_true },
        .{ "false", .bool_literal_false },
        .{ "now", .keyword_now },
    });

    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }

    pub const Tag = enum {
        eof,
        invalid,

        keyword_grab,
        keyword_update,
        keyword_delete,
        keyword_add,
        keyword_in,
        keyword_not_in,
        keyword_and,
        keyword_or,
        keyword_to,
        keyword_none,
        keyword_now,
        keyword_append,
        keyword_pop,
        keyword_remove,
        keyword_remove_at,

        string_literal,
        int_literal,
        float_literal,
        date_literal,
        time_literal,
        datetime_literal,
        bool_literal_true,
        bool_literal_false,
        uuid_literal,
        identifier,

        equal,
        bang, // !
        pipe, // |
        l_paren, // (
        r_paren, // )
        l_bracket, // [
        r_bracket, // ]
        l_brace, // {
        r_brace, // }
        semicolon, // ;
        comma, // ,
        angle_bracket_left, // <
        angle_bracket_right, // >
        angle_bracket_left_equal, // <=
        angle_bracket_right_equal, // >=
        equal_angle_bracket_right, // =>
        period, // .
        bang_equal, // !=
    };
};

pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,
    last_token: Token = undefined,

    pub fn getTokenSlice(self: *Tokenizer, token: Token) []const u8 {
        return self.buffer[token.loc.start..token.loc.end];
    }

    pub fn last(self: Tokenizer) Token {
        return self.last_token;
    }

    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present.
        return .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    const State = enum {
        start,
        invalid,
        string_literal,
        date_literal,
        time_literal,
        uuid_literal,
        identifier,
        equal,
        bang,
        angle_bracket_left,
        angle_bracket_right,
        string_literal_backslash,
        float,
        int,
    };

    pub fn next(self: *Tokenizer) Token {
        var state: State = .start;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        while (true) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    0 => {
                        if (self.index == self.buffer.len) return .{
                            .tag = .eof,
                            .loc = .{
                                .start = self.index,
                                .end = self.index,
                            },
                        };
                        state = .invalid;
                    },
                    ' ', '\n', '\t', '\r' => {
                        result.loc.start = self.index + 1;
                    },
                    '\'' => {
                        state = .string_literal;
                        result.tag = .string_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .identifier;
                        result.tag = .identifier;
                    },
                    '=' => {
                        state = .equal;
                    },
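                    // '!' cannot be resolved from one byte: it may begin '!=',
                    // the '!IN' keyword, or stand alone, so defer to the .bang state.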
                    '!' => {
                        state = .bang;
                    },
                    '|' => {
                        result.tag = .pipe;
                        self.index += 1;
                        break;
                    },
                    '(' => {
                        result.tag = .l_paren;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.tag = .r_paren;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        result.tag = .l_bracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.tag = .r_bracket;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.tag = .semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.tag = .comma;
                        self.index += 1;
                        break;
                    },
                    '<' => {
                        state = .angle_bracket_left;
                    },
                    '>' => {
                        state = .angle_bracket_right;
                    },
                    '{' => {
                        result.tag = .l_brace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.tag = .r_brace;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = .float;
                        result.tag = .float_literal;
                    },
                    '0'...'9', '-' => {
                        state = .int;
                        result.tag = .int_literal;
                    },
                    else => {
                        state = .invalid;
                    },
                },
                .invalid => {
                    // TODO: make a better invalid handler.
                    @panic("Unknown character!");
                },
                .identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => continue,
                    '-' => {
                        state = .uuid_literal;
                        result.tag = .uuid_literal;
                    },
                    else => {
                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
                            result.tag = tag;
                        } else {
                            result.tag = .identifier;
                        }
                        break;
                    },
                },
                .string_literal => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            state = .invalid;
                            continue;
                        }
                        result.tag = .invalid;
                        break;
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .string_literal_backslash;
                    },
                    '\'' => {
                        self.index += 1;
                        break;
                    },
                    0x01...0x09, 0x0b...0x1f, 0x7f => {
                        state = .invalid;
                    },
                    else => continue,
                },
                .string_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => {
                        state = .string_literal;
                    },
                },
                .bang => switch (c) {
                    '=' => {
                        result.tag = .bang_equal;
                        self.index += 1;
                        break;
                    },
                    'I' => {
                        if (self.buffer.len > self.index + 1 and self.buffer[self.index + 1] == 'N') {
                            result.tag = .keyword_not_in;
                            self.index += 2; // Skip 'I' and 'N'
                            break;
                        } else {
                            result.tag = .bang;
                            break;
                        }
                    },
                    else => {
                        result.tag = .bang;
                        break;
                    },
                },
                .equal => switch (c) {
                    '>' => {
                        result.tag = .equal_angle_bracket_right;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .equal;
                        break;
                    },
                },
                .angle_bracket_left => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_left_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_left;
                        break;
                    },
                },
                .angle_bracket_right => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_right_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_right;
                        break;
                    },
                },
                .int => switch (c) {
                    '.' => {
                        state = .float;
                        result.tag = .float_literal;
                    },
                    'a'...'z', '-' => {
                        state = .uuid_literal;
                        result.tag = .uuid_literal;
                    },
                    '/' => {
                        state = .date_literal;
                        result.tag = .date_literal;
                    },
                    ':' => {
                        state = .time_literal;
                        result.tag = .time_literal;
                    },
                    '_', '0'...'9' => continue,
                    else => break,
                },
                .float => switch (c) {
                    '_', '0'...'9' => {
                        continue;
                    },
                    else => {
                        break;
                    },
                },
                .date_literal => switch (c) {
                    '-' => {
                        state = .time_literal;
                        result.tag = .datetime_literal;
                    },
                    '0'...'9', '/' => continue,
                    else => break,
                },
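                // Reached from .int on ':' (a plain time) or from .date_literal on '-'
                // (the time half of a datetime); the tag was already set accordingly.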
                .time_literal => switch (c) {
                    '0'...'9', ':', '.' => continue,
                    else => break,
                },
                .uuid_literal => switch (c) {
                    '0'...'9', 'a'...'z', '-' => continue,
                    else => break,
                },
            }
        }

        result.loc.end = self.index;
        self.last_token = result;
        return result;
    }
};

test "keywords" {
    try testTokenize("GRAB UPDATE ADD DELETE IN", &.{ .keyword_grab, .keyword_update, .keyword_add, .keyword_delete, .keyword_in });
}

test "basic query" {
    try testTokenize("GRAB User {}", &.{ .keyword_grab, .identifier, .l_brace, .r_brace });
    try testTokenize("GRAB User { name = 'Adrien'}", &.{ .keyword_grab, .identifier, .l_brace, .identifier, .equal, .string_literal, .r_brace });
    try testTokenize("GRAB User { age = 1.5}", &.{ .keyword_grab, .identifier, .l_brace, .identifier, .equal, .float_literal, .r_brace });
    try testTokenize("GRAB User { admin = true}", &.{ .keyword_grab, .identifier, .l_brace, .identifier, .equal, .bool_literal_true, .r_brace });
    try testTokenize("GRAB User [1; name] {}", &.{ .keyword_grab, .identifier, .l_bracket, .int_literal, .semicolon, .identifier, .r_bracket, .l_brace, .r_brace });
    try testTokenize("GRAB User{}|ASCENDING name|", &.{ .keyword_grab, .identifier, .l_brace, .r_brace, .pipe, .identifier, .identifier, .pipe });
    try testTokenize("DELETE User[1]{name='Adrien'}|ASCENDING name, age|", &.{ .keyword_delete, .identifier, .l_bracket, .int_literal, .r_bracket, .l_brace, .identifier, .equal, .string_literal, .r_brace, .pipe, .identifier, .identifier, .comma, .identifier, .pipe });
}

test "basic date" {
    try testTokenize("1a5527af-88fb-48c1-8d5c-49c9b73c2379", &.{.uuid_literal});
    try testTokenize("1998/01/21", &.{.date_literal});
    try testTokenize("17:55:31.0000", &.{.time_literal});
    try testTokenize("1998/01/21-17:55:31.0000", &.{.datetime_literal});
}

test "not in keyword" {
    try testTokenize("!IN", &.{.keyword_not_in});
    try testTokenize("!IN(", &.{ .keyword_not_in, .l_paren });
    try testTokenize("!Ind", &.{ .bang, .identifier });
}

fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_token_tags) |expected_token_tag| {
        const token = tokenizer.next();
        try std.testing.expectEqual(expected_token_tag, token.tag);
    }
    // Last token should always be eof, even when the last token was invalid,
    // in which case the tokenizer is in an invalid state, which can only be
    // recovered by opinionated means outside the scope of this implementation.
    const last_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
    try std.testing.expectEqual(source.len, last_token.loc.end);
}
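
// A minimal usage sketch (not part of the original file): driving the
// tokenizer by hand until `.eof` and reading each token's text back through
// `getTokenSlice`. The query string below is a hypothetical example.
test "usage sketch: iterate and slice tokens" {
    var tokenizer = Tokenizer.init("GRAB User { name = 'Adrien' }");
    var count: usize = 0;
    while (true) {
        const token = tokenizer.next();
        if (token.tag == .eof) break;
        count += 1;
        // Every token's slice is a window into the original buffer.
        const slice = tokenizer.getTokenSlice(token);
        try std.testing.expect(slice.len > 0);
    }
    // Expected tokens: GRAB, User, {, name, =, 'Adrien', }
    try std.testing.expectEqual(@as(usize, 7), count);
}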