169 lines
5.0 KiB
Zig

const std = @import("std");
const Loc = @import("shared/loc.zig").Loc;
/// A single lexical unit produced by `Tokenizer.next`.
pub const Token = struct {
/// Kind of literal or punctuation this token represents.
tag: Tag,
/// Span of the token inside the tokenizer's buffer; `Tokenizer.getTokenSlice`
/// reads it as the half-open range `loc.start..loc.end`.
loc: Loc,
/// Every token kind the tokenizer can emit.
pub const Tag = enum {
/// Text opened by a single quote; the span includes the quote characters.
string_literal,
/// Digits (and `_`), optionally starting with `-`.
int_literal,
/// An integer followed by `.` and more digits.
float_literal,
/// Starts with a lowercase letter, or is an integer that runs into a
/// lowercase letter or `-`; made of `0-9`, `a-z` and `-`.
uuid_literal,
/// An integer that runs into `/`; made of digits and `/`.
date_literal,
/// An integer that runs into `:`; made of digits, `:` and `.`.
time_literal,
/// A date followed by `-` and then a time.
datetime_literal,
l_bracket, // [
r_bracket, // ]
};
};
/// Splits a sentinel-terminated buffer into `Token`s, one per call to `next`.
///
/// Tokens are separated by the hardcoded delimiters `;` and ` `.
///
/// There is no end-of-input tag: calling `next` after the buffer has been
/// consumed returns a token whose `tag` is `undefined`, so callers must know
/// how many tokens to ask for.
pub const Tokenizer = struct {
    /// Text being tokenized.
    buffer: [:0]const u8,
    /// Offset into `buffer` of the next byte to examine.
    index: usize,

    // TODO: maybe tokenize straight from the stream so the line does not have
    // to be read twice.

    /// Creates a tokenizer positioned at the start of `buffer`, or just past a
    /// leading UTF-8 byte order mark when one is present.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        return .{
            .buffer = buffer,
            // Skip the UTF-8 BOM if present.
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    /// Internal state of the scanner while it is inside `next`.
    const State = enum {
        start,
        string_literal,
        float,
        int,
        uuid_literal,
        date_literal,
        time_literal,
    };

    /// Returns the bytes of `buffer` covered by `token.loc`.
    /// The result borrows from `buffer`; nothing is copied.
    pub fn getTokenSlice(self: *const Tokenizer, token: Token) []const u8 {
        return self.buffer[token.loc.start..token.loc.end];
    }

    /// Scans and returns the next token, advancing `index` past it.
    ///
    /// Any run of leading `;` and ` ` delimiters is skipped first. A byte that
    /// cannot start a token is reported with `std.debug.print` and scanning
    /// continues; that byte stays inside the returned token's span. If the end
    /// of the buffer is reached before any token starts, the returned token's
    /// `tag` is `undefined`.
    pub fn next(self: *Tokenizer) Token {
        // Skip the whole run of delimiters, in any order, so inputs such as
        // "1  2" or "1 ;2" do not leak delimiters into the token.
        while (self.index < self.buffer.len) : (self.index += 1) {
            switch (self.buffer[self.index]) {
                ';', ' ' => {},
                else => break,
            }
        }

        var state: State = .start;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };

        // The end-of-input test is the loop condition, so a byte is only read
        // once it is known to be inside the buffer.
        while (self.index < self.buffer.len) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    '\'' => {
                        state = .string_literal;
                        result.tag = .string_literal;
                    },
                    'a'...'z' => {
                        state = .uuid_literal;
                        result.tag = .uuid_literal;
                    },
                    '0'...'9', '-' => {
                        state = .int;
                        result.tag = .int_literal;
                    },
                    '[' => {
                        result.tag = .l_bracket;
                        // `break` skips the loop's increment, so step past
                        // the bracket by hand.
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.tag = .r_bracket;
                        self.index += 1;
                        break;
                    },
                    else => std.debug.print("Unknow character: {c}\n", .{c}),
                },
                .string_literal => switch (c) {
                    '\'' => {
                        // Include the closing quote in the token.
                        self.index += 1;
                        break;
                    },
                    else => continue,
                },
                // Every other literal starts out as an int and is reclassified
                // by the first byte that does not belong to an integer.
                .int => switch (c) {
                    '.' => {
                        state = .float;
                        result.tag = .float_literal;
                    },
                    'a'...'z', '-' => {
                        state = .uuid_literal;
                        result.tag = .uuid_literal;
                    },
                    '/' => {
                        state = .date_literal;
                        result.tag = .date_literal;
                    },
                    ':' => {
                        state = .time_literal;
                        result.tag = .time_literal;
                    },
                    '_', '0'...'9' => continue,
                    else => break,
                },
                .float => switch (c) {
                    '0'...'9' => continue,
                    else => break,
                },
                .date_literal => switch (c) {
                    // A '-' after a date introduces the time part.
                    '-' => {
                        state = .time_literal;
                        result.tag = .datetime_literal;
                    },
                    '0'...'9', '/' => continue,
                    else => break,
                },
                .time_literal => switch (c) {
                    '0'...'9', ':', '.' => continue,
                    else => break,
                },
                .uuid_literal => switch (c) {
                    '0'...'9', 'a'...'z', '-' => continue,
                    else => break,
                },
            }
        }

        result.loc.end = self.index;
        return result;
    }
};
test "Basics" {
    // Every token in the input is checked, including the trailing string
    // literal.
    try testTokenize("193 88.92 [ 123] 'hello mommy'", &.{
        .int_literal,
        .float_literal,
        .l_bracket,
        .int_literal,
        .r_bracket,
        .string_literal,
    });

    // One literal for each of the remaining tags.
    try testTokenize("ad854bd5-8e3f-4b1c-9d2a-6f0e1c2b3a4d 2024/01/01 12:30:00.0000 2024/01/01-12:30:00.0000", &.{
        .uuid_literal,
        .date_literal,
        .time_literal,
        .datetime_literal,
    });
}
/// Tokenizes `source` and checks that the tags come out in the order given by
/// `expected_token_tags`. Only as many tokens as there are expected tags are
/// requested; the first mismatch fails the test.
fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var lexer = Tokenizer.init(source);
    var position: usize = 0;
    while (position < expected_token_tags.len) : (position += 1) {
        const actual_tag = lexer.next().tag;
        try std.testing.expectEqual(expected_token_tags[position], actual_tag);
    }
}