const std = @import("std");

const Loc = @import("shared/loc.zig").Loc;
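// `Loc` is defined elsewhere; judging from `getTokenSlice` below, it is assumed
// to carry the token's byte offsets as `start: usize` and `end: usize`.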

pub const Token = struct {
    tag: Tag,
    loc: Loc,

    pub const Tag = enum {
        string_literal,
        int_literal,
        float_literal,
        uuid_literal,
        date_literal,
        time_literal,
        datetime_literal,
        l_bracket, // [
        r_bracket, // ]
    };
};

pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,

    // Maybe change this to read from the stream directly so the line isn't read twice.
    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present; some editors and OSes prepend one.
        return .{
            .buffer = buffer,
            .index = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0,
        };
    }

    const State = enum {
        start,
        string_literal,
        float,
        int,
        uuid_literal,
        date_literal,
        time_literal,
    };
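
    // How `next()` uses these states: every token starts in `.start`; a digit
    // run stays `.int` until a '.', '/', or ':' retags it as a float, date, or
    // time literal, while a lowercase letter or '-' turns it into a uuid.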

    pub fn getTokenSlice(self: *Tokenizer, token: Token) []const u8 {
        return self.buffer[token.loc.start..token.loc.end];
    }

    pub fn next(self: *Tokenizer) Token {
        // Ugly, but it works: skip at most one leading delimiter.
        if (self.buffer[self.index] == ';') self.index += 1; // Hardcoded delimiter
        if (self.buffer[self.index] == ' ') self.index += 1; // Hardcoded delimiter

        var state: State = .start;
        var result: Token = .{
            .tag = undefined,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        while (true) : (self.index += 1) {
            const c = self.buffer[self.index];

            if (self.index == self.buffer.len) break;

            switch (state) {
                .start => switch (c) {
                    '\'' => {
                        state = .string_literal;
                        result.tag = .string_literal;
                    },
                    'a'...'z' => {
                        state = .uuid_literal;
                        result.tag = .uuid_literal;
                    },
                    '0'...'9', '-' => {
                        state = .int;
                        result.tag = .int_literal;
                    },
                    '[' => {
                        result.tag = .l_bracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.tag = .r_bracket;
                        self.index += 1;
                        break;
                    },
                    else => std.debug.print("Unknown character: {c}\n", .{c}),
                },

                .string_literal => switch (c) {
                    '\'' => {
                        self.index += 1;
                        break;
                    },
                    else => continue,
                },

                .int => switch (c) {
                    '.' => {
                        state = .float;
                        result.tag = .float_literal;
                    },
                    'a'...'z', '-' => {
                        state = .uuid_literal;
                        result.tag = .uuid_literal;
                    },
                    '/' => {
                        state = .date_literal;
                        result.tag = .date_literal;
                    },
                    ':' => {
                        state = .time_literal;
                        result.tag = .time_literal;
                    },
                    '_', '0'...'9' => continue,
                    else => break,
                },

                .float => switch (c) {
                    '0'...'9' => continue,
                    else => break,
                },

                .date_literal => switch (c) {
                    '-' => {
                        // A '-' after the date starts the time part of a datetime.
                        state = .time_literal;
                        result.tag = .datetime_literal;
                    },
                    '0'...'9', '/' => continue,
                    else => break,
                },

                .time_literal => switch (c) {
                    '0'...'9', ':', '.' => continue,
                    else => break,
                },

                .uuid_literal => switch (c) {
                    '0'...'9', 'a'...'z', '-' => continue,
                    else => break,
                },
            }
        }

        result.loc.end = self.index;
        return result;
    }
};
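
// Usage sketch (not part of the original file): how a caller might drive the
// tokenizer. Note that at end of input `next()` returns a zero-length token
// (`loc.start == loc.end`) whose `tag` is left undefined, so callers should
// stop on an empty slice rather than inspect the tag.
test "Usage sketch" {
    var tokenizer = Tokenizer.init("42;3.14");
    const first = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.int_literal, first.tag);
    try std.testing.expectEqualStrings("42", tokenizer.getTokenSlice(first));
    const second = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.float_literal, second.tag);
    try std.testing.expectEqualStrings("3.14", tokenizer.getTokenSlice(second));
}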

test "Basics" {
    try testTokenize("193 88.92 [ 123] 'hello mommy'", &.{ .int_literal, .float_literal, .l_bracket, .int_literal, .r_bracket, .string_literal });
}
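
// Illustrative addition (not among the original tests): exercises the uuid,
// date, time, and datetime branches with formats the state machine accepts.
test "Temporal and uuid literals" {
    try testTokenize("2024/01/15 12:30:59.5 2024/01/15-12:30 ab12-cd34", &.{ .date_literal, .time_literal, .datetime_literal, .uuid_literal });
}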

fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_token_tags) |expected_token_tag| {
        const token = tokenizer.next();
        try std.testing.expectEqual(expected_token_tag, token.tag);
    }
}
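
// Illustrative addition (not among the original tests): `init` skips a leading
// UTF-8 BOM, so the first token should start at byte offset 3.
test "BOM is skipped" {
    var tokenizer = Tokenizer.init("\xEF\xBB\xBF193");
    const token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.int_literal, token.tag);
    try std.testing.expectEqualStrings("193", tokenizer.getTokenSlice(token));
}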