Parse and write at the same time

Created a new function to replace parseAndFindUUIDWithFilter.

Now it parses the file, evaluates each row with the filter and writes directly
to a buffer. No UUID list is returned anymore, so the files are not parsed a
second time. This should be a huge performance improvement.

There are still some bugs with additionalData though: I got name: "<UUID>" =(
This commit is contained in:
Adrien Bouvais 2024-11-01 20:17:45 +01:00
parent 1bcc4465c5
commit 5e1ac7a0d7
7 changed files with 202 additions and 186 deletions

1
.gitignore vendored
View File

@ -6,3 +6,4 @@ engine
engine.o
zig-out
TODO v0.2.md
*.py

View File

@ -39,7 +39,6 @@ pub fn parseBool(value_str: []const u8) bool {
return (value_str[0] != '0');
}
// TODO: Optimize all date parsing
pub fn parseDate(value_str: []const u8) DateTime {
const year: u16 = std.fmt.parseInt(u16, value_str[0..4], 10) catch 0;
const month: u16 = std.fmt.parseInt(u16, value_str[5..7], 10) catch 0;
@ -178,8 +177,7 @@ pub fn parseArrayStr(allocator: std.mem.Allocator, array_str: []const u8) std.Ar
_ = it.next(); // SSkip first token that is empty
while (it.next()) |x| {
if (std.mem.eql(u8, " ", x)) continue;
const x_copy = std.fmt.allocPrint(allocator, "'{s}'", .{x}) catch @panic("=(");
array.append(x_copy) catch {};
array.append(x) catch {};
}
if (array.items.len > 0) allocator.free(array.pop()); // Remove the last because empty like the first one

View File

@ -44,6 +44,19 @@ pub const UUID = struct {
return buf;
}
/// Render 16 raw bytes as a 36-character UUID string (8-4-4-4-12 layout).
/// Each byte becomes two characters looked up in `hex`, placed at the offset
/// given by `encoded_pos`; both tables are declared elsewhere in this file
/// (presumably `hex` is the hexadecimal digit table - confirm).
pub fn format_bytes(bytes: [16]u8) [36]u8 {
    var out: [36]u8 = undefined;

    // Fixed hyphen positions of the textual form.
    for ([_]usize{ 8, 13, 18, 23 }) |dash_pos| out[dash_pos] = '-';

    inline for (encoded_pos, 0..) |text_pos, byte_idx| {
        const byte = bytes[byte_idx];
        out[text_pos] = hex[byte >> 4]; // high nibble
        out[text_pos + 1] = hex[byte & 0x0f]; // low nibble
    }
    return out;
}
// Indices in the UUID string representation for each byte.
const encoded_pos = [16]u8{ 0, 2, 4, 6, 9, 11, 14, 16, 19, 21, 24, 26, 28, 30, 32, 34 };

View File

@ -14,6 +14,7 @@ const FileToken = @import("tokenizers/file.zig").Token;
const SchemaTokenizer = @import("tokenizers/schema.zig").Tokenizer;
const SchemaToken = @import("tokenizers/schema.zig").Token;
const AdditionalData = @import("stuffs/additionalData.zig").AdditionalData;
const Filter = @import("stuffs/filter.zig").Filter;
const Loc = @import("tokenizers/shared/loc.zig").Loc;
const Condition = @import("stuffs/filter.zig").Condition;
@ -253,165 +254,148 @@ pub const FileEngine = struct {
}
}
/// Scan every `{d}.csv` data file of `struct_name` (index 0 up to `maxFileIndex`) and, for each
/// line whose UUID is present in `uuids`, append a JSON-like object to `buffer`.
/// Only members accepted by `additional_data` are written (all of them when `member_to_find` is empty).
/// The output is `[` + one `{id:"...", name: value, ..., }, ` per entity + `]`; note that keys are
/// not quoted and trailing separators are left in place.
/// TODO: Optimize
/// FIXME: Array of string are not working
pub fn parseAndWriteToSend(self: *FileEngine, struct_name: []const u8, uuids: []UUID, buffer: *std.ArrayList(u8), additional_data: AdditionalData) FileEngineError!void {
const max_file_index = try self.maxFileIndex(struct_name);
var current_index: usize = 0;
// Path of the file currently being read; re-allocated each time the loop moves to the next file.
var path_buff = std.fmt.allocPrint(
self.allocator,
"{s}/DATA/{s}/{d}.csv",
.{ self.path_to_ZipponDB_dir, struct_name, current_index },
) catch return FileEngineError.MemoryError;
defer self.allocator.free(path_buff);
var file = std.fs.cwd().openFile(path_buff, .{}) catch return FileEngineError.CantOpenFile;
defer file.close();
// Fixed-size scratch buffer that receives one line of the file at a time.
var output: [BUFFER_SIZE]u8 = undefined;
var output_fbs = std.io.fixedBufferStream(&output);
const writer = output_fbs.writer();
var buffered = std.io.bufferedReader(file.reader());
var reader = buffered.reader();
var founded = false;
var token: FileToken = undefined;
var out_writer = buffer.writer();
out_writer.writeAll("[") catch return FileEngineError.WriteError;
// One iteration per line; the EndOfStream branch switches to the next file or ends the loop.
while (true) {
output_fbs.reset();
reader.streamUntilDelimiter(writer, '\n', null) catch |err| switch (err) {
error.EndOfStream => {
// When end of file, check if all file was parse, if not update the reader to the next file
// TODO: Be able to give an array of file index from the B+Tree to only parse them
output_fbs.reset(); // clear buffer before exit
if (current_index == max_file_index) break;
current_index += 1;
self.allocator.free(path_buff);
path_buff = std.fmt.allocPrint(
self.allocator,
"{s}/DATA/{s}/{d}.csv",
.{ self.path_to_ZipponDB_dir, struct_name, current_index },
) catch @panic("Can't create sub_path for init a DataIterator");
// Close the exhausted file before reusing `file`; the `defer` above then closes whichever file is open last.
file.close(); // Do I need to close ? I think so
file = std.fs.cwd().openFile(path_buff, .{}) catch {
log.err("Error trying to open {s}\n", .{path_buff});
@panic("Can't open file to update a data iterator");
};
buffered = std.io.bufferedReader(file.reader());
reader = buffered.reader();
continue;
}, // file read till the end
else => return FileEngineError.StreamError,
};
// The first 36 bytes of a line are the UUID text; the member values are tokenized from offset 37 on.
// NOTE(review): a line shorter than 37 bytes would make these slices go out of bounds - confirm lines are always well formed.
const null_terminated_string = self.allocator.dupeZ(u8, output_fbs.getWritten()[37..]) catch return FileEngineError.MemoryError;
defer self.allocator.free(null_terminated_string);
var data_toker = FileTokenizer.init(null_terminated_string);
const uuid = UUID.parse(output_fbs.getWritten()[0..36]) catch return FileEngineError.InvalidUUID;
founded = false;
// Linear search of the requested UUIDs for every line.
// Optimize this
for (uuids) |elem| {
if (elem.compare(uuid)) {
founded = true;
break;
}
}
if (!founded) continue;
// Maybe do a JSON writer wrapper
out_writer.writeAll("{") catch return FileEngineError.WriteError;
out_writer.writeAll("id:\"") catch return FileEngineError.WriteError;
out_writer.print("{s}", .{output_fbs.getWritten()[0..36]}) catch return FileEngineError.WriteError;
out_writer.writeAll("\", ") catch return FileEngineError.WriteError;
for (try self.structName2structMembers(struct_name), try self.structName2DataType(struct_name)) |member_name, member_type| {
// A token is consumed for every member, before the skip check below, so the tokenizer advances even for members that are not written.
token = data_toker.next();
// FIXME: When relationship will be implemented, need to check if the len of NON link is 0
if (!(additional_data.member_to_find.items.len == 0 or additional_data.contains(member_name))) continue;
// Write the member name followed by ": "
out_writer.print("{s}: ", .{member_name}) catch return FileEngineError.WriteError;
switch (member_type) {
.str => {
// Drop the first and last character of the token (presumably the surrounding quotes - confirm) and re-quote with ".
const str_slice = data_toker.getTokenSlice(token);
out_writer.print("\"{s}\"", .{str_slice[1 .. str_slice.len - 1]}) catch return FileEngineError.WriteError;
},
.date, .time, .datetime => {
const str_slice = data_toker.getTokenSlice(token);
out_writer.print("\"{s}\"", .{str_slice}) catch return FileEngineError.WriteError;
},
.str_array => {
// Write the opening token as-is, then every element until the `.r_bracket` token, which is written as-is too.
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
token = data_toker.next();
while (token.tag != .r_bracket) : (token = data_toker.next()) {
out_writer.writeAll("\"") catch return FileEngineError.WriteError;
out_writer.writeAll(data_toker.getTokenSlice(token)[1..(token.loc.end - token.loc.start)]) catch return FileEngineError.WriteError;
out_writer.writeAll("\", ") catch return FileEngineError.WriteError;
}
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
},
.date_array, .time_array, .datetime_array => {
// Same walk as for string arrays, but each element is written whole between double quotes.
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
token = data_toker.next();
while (token.tag != .r_bracket) : (token = data_toker.next()) {
out_writer.writeAll("\"") catch return FileEngineError.WriteError;
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
out_writer.writeAll("\", ") catch return FileEngineError.WriteError;
}
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
},
.int_array, .float_array, .bool_array => {
// Elements are written unquoted, each followed by ", ".
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
token = data_toker.next();
while (token.tag != .r_bracket) : (token = data_toker.next()) {
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
out_writer.writeAll(", ") catch return FileEngineError.WriteError;
}
out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError;
},
.link => out_writer.writeAll("false") catch return FileEngineError.WriteError, // TODO: Get and send data
.link_array => out_writer.writeAll("false") catch return FileEngineError.WriteError, // TODO: Get and send data
else => out_writer.writeAll(data_toker.getTokenSlice(token)) catch return FileEngineError.WriteError, // write the value as it is
}
out_writer.writeAll(", ") catch return FileEngineError.WriteError;
}
out_writer.writeAll("}") catch return FileEngineError.WriteError;
out_writer.writeAll(", ") catch return FileEngineError.WriteError;
}
out_writer.writeAll("]") catch return FileEngineError.WriteError;
}
/// Use a struct name to populate a list with all UUID of this struct
pub fn getAllUUIDList(self: *FileEngine, struct_name: []const u8, uuid_array: *std.ArrayList(UUID)) FileEngineError!void {
pub fn getAllUUIDList(self: *FileEngine, struct_name: []const u8, uuid_list: *std.ArrayList(UUID)) FileEngineError!void {
var sstruct = try self.structName2SchemaStruct(struct_name);
var iter = sstruct.uuid_file_index.keyIterator();
while (iter.next()) |key| {
uuid_array.append(UUID{ .bytes = key.* }) catch return FileEngineError.MemoryError;
uuid_list.append(UUID{ .bytes = key.* }) catch return FileEngineError.MemoryError;
}
}
/// Take a condition and an array of UUID and fill the array with all UUID that match the condition
/// TODO: Use the new filter and DataIterator
pub fn getUUIDListUsingCondition(_: *FileEngine, _: Condition, _: *std.ArrayList(UUID)) FileEngineError!void {
return;
/// Append to `uuid_list` the UUID of every `struct_name` entity for which
/// `filter.evaluate(row)` is true.
///
/// Every `{d}.zid` file from index 0 to `maxFileIndex` inside
/// `{path_to_ZipponDB_dir}/DATA/{struct}` is iterated with `zid.DataIterator`.
///
/// Errors: `MemoryError` when an allocation or the append fails, `CantOpenDir`
/// when the data directory cannot be opened, `ZipponDataError` when the
/// iterator cannot be created or advanced.
pub fn getUUIDListUsingFilter(self: *FileEngine, struct_name: []const u8, filter: Filter, uuid_list: *std.ArrayList(UUID)) FileEngineError!void {
    const sstruct = try self.structName2SchemaStruct(struct_name);
    const max_file_index = try self.maxFileIndex(sstruct.name);

    const dir_path = std.fmt.allocPrint(
        self.allocator,
        "{s}/DATA/{s}",
        .{ self.path_to_ZipponDB_dir, sstruct.name },
    ) catch return FileEngineError.MemoryError;
    defer self.allocator.free(dir_path);

    var dir = std.fs.cwd().openDir(dir_path, .{}) catch return FileEngineError.CantOpenDir;
    defer dir.close();

    for (0..(max_file_index + 1)) |i| {
        // Each file name has its own allocation and its own defer, so an
        // allocation failure can never free the same buffer twice.
        const file_name = std.fmt.allocPrint(self.allocator, "{d}.zid", .{i}) catch return FileEngineError.MemoryError;
        defer self.allocator.free(file_name);

        var iter = zid.DataIterator.init(self.allocator, file_name, dir, sstruct.zid_schema) catch return FileEngineError.ZipponDataError;
        defer iter.deinit();

        while (iter.next() catch return FileEngineError.ZipponDataError) |row| {
            // Keep only the rows that satisfy the filter.
            if (!filter.evaluate(row)) continue;
            // Column 0 of a row is the entity id, stored as a UUID value.
            uuid_list.append(UUID{ .bytes = row[0].UUID }) catch return FileEngineError.MemoryError;
        }
    }
}
/// Return true when `value` is equal to at least one element of `array`.
fn isIn(array: []usize, value: usize) bool {
    return std.mem.indexOfScalar(usize, array, value) != null;
}
/// Parse every data file of `struct_name` and write, in a JSON-like format,
/// each entity accepted by `filter` into `buffer`.
///
/// - `filter`: when null, every entity is written.
/// - `additional_data`: lists the members to write. When `member_to_find` is
///   empty it is populated with every member of the struct (this mutates the
///   caller's value). When `entity_count_to_find` is not 0, at most that many
///   entities are written in total, across all files.
///
/// Output shape: `[` + one `{name: value, ..., }, ` per entity + `]`.
///
/// Errors: `MemoryError`, `CantOpenDir`, `ZipponDataError`, `WriteError`, plus
/// whatever the struct/member lookup helpers return.
pub fn parseToSendUsingFilter(
    self: *FileEngine,
    struct_name: []const u8,
    filter: ?Filter,
    buffer: *std.ArrayList(u8),
    additional_data: *AdditionalData,
) FileEngineError!void {
    const sstruct = try self.structName2SchemaStruct(struct_name);
    const max_file_index = try self.maxFileIndex(sstruct.name);

    const dir_path = std.fmt.allocPrint(
        self.allocator,
        "{s}/DATA/{s}",
        .{ self.path_to_ZipponDB_dir, sstruct.name },
    ) catch return FileEngineError.MemoryError;
    defer self.allocator.free(dir_path);

    var dir = std.fs.cwd().openDir(dir_path, .{}) catch return FileEngineError.CantOpenDir;
    defer dir.close();

    // If there is no member to find, that means we need to return all members,
    // so populate additional data with all of them.
    if (additional_data.member_to_find.items.len == 0) {
        additional_data.populateWithEverything(self.allocator, sstruct.members) catch return FileEngineError.MemoryError;
    }

    const limit = additional_data.entity_count_to_find; // 0 means "no limit"
    var total_currently_found: usize = 0;

    const writer = buffer.writer();
    writer.writeAll("[") catch return FileEngineError.WriteError;

    files: for (0..(max_file_index + 1)) |file_index| { // TODO: Multi thread that
        // Per-file allocation with its own defer: a failed allocation can no
        // longer lead to a double free of a shared path buffer.
        const file_name = std.fmt.allocPrint(self.allocator, "{d}.zid", .{file_index}) catch return FileEngineError.MemoryError;
        defer self.allocator.free(file_name);

        var iter = zid.DataIterator.init(self.allocator, file_name, dir, sstruct.zid_schema) catch return FileEngineError.ZipponDataError;
        defer iter.deinit();

        while (iter.next() catch return FileEngineError.ZipponDataError) |row| {
            if (filter) |f| if (!f.evaluate(row)) continue;

            writer.writeByte('{') catch return FileEngineError.WriteError;
            for (additional_data.member_to_find.items) |member| {
                // Write the member name followed by ": "
                writer.print("{s}: ", .{member.name}) catch return FileEngineError.WriteError;

                switch (row[member.index]) {
                    .Int => |v| writer.print("{d}", .{v}) catch return FileEngineError.WriteError,
                    .Float => |v| writer.print("{d}", .{v}) catch return FileEngineError.WriteError,
                    .Str => |v| writer.print("\"{s}\"", .{v}) catch return FileEngineError.WriteError,
                    .UUID => |v| writer.print("\"{s}\"", .{UUID.format_bytes(v)}) catch return FileEngineError.WriteError,
                    .Bool => |v| writer.print("{any}", .{v}) catch return FileEngineError.WriteError,
                    .Unix => |v| {
                        // Unix values are shared by date, time and datetime members;
                        // the schema data type selects the textual format.
                        const datetime = DateTime.initUnix(v);
                        writer.writeByte('"') catch return FileEngineError.WriteError;
                        switch (try self.memberName2DataType(struct_name, member.name)) {
                            .date => datetime.format("YYYY/MM/DD", writer) catch return FileEngineError.WriteError,
                            .time => datetime.format("HH:mm:ss.SSSS", writer) catch return FileEngineError.WriteError,
                            .datetime => datetime.format("YYYY/MM/DD-HH:mm:ss.SSSS", writer) catch return FileEngineError.WriteError,
                            else => unreachable,
                        }
                        writer.writeByte('"') catch return FileEngineError.WriteError;
                    },
                    .IntArray, .FloatArray, .StrArray, .UUIDArray, .BoolArray => try writeArray(&row[member.index], writer, null),
                    .UnixArray => try writeArray(&row[member.index], writer, try self.memberName2DataType(struct_name, member.name)),
                }
                writer.writeAll(", ") catch return FileEngineError.WriteError;
            }
            writer.writeAll("}, ") catch return FileEngineError.WriteError;

            total_currently_found += 1;
            // Leave the file loop as well: breaking only the row loop would let
            // every following file contribute one more entity past the limit.
            if (limit != 0 and total_currently_found >= limit) break :files;
        }
    }

    writer.writeAll("]") catch return FileEngineError.WriteError;
}
/// Write the array held by `data` to `writer` as `[elem, elem, ]`.
///
/// Every element is followed by `", "`, for all element kinds, so consecutive
/// elements are always separated (the trailing separator is kept, like in the
/// entity output that calls this function).
///
/// - `data`: must be one of the array variants, anything else is `unreachable`.
/// - `datatype`: only read for `.UnixArray`, where it must be `.date`, `.time`
///   or `.datetime` and selects the textual format; pass null otherwise.
///
/// Errors: `WriteError` when the writer fails, `ZipponDataError` when the array
/// iterator cannot be created.
fn writeArray(data: *zid.Data, writer: anytype, datatype: ?DataType) FileEngineError!void {
    writer.writeByte('[') catch return FileEngineError.WriteError;
    var iter = zid.ArrayIterator.init(data) catch return FileEngineError.ZipponDataError;
    switch (data.*) {
        .IntArray => while (iter.next()) |v| writer.print("{d}, ", .{v.Int}) catch return FileEngineError.WriteError,
        .FloatArray => while (iter.next()) |v| writer.print("{d}, ", .{v.Float}) catch return FileEngineError.WriteError,
        .StrArray => while (iter.next()) |v| writer.print("\"{s}\", ", .{v.Str}) catch return FileEngineError.WriteError,
        .UUIDArray => while (iter.next()) |v| writer.print("\"{s}\", ", .{UUID.format_bytes(v.UUID)}) catch return FileEngineError.WriteError,
        .BoolArray => while (iter.next()) |v| writer.print("{any}, ", .{v.Bool}) catch return FileEngineError.WriteError,
        .UnixArray => {
            while (iter.next()) |v| {
                const datetime = DateTime.initUnix(v.Unix);
                writer.writeByte('"') catch return FileEngineError.WriteError;
                switch (datatype.?) {
                    .date => datetime.format("YYYY/MM/DD", writer) catch return FileEngineError.WriteError,
                    .time => datetime.format("HH:mm:ss.SSSS", writer) catch return FileEngineError.WriteError,
                    .datetime => datetime.format("YYYY/MM/DD-HH:mm:ss.SSSS", writer) catch return FileEngineError.WriteError,
                    else => unreachable,
                }
                writer.writeAll("\", ") catch return FileEngineError.WriteError;
            }
        },
        else => unreachable,
    }
    writer.writeByte(']') catch return FileEngineError.WriteError;
}
// --------------------Change existing files--------------------
@ -932,7 +916,7 @@ pub const FileEngine = struct {
}
pub fn memberName2DataIndex(self: *FileEngine, struct_name: []const u8, member_name: []const u8) FileEngineError!usize {
var i: usize = 0;
var i: usize = 1; // Start at 1 because there is the id
for (try self.structName2structMembers(struct_name)) |mn| {
if (std.mem.eql(u8, mn, member_name)) return i;

View File

@ -19,11 +19,11 @@ pub const AdditionalData = struct {
self.member_to_find.deinit();
}
pub fn contains(additional_data: AdditionalData, member_name: []const u8) bool {
for (additional_data.member_to_find.items) |elem| {
if (std.mem.eql(u8, member_name, elem.name)) return true;
pub fn populateWithEverything(self: *AdditionalData, allocator: Allocator, members: [][]const u8) !void {
try self.member_to_find.append(AdditionalDataMember.init(allocator, "id", 0));
for (members, 1..) |member, i| {
try self.member_to_find.append(AdditionalDataMember.init(allocator, member, i));
}
return false;
}
};
@ -31,10 +31,11 @@ pub const AdditionalData = struct {
// There is an additional data because it can be [friend [1; name]]
pub const AdditionalDataMember = struct {
name: []const u8,
index: usize, // Index place in the schema
additional_data: AdditionalData,
pub fn init(allocator: Allocator, name: []const u8) AdditionalDataMember {
pub fn init(allocator: Allocator, name: []const u8, index: usize) AdditionalDataMember {
const additional_data = AdditionalData.init(allocator);
return AdditionalDataMember{ .name = name, .additional_data = additional_data };
return AdditionalDataMember{ .name = name, .additional_data = additional_data, .index = index };
}
};

View File

@ -258,7 +258,7 @@ pub const Filter = struct {
.AND => self.evaluateNode(log.left, row) and self.evaluateNode(log.right, row),
.OR => self.evaluateNode(log.left, row) or self.evaluateNode(log.right, row),
},
.empty => unreachable, // FIXME: I think this is reachable. At least if this is the root node, so it return always true. Like in the query GRAB User {}
.empty => true,
};
}

View File

@ -78,24 +78,6 @@ pub const Parser = struct {
};
}
// TODO: Update to use ASC and DESC
// Maybe create a Sender struct or something like that
/// Serialize the entities of `struct_name` listed in `uuid_list` and send them.
/// When `additional_data.entity_count_to_find` is not 0 and smaller than the
/// list, the list is first truncated from the end down to that length.
/// Panics if the file engine fails to parse or write the data.
fn sendEntity(self: Parser, uuid_list: *std.ArrayList(UUID), additional_data: AdditionalData, struct_name: []const u8) void {
    var json_out = std.ArrayList(u8).init(self.allocator);
    defer json_out.deinit();

    // Drop trailing UUIDs until the requested entity count is reached.
    const limit = additional_data.entity_count_to_find;
    if (limit != 0) {
        while (uuid_list.items.len > limit) _ = uuid_list.pop();
    }

    // The file engine parses the data files and writes the result into the buffer.
    self.file_engine.parseAndWriteToSend(
        struct_name,
        uuid_list.items,
        &json_out,
        additional_data,
    ) catch @panic("Error parsing data to send");

    send("{s}", .{json_out.items});
}
/// Format a list of UUID into a json and send it
pub fn sendUUIDs(self: Parser, uuid_list: []UUID) ZiQlParserError!void {
var buffer = std.ArrayList(u8).init(self.allocator);
@ -226,18 +208,19 @@ pub const Parser = struct {
var filter = try self.parseFilter(struct_name, false);
defer filter.deinit();
var uuids = std.ArrayList(UUID).init(self.allocator);
defer uuids.deinit();
var buff = std.ArrayList(u8).init(self.allocator);
defer buff.deinit();
// TODO: self.sendEntity(&uuids, additional_data, struct_name);
try self.file_engine.parseToSendUsingFilter(struct_name, filter, &buff, &additional_data);
send("{s}", .{buff.items});
state = .end;
},
.eof => {
var uuids = std.ArrayList(UUID).init(self.allocator);
defer uuids.deinit();
try self.file_engine.getAllUUIDList(struct_name, &uuids);
var buff = std.ArrayList(u8).init(self.allocator);
defer buff.deinit();
self.sendEntity(&uuids, additional_data, struct_name);
try self.file_engine.parseToSendUsingFilter(struct_name, null, &buff, &additional_data);
send("{s}", .{buff.items});
state = .end;
},
else => return printError(
@ -796,6 +779,7 @@ pub const Parser = struct {
AdditionalDataMember.init(
self.allocator,
self.toker.getTokenSlice(token),
additional_data.member_to_find.items.len,
),
) catch return ZipponError.MemoryError;
@ -940,7 +924,10 @@ pub const Parser = struct {
}
}
member_map.put(member_name, self.toker.buffer[start_index..token.loc.end]) catch return ZipponError.MemoryError;
switch (data_type) {
.str => member_map.put(member_name, self.toker.buffer[start_index + 1 .. token.loc.end - 1]) catch return ZipponError.MemoryError,
else => member_map.put(member_name, self.toker.buffer[start_index..token.loc.end]) catch return ZipponError.MemoryError,
}
} else {
// Handle bool and bool array
switch (data_type) {
@ -1037,6 +1024,38 @@ test "ADD" {
try testParsing("ADD User (name = 'Bob', email='bob@email.com', age=-55, scores=[ 1 ], friends=[], bday=2000/01/01, a_time=12:04:54.8741, last_order=2000/01/01-12:45)");
}
// Each test below hands GRAB queries to `testParsing` (defined elsewhere in this file)
// and fails if that helper returns an error.

// Filters comparing a string member with `=` and `!=`.
test "GRAB filter with string" {
try testParsing("GRAB User {name = 'Bob'}");
try testParsing("GRAB User {name != 'Brittany Rogers'}");
}
// Additional data in brackets: an entity count, a member list, or both separated by `;`.
test "GRAB with additional data" {
try testParsing("GRAB User [1] {age < 18}");
try testParsing("GRAB User [name] {age < 18}");
try testParsing("GRAB User [100; name] {age < 18}");
}
// Every comparison operator on an int member, including a negative literal.
test "GRAB filter with int" {
try testParsing("GRAB User {age = 18}");
try testParsing("GRAB User {age > -18}");
try testParsing("GRAB User {age < 18}");
try testParsing("GRAB User {age <= 18}");
try testParsing("GRAB User {age >= 18}");
try testParsing("GRAB User {age != 18}");
}
// Filters using date, time and datetime literals.
test "GRAB filter with date" {
try testParsing("GRAB User {bday > 2000/01/01}");
try testParsing("GRAB User {a_time < 08:00}");
try testParsing("GRAB User {last_order > 2000/01/01-12:45}");
}
// Edge cases: no filter at all, an empty filter, and additional data without a filter.
test "Specific query" {
try testParsing("GRAB User");
try testParsing("GRAB User {}");
try testParsing("GRAB User [1]");
}
test "Synthax error" {
try expectParsingError("GRAB {}", ZiQlParserError.StructNotFound);
try expectParsingError("GRAB User {qwe = 'qwe'}", ZiQlParserError.MemberNotFound);