From 2a4842432d84005ffbcf2bca298db9a648ef6d56 Mon Sep 17 00:00:00 2001
From: MrBounty
Date: Tue, 7 Jan 2025 13:55:02 +0100
Subject: [PATCH] Speed up batch ADD and better benchmark

Now I flush only when the file is full, and I only check whether the currently
used file is big enough, so I don't stat all files and flush every time like
before.
---
 benchmark.zig      | 63 ++++++++++++++++++++++++++++++----------
 lib/types/out.zig  |  1 +
 lib/zid.zig        |  8 ++++--
 src/fileEngine.zig | 28 ++++++++++++------
 src/ziqlParser.zig | 71 +++++++++++++++++++++++++---------------------
 5 files changed, 113 insertions(+), 58 deletions(-)

diff --git a/benchmark.zig b/benchmark.zig
index a980234..5d0ef9f 100644
--- a/benchmark.zig
+++ b/benchmark.zig
@@ -8,6 +8,9 @@ const ZipponError = @import("src/stuffs/errors.zig").ZipponError;
 
 const names = [_][]const u8{ "Alice", "Bob", "Charlie", "Dave", "Eve" };
 const emails = [_][]const u8{ "alice@email.com", "bob@email.com", "charlie@email.com", "dave@email.com", "eve@email.com" };
+const dates = [_][]const u8{ "2000/01/01", "1954/04/02", "1998/01/21", "1977/12/31" };
+const times = [_][]const u8{ "12:04", "20:45:11", "03:11:13", "03:00:01.0152" };
+const datetimes = [_][]const u8{ "2000/01/01-12:04", "1954/04/02-20:45:11", "1998/01/21-03:11:13", "1977/12/31-03:00:01.0153" };
 const scores = [_]i32{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
 
 pub const std_options = .{
@@ -28,34 +31,63 @@ pub fn myLog(
 }
 
 pub fn main() !void {
-    const to_test = [_]usize{ 1, 10, 100, 1_000, 10_000, 100_000, 1_000_000 };
+    const to_test = [_]usize{50_000};
 
     var line_buffer: [1024 * 1024]u8 = undefined;
 
-    // Initialize your DBEngine here
     var db_engine = DBEngine.init("benchmark", "schema/example");
     defer db_engine.deinit();
 
     for (to_test) |users_count| {
         // Populate with random dummy value
+        // Needs some speed up; I spent time finding that it is parseConditionValue that takes the time, the last switch to be exact, the one that parses str to value
        {
            std.debug.print("\n=====================================\n\n", .{});
            std.debug.print("Populating with {d} users.\n", .{users_count});
 
-            var gpa = std.rand.DefaultPrng.init(0);
+            const allocator = std.heap.page_allocator;
+
+            var prng = std.rand.DefaultPrng.init(0);
+            const rng = prng.random();
 
             const populate_start_time = std.time.nanoTimestamp();
-            for (users_count) |_| {
-                const name = names[gpa.random().uintAtMost(usize, names.len - 1)];
-                const email = emails[gpa.random().uintAtMost(usize, emails.len - 1)];
-                const age = gpa.random().uintAtMost(usize, 100);
-                const score = scores[gpa.random().uintAtMost(usize, scores.len - 1)];
-                const null_term_query_str = try std.fmt.bufPrintZ(
-                    &line_buffer,
-                    "ADD User (name = '{s}', email='{s}', age={d}, scores=[ {d} ], best_friend=none, friends=none, bday=2000/01/01, a_time=12:04, last_order=2000/01/01-12:45)",
-                    .{ name, email, age, score },
+
+            var array = std.ArrayList(u8).init(allocator);
+            defer array.deinit();
+            var writer = array.writer();
+
+            try writer.print(
+                "ADD User (name = '{s}', email='{s}', age={d}, scores=[ {d} ], best_friend=none, friends=none, bday={s}, a_time={s}, last_order={s})",
+                .{
+                    names[rng.uintAtMost(usize, names.len - 1)],
+                    emails[rng.uintAtMost(usize, emails.len - 1)],
+                    rng.uintAtMost(usize, 100),
+                    scores[rng.uintAtMost(usize, scores.len - 1)],
+                    dates[rng.uintAtMost(usize, dates.len - 1)],
+                    times[rng.uintAtMost(usize, times.len - 1)],
+                    datetimes[rng.uintAtMost(usize, datetimes.len - 1)],
+                },
+            );
+
+            for (users_count - 1) |_| {
+                try writer.print(
+                    "('{s}', '{s}', {d}, [ {d} ], none, none, {s}, {s}, {s})",
+                    .{
+                        names[rng.uintAtMost(usize, names.len - 1)],
+                        emails[rng.uintAtMost(usize, emails.len - 1)],
+                        rng.uintAtMost(usize, 100),
+                        scores[rng.uintAtMost(usize, scores.len - 1)],
+                        dates[rng.uintAtMost(usize, dates.len - 1)],
+                        times[rng.uintAtMost(usize, times.len - 1)],
+                        datetimes[rng.uintAtMost(usize, datetimes.len - 1)],
+                    },
                 );
-                var toker = ziqlTokenizer.init(null_term_query_str);
-                var parser = ziqlParser.init(&toker, &db_engine.file_engine, &db_engine.schema_engine);
-                try parser.parse();
             }
+
+            const null_term_query_str = try std.fmt.allocPrintZ(allocator, "{s}", .{array.items});
+            defer allocator.free(null_term_query_str);
+
+            var toker = ziqlTokenizer.init(null_term_query_str);
+            var parser = ziqlParser.init(&toker, &db_engine.file_engine, &db_engine.schema_engine);
+            try parser.parse();
+
             const populate_end_time = std.time.nanoTimestamp();
             const populate_duration = @as(f64, @floatFromInt(populate_end_time - populate_start_time)) / 1e9;
@@ -76,6 +108,7 @@ pub fn main() !void {
         "GRAB User [name] {}",
         "GRAB User {name = 'Charlie'}",
         "GRAB User {age > 30}",
+        "GRAB User {bday > 2000/01/01}",
         "DELETE User {}",
     };
 
diff --git a/lib/types/out.zig b/lib/types/out.zig
index 97a8cef..4af6be7 100644
--- a/lib/types/out.zig
+++ b/lib/types/out.zig
@@ -1,6 +1,7 @@
 // This file is just to expose what I need to grab
 
 pub const UUID = @import("uuid.zig").UUID;
+pub const Zero = @import("uuid.zig").zero;
 pub const DateTime = @import("date.zig").DateTime;
 pub const OR = @import("uuid.zig").OR;
 pub const AND = @import("uuid.zig").AND;
diff --git a/lib/zid.zig b/lib/zid.zig
index bcfbeaa..7d055ee 100644
--- a/lib/zid.zig
+++ b/lib/zid.zig
@@ -509,7 +509,7 @@ pub const ArrayIterator = struct {
 /// Performance concern once again.
 pub const DataWriter = struct {
     file: std.fs.File,
-    writer: std.io.BufferedWriter(4096, std.fs.File.Writer),
+    writer: std.io.BufferedWriter(4096, std.fs.File.Writer), // TODO: Increase buffer size, this should speed up a bit
 
     pub fn init(name: []const u8, dir: ?std.fs.Dir) !DataWriter {
         const d_ = dir orelse std.fs.cwd();
@@ -533,6 +533,10 @@ pub const DataWriter = struct {
     pub fn flush(self: *DataWriter) !void {
         try self.writer.flush();
     }
+
+    pub fn fileStat(self: DataWriter) !std.fs.File.Stat {
+        return self.file.stat();
+    }
 };
 
 /// Create a new data file that can then be use by the DataWriter
@@ -557,7 +561,7 @@ pub fn statFile(name: []const u8, dir: ?std.fs.Dir) !std.fs.File.Stat {
 // I have almost more lines of test than the real stuff x)
 // But I think everything is tested to be fair, so good stuff
 // It also write benchmark so you can benchmark on your own hardware
-// The data write and read is not really representative of real worl tho
+// The data write and read is not really representative of real world tho
 
 test "Array Iterators" {
     const allocator = std.testing.allocator;
diff --git a/src/fileEngine.zig b/src/fileEngine.zig
index 3d38408..b78bed7 100644
--- a/src/fileEngine.zig
+++ b/src/fileEngine.zig
@@ -685,26 +685,36 @@ pub const FileEngine = struct {
     pub fn addEntity(
         self: *FileEngine,
         struct_name: []const u8,
-        map: std.StringHashMap(ConditionValue),
+        maps: []std.StringHashMap(ConditionValue),
         writer: anytype,
-        n: usize,
     ) ZipponError!void {
         var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
         defer arena.deinit();
         const allocator = arena.allocator();
 
-        const file_index = try self.getFirstUsableIndexFile(struct_name); // TODO: Speed up this
-
-        const path = std.fmt.bufPrint(&path_buffer, "{s}/DATA/{s}/{d}.zid", .{ self.path_to_ZipponDB_dir, struct_name, file_index }) catch return FileEngineError.MemoryError;
-        const data = try self.orderedNewData(allocator, struct_name, map);
+        var file_index = try self.getFirstUsableIndexFile(struct_name); // TODO: Speed up this
+        var path = std.fmt.bufPrint(&path_buffer, "{s}/DATA/{s}/{d}.zid", .{ self.path_to_ZipponDB_dir, struct_name, file_index }) catch return FileEngineError.MemoryError;
 
         var data_writer = zid.DataWriter.init(path, null) catch return FileEngineError.ZipponDataError;
         defer data_writer.deinit();
 
-        for (0..n) |_| data_writer.write(data) catch return FileEngineError.ZipponDataError;
-        data_writer.flush() catch return FileEngineError.ZipponDataError;
+        for (maps) |map| {
+            const data = try self.orderedNewData(allocator, struct_name, map);
+            data_writer.write(data) catch return FileEngineError.ZipponDataError;
+            writer.print("\"{s}\", ", .{UUID.format_bytes(data[0].UUID)}) catch return FileEngineError.WriteError;
 
-        writer.print("\"{s}\", ", .{UUID.format_bytes(data[0].UUID)}) catch return FileEngineError.WriteError;
+            const file_stat = data_writer.fileStat() catch return ZipponError.ZipponDataError;
+            if (file_stat.size > MAX_FILE_SIZE) {
+                file_index = try self.getFirstUsableIndexFile(struct_name);
+                data_writer.flush() catch return FileEngineError.ZipponDataError;
+                data_writer.deinit();
+
+                path = std.fmt.bufPrint(&path_buffer, "{s}/DATA/{s}/{d}.zid", .{ self.path_to_ZipponDB_dir, struct_name, file_index }) catch return FileEngineError.MemoryError;
+                data_writer = zid.DataWriter.init(path, null) catch return FileEngineError.ZipponDataError;
+            }
+        }
+
+        data_writer.flush() catch return FileEngineError.ZipponDataError;
     }
 
     pub fn updateEntities(
diff --git a/src/ziqlParser.zig b/src/ziqlParser.zig
index 8a2f0bf..935396d 100644
--- a/src/ziqlParser.zig
+++ b/src/ziqlParser.zig
@@ -309,7 +309,7 @@ pub const Parser = struct {
                     ),
                 },
 
-            // TODO: Be able to do it in batch
+            // TODO: Speed up batch by flushing only one time and by speeding up how we find which file to use
             .parse_new_data_and_add_data => {
                 var order = std.ArrayList([]const u8).init(allocator);
                 defer order.deinit();
@@ -319,9 +319,14 @@
                 defer buff.deinit();
                 buff.writer().writeAll("[") catch return ZipponError.WriteError;
 
-                while (true) {
-                    var data_map = std.StringHashMap(ConditionValue).init(allocator);
-                    defer data_map.deinit();
+                var maps = std.ArrayList(std.StringHashMap(ConditionValue)).init(allocator);
+                defer maps.deinit();
+
+                var data_map = std.StringHashMap(ConditionValue).init(allocator);
+                defer data_map.deinit();
+
+                while (true) { // I could multithread that, as it does take a long time for big benchmarks
+                    data_map.clearRetainingCapacity();
 
                     try self.parseNewData(allocator, &data_map, struct_name, &order, ordered);
                     ordered = true;
@@ -345,12 +350,15 @@
                         );
                     }
 
-                    token = self.toker.last_token;
-                    self.file_engine.addEntity(struct_name, data_map, &buff.writer(), 1) catch return ZipponError.CantWriteEntity;
+                    maps.append(data_map.clone() catch return ZipponError.MemoryError) catch return ZipponError.MemoryError;
 
+                    token = self.toker.last_token;
                     if (token.tag == .l_paren) continue;
                     break;
                 }
+
+                self.file_engine.addEntity(struct_name, maps.items, &buff.writer()) catch return ZipponError.CantWriteEntity;
+
                 buff.writer().writeAll("]") catch return ZipponError.WriteError;
                 send("{s}", .{buff.items});
                 state = .end;
@@ -779,7 +787,7 @@
     ) !void {
         var token = self.toker.next();
        var keep_next = false;
-        var member_name: []const u8 = undefined; // Maybe use allocator.alloc
+        var member_name: []const u8 = undefined;
 
        var state: State = .expect_member_OR_value;
        var i: usize = 0;
@@ -970,29 +978,30 @@
         }
 
         // And finally create the ConditionValue
-        var value: ConditionValue = undefined;
+        // FIXME: This takes the majority of the time when doing a big batch ADD. Needs serious speed up. I aim to be able to load a simple 10MB query in less than 0.1s
+        // Rn, for 100_000 users (around 10MB), it takes 30s... I mean come on, 30s? For 10MB? That sucks...
         switch (data_type) {
-            .int => value = ConditionValue.initInt(self.toker.buffer[start_index..token.loc.end]),
-            .float => value = ConditionValue.initFloat(self.toker.buffer[start_index..token.loc.end]),
-            .str => value = ConditionValue.initStr(self.toker.buffer[start_index + 1 .. token.loc.end - 1]),
-            .date => value = ConditionValue.initDate(self.toker.buffer[start_index..token.loc.end]),
-            .time => value = ConditionValue.initTime(self.toker.buffer[start_index..token.loc.end]),
-            .datetime => value = ConditionValue.initDateTime(self.toker.buffer[start_index..token.loc.end]),
-            .bool => value = ConditionValue.initBool(self.toker.buffer[start_index..token.loc.end]),
-            .int_array => value = try ConditionValue.initArrayInt(allocator, self.toker.buffer[start_index..token.loc.end]),
-            .str_array => value = try ConditionValue.initArrayStr(allocator, self.toker.buffer[start_index..token.loc.end]),
-            .bool_array => value = try ConditionValue.initArrayBool(allocator, self.toker.buffer[start_index..token.loc.end]),
-            .float_array => value = try ConditionValue.initArrayFloat(allocator, self.toker.buffer[start_index..token.loc.end]),
-            .date_array => value = try ConditionValue.initArrayDate(allocator, self.toker.buffer[start_index..token.loc.end]),
-            .time_array => value = try ConditionValue.initArrayTime(allocator, self.toker.buffer[start_index..token.loc.end]),
-            .datetime_array => value = try ConditionValue.initArrayDateTime(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .int => return ConditionValue.initInt(self.toker.buffer[start_index..token.loc.end]),
+            .float => return ConditionValue.initFloat(self.toker.buffer[start_index..token.loc.end]),
+            .str => return ConditionValue.initStr(self.toker.buffer[start_index + 1 .. token.loc.end - 1]),
+            .date => return ConditionValue.initDate(self.toker.buffer[start_index..token.loc.end]),
+            .time => return ConditionValue.initTime(self.toker.buffer[start_index..token.loc.end]),
+            .datetime => return ConditionValue.initDateTime(self.toker.buffer[start_index..token.loc.end]),
+            .bool => return ConditionValue.initBool(self.toker.buffer[start_index..token.loc.end]),
+            .int_array => return try ConditionValue.initArrayInt(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .str_array => return try ConditionValue.initArrayStr(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .bool_array => return try ConditionValue.initArrayBool(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .float_array => return try ConditionValue.initArrayFloat(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .date_array => return try ConditionValue.initArrayDate(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .time_array => return try ConditionValue.initArrayTime(allocator, self.toker.buffer[start_index..token.loc.end]),
+            .datetime_array => return try ConditionValue.initArrayDateTime(allocator, self.toker.buffer[start_index..token.loc.end]),
             .link => switch (token.tag) {
-                .keyword_none => {
+                .keyword_none => { // TODO: Stop creating a map if empty, can be null or something. Or maybe just keep one map like that in memory, so I don't create it every time
                     const map = allocator.create(std.AutoHashMap(UUID, void)) catch return ZipponError.MemoryError;
                     map.* = std.AutoHashMap(UUID, void).init(allocator);
-                    _ = map.getOrPut(UUID.parse("00000000-0000-0000-0000-000000000000") catch @panic("Sorry wot ?")) catch return ZipponError.MemoryError;
-                    value = ConditionValue.initLink(map);
+                    map.put(dtype.Zero, {}) catch return ZipponError.MemoryError;
                     _ = self.toker.next();
+                    return ConditionValue.initLink(map);
                 },
                 .uuid_literal => {
                     const uuid = UUID.parse(self.toker.buffer[start_index..token.loc.end]) catch return ZipponError.InvalidUUID;
@@ -1006,9 +1015,9 @@
 
                     const map = allocator.create(std.AutoHashMap(UUID, void)) catch return ZipponError.MemoryError;
                     map.* = std.AutoHashMap(UUID, void).init(allocator);
-                    _ = map.getOrPut(uuid) catch return ZipponError.MemoryError;
-                    value = ConditionValue.initLink(map);
+                    map.put(uuid, {}) catch return ZipponError.MemoryError;
                     _ = self.toker.next();
+                    return ConditionValue.initLink(map);
                 },
                 .l_brace, .l_bracket => {
                     var filter: ?Filter = null;
@@ -1040,7 +1049,7 @@
                         map,
                         &additional_data,
                     );
-                    value = ConditionValue.initLink(map);
+                    return ConditionValue.initLink(map);
                 },
 
                 else => return printError(
@@ -1055,8 +1064,8 @@
                 .keyword_none => {
                     const map = allocator.create(std.AutoHashMap(UUID, void)) catch return ZipponError.MemoryError;
                     map.* = std.AutoHashMap(UUID, void).init(allocator);
-                    value = ConditionValue.initArrayLink(map);
                     _ = self.toker.next();
+                    return ConditionValue.initArrayLink(map);
                 },
                 .l_brace, .l_bracket => {
                     var filter: ?Filter = null;
@@ -1088,7 +1097,7 @@
                         map,
                         &additional_data,
                     );
-                    value = ConditionValue.initArrayLink(map);
+                    return ConditionValue.initArrayLink(map);
                 },
                 else => return printError(
                     "Error: Expected uuid or none",
@@ -1100,8 +1109,6 @@
                 ),
             },
             .self => unreachable,
         }
-
-        return value;
     }
 
     /// Check if all token in an array is of one specific type
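
Note (editor's addition, not part of the patch): the change above boils down to one idea — keep a single DataWriter open, stat only the file currently being written, and flush/rotate only when that file grows past MAX_FILE_SIZE, instead of flushing after every row. Below is a minimal standalone Zig sketch of that strategy. The writeBatch function, the "{d}.zid" file naming, and the 1 MB limit are illustrative assumptions only; the real engine resolves files through getFirstUsableIndexFile and writes through zid.DataWriter as shown in the diff.

const std = @import("std");

// Assumed size limit for the sketch; the real limit lives in the engine's config (MAX_FILE_SIZE).
const MAX_FILE_SIZE: u64 = 1024 * 1024;

pub fn writeBatch(dir: std.fs.Dir, rows: []const []const u8) !void {
    var file_index: usize = 0;
    var name_buf: [32]u8 = undefined;

    // Open the first data file once and keep writing through a buffered writer.
    var file = try dir.createFile(try std.fmt.bufPrint(&name_buf, "{d}.zid", .{file_index}), .{});
    var buffered = std.io.bufferedWriter(file.writer());

    for (rows) |row| {
        try buffered.writer().writeAll(row);

        // Stat only the file currently in use, not every data file.
        const stat = try file.stat();
        if (stat.size > MAX_FILE_SIZE) {
            // Flush and rotate only when the current file is full.
            try buffered.flush();
            file.close();

            file_index += 1;
            file = try dir.createFile(try std.fmt.bufPrint(&name_buf, "{d}.zid", .{file_index}), .{});
            buffered = std.io.bufferedWriter(file.writer());
        }
    }

    // One final flush for whatever is still buffered.
    try buffered.flush();
    file.close();
}

Flushing once per rotation (plus once at the end) rather than once per entity is what removes the per-ADD flush cost the commit message describes.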