From ba41a9d5d7b761ad8e3ad0f1b863c7f6457598a6 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Tue, 26 May 2020 14:07:47 -0400 Subject: [PATCH] different strategy for tokenizing keywords throughput: 279 MiB/s => 347 MiB/s --- lib/std/zig/tokenizer.zig | 49 ++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/lib/std/zig/tokenizer.zig b/lib/std/zig/tokenizer.zig index 3ed57f621f..defb888027 100644 --- a/lib/std/zig/tokenizer.zig +++ b/lib/std/zig/tokenizer.zig @@ -13,14 +13,11 @@ pub const Token = struct { pub const Keyword = struct { bytes: []const u8, id: Id, - hash: u32, fn init(bytes: []const u8, id: Id) Keyword { - @setEvalBranchQuota(2000); return .{ .bytes = bytes, .id = id, - .hash = std.hash_map.hashString(bytes), }; } }; @@ -79,15 +76,49 @@ pub const Token = struct { Keyword.init("while", .Keyword_while), }; - // TODO perfect hash at comptime pub fn getKeyword(bytes: []const u8) ?Id { - var hash = std.hash_map.hashString(bytes); - for (keywords) |kw| { - if (kw.hash == hash and mem.eql(u8, kw.bytes, bytes)) { - return kw.id; + const precomputed = comptime blk: { + @setEvalBranchQuota(2000); + var sorted_keywords = keywords; + const lenAsc = (struct { + fn lenAsc(a: Keyword, b: Keyword) bool { + return a.bytes.len < b.bytes.len; + } + }).lenAsc; + std.sort.sort(Keyword, &sorted_keywords, lenAsc); + const min_len = sorted_keywords[0].bytes.len; + const max_len = sorted_keywords[sorted_keywords.len - 1].bytes.len; + var len_indexes: [max_len + 1]usize = undefined; + var len: usize = 0; + var kw_i: usize = 0; + while (len <= max_len) : (len += 1) { + // find the first keyword len == len + while (len > sorted_keywords[kw_i].bytes.len) { + kw_i += 1; + } + len_indexes[len] = kw_i; } + break :blk .{ + .min_len = min_len, + .max_len = max_len, + .sorted_keywords = sorted_keywords, + .len_indexes = len_indexes, + }; + }; + if (bytes.len < precomputed.min_len or bytes.len > precomputed.max_len) + return null; + + var i = precomputed.len_indexes[bytes.len]; + while (true) { + const kw = precomputed.sorted_keywords[i]; + if (kw.bytes.len != bytes.len) + return null; + if (mem.eql(u8, kw.bytes, bytes)) + return kw.id; + i += 1; + if (i >= precomputed.sorted_keywords.len) + return null; } - return null; } pub const Id = enum {