// Provenance: lib/std/ascii.zig as changed by commit
// c66d3f6bf6be62d565a444792390655f4db3bd7a (Zhora Trush, Tue, 18 Oct 2022 14:49:09 +0200),
// "Enhance indexOfIgnoreCase with Boyer-Moore-Horspool algorithm".

/// Finds `needle` in `haystack`, ignoring case, starting at index 0.
/// Returns the index of the first match, or `null` when there is none.
pub fn indexOfIgnoreCase(haystack: []const u8, needle: []const u8) ?usize {
    return indexOfIgnoreCasePos(haystack, 0, needle);
}

/// Finds `needle` in `haystack`, ignoring case, starting at `start_index`.
/// Uses the Boyer-Moore-Horspool algorithm on large inputs;
/// `indexOfIgnoreCasePosLinear` on small inputs.
///
/// Returns `null` when `needle` is longer than `haystack` or no match is found
/// at or after `start_index`. An empty `needle` matches immediately, so
/// `start_index` itself is returned.
pub fn indexOfIgnoreCasePos(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
    if (needle.len > haystack.len) return null;
    if (needle.len == 0) return start_index;

    // Short haystacks and short needles are dispatched to the plain linear scan.
    if (haystack.len < 52 or needle.len <= 4)
        return indexOfIgnoreCasePosLinear(haystack, start_index, needle);

    var skip_table: [256]usize = undefined;
    boyerMooreHorspoolPreprocessIgnoreCase(needle, &skip_table);

    // Last index at which `needle` still fits; cannot underflow thanks to the
    // length check at the top of the function.
    const end = haystack.len - needle.len;
    var i: usize = start_index;
    while (i <= end) {
        if (eqlIgnoreCase(haystack[i .. i + needle.len], needle)) return i;
        // Shift by the skip distance of the (lower-cased) haystack byte that is
        // aligned with the last byte of the needle.
        i += skip_table[toLower(haystack[i + needle.len - 1])];
    }

    return null;
}

/// Finds `needle` in `haystack`, ignoring case, starting at `start_index`, by
/// comparing at every candidate position in turn.
/// Consider using `indexOfIgnoreCasePos` instead of this, which will automatically use a
/// more sophisticated algorithm on larger inputs.
///
/// Returns `null` when `needle` is longer than `haystack` or no match is found.
pub fn indexOfIgnoreCasePosLinear(haystack: []const u8, start_index: usize, needle: []const u8) ?usize {
    // This function is public and may be called directly, so it must guard the
    // subtraction below itself instead of relying on `indexOfIgnoreCasePos`.
    if (needle.len > haystack.len) return null;

    const end = haystack.len - needle.len;
    var i: usize = start_index;
    while (i <= end) : (i += 1) {
        if (eqlIgnoreCase(haystack[i .. i + needle.len], needle)) return i;
    }
    return null;
}

/// Fills `table` with Boyer-Moore-Horspool skip distances for `pattern`, keyed
/// by lower-cased byte value so lookups must lower-case the haystack byte too.
/// `pattern` must not be empty (`pattern.len - 1` is computed below).
fn boyerMooreHorspoolPreprocessIgnoreCase(pattern: []const u8, table: *[256]usize) void {
    // Bytes that do not occur in the pattern allow a shift of the full pattern length.
    for (table) |*c| {
        c.* = pattern.len;
    }

    var i: usize = 0;
    // The last item is intentionally ignored and the skip size will be pattern.len.
    // This is the standard way Boyer-Moore-Horspool is implemented.
    while (i < pattern.len - 1) : (i += 1) {
        table[toLower(pattern[i])] = pattern.len - 1 - i;
    }
}

test "indexOfIgnoreCase" {
    try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0);
    try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null);
    try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0);

    try std.testing.expect(indexOfIgnoreCase("one two three four five six seven eight nine ten eleven", "ThReE fOUr").? == 8);
    try std.testing.expect(indexOfIgnoreCase("one two three four five six seven eight nine ten eleven", "Two tWo") == null);

    // The linear variant must tolerate a needle longer than the haystack on its own.
    try std.testing.expect(indexOfIgnoreCasePosLinear("foo", 0, "fool") == null);
}
diff --git a/lib/std/mem.zig b/lib/std/mem.zig index 4000030fc0..d43ba70477 100644 --- a/lib/std/mem.zig +++ b/lib/std/mem.zig @@ -1083,7 +1083,7 @@ fn boyerMooreHorspoolPreprocessReverse(pattern: []const u8, table: *[256]usize) var i: usize = pattern.len - 1; // The first item is intentionally ignored and the skip size will be pattern.len. - // This is the standard way boyer-moore-horspool is implemented. + // This is the standard way Boyer-Moore-Horspool is implemented. while (i > 0) : (i -= 1) { table[pattern[i]] = i; } @@ -1096,14 +1096,15 @@ fn boyerMooreHorspoolPreprocess(pattern: []const u8, table: *[256]usize) void { var i: usize = 0; // The last item is intentionally ignored and the skip size will be pattern.len. - // This is the standard way boyer-moore-horspool is implemented. + // This is the standard way Boyer-Moore-Horspool is implemented. while (i < pattern.len - 1) : (i += 1) { table[pattern[i]] = pattern.len - 1 - i; } } + /// Find the index in a slice of a sub-slice, searching from the end backwards. /// To start looking at a different index, slice the haystack first. -/// Uses the Reverse boyer-moore-horspool algorithm on large inputs; +/// Uses the Reverse Boyer-Moore-Horspool algorithm on large inputs; /// `lastIndexOfLinear` on small inputs. pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?usize { if (needle.len > haystack.len) return null; @@ -1131,7 +1132,7 @@ pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?us return null; } -/// Uses Boyer-moore-horspool algorithm on large inputs; `indexOfPosLinear` on small inputs. +/// Uses Boyer-Moore-Horspool algorithm on large inputs; `indexOfPosLinear` on small inputs. 
pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, needle: []const T) ?usize { if (needle.len > haystack.len) return null; if (needle.len == 0) return start_index; @@ -1183,7 +1184,7 @@ test "indexOf" { test "indexOf multibyte" { { - // make haystack and needle long enough to trigger boyer-moore-horspool algorithm + // make haystack and needle long enough to trigger Boyer-Moore-Horspool algorithm const haystack = [1]u16{0} ** 100 ++ [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee, 0x00ff }; const needle = [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee }; try testing.expectEqual(indexOfPos(u16, &haystack, 0, &needle), 100); @@ -1196,7 +1197,7 @@ test "indexOf multibyte" { } { - // make haystack and needle long enough to trigger boyer-moore-horspool algorithm + // make haystack and needle long enough to trigger Boyer-Moore-Horspool algorithm const haystack = [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee, 0x00ff } ++ [1]u16{0} ** 100; const needle = [_]u16{ 0xbbaa, 0xccbb, 0xddcc, 0xeedd, 0xffee }; try testing.expectEqual(lastIndexOf(u16, &haystack, &needle), 0);