Make mem.split and mem.tokenize generic instead of assuming u8

This allows these functions to work on slices of u16, etc.
Ryan Liptak 2021-08-06 01:53:07 -07:00
parent ea7bdeb67d
commit 05fd20dc10

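For context before the diff: the change threads a comptime element type through both functions, so callers now name the element type explicitly. A minimal usage sketch of the new signatures (assuming this commit's version of the standard library; not part of the diff itself):

const std = @import("std");

test "generic tokenize/split usage sketch" {
    // The element type is now an explicit comptime parameter.
    var words = std.mem.tokenize(u8, "  one  two ", " ");
    try std.testing.expectEqualStrings("one", words.next().?);
    try std.testing.expectEqualStrings("two", words.next().?);
    try std.testing.expect(words.next() == null);

    // The same functions now accept wider element types, e.g. UTF-16LE data.
    const utf16 = std.unicode.utf8ToUtf16LeStringLiteral;
    var fields = std.mem.split(u16, utf16("a|b"), utf16("|"));
    try std.testing.expectEqualSlices(u16, utf16("a"), fields.next().?);
    try std.testing.expectEqualSlices(u16, utf16("b"), fields.next().?);
    try std.testing.expect(fields.next() == null);
}

Taking the element type as the first parameter matches other std.mem functions such as eql, startsWith, and indexOfPos, which already take comptime T first.
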
@@ -1575,8 +1575,8 @@ test "bswapAllFields" {
/// If `delimiter_bytes` does not exist in buffer,
/// the iterator will return `buffer`, null, in that order.
/// See also the related function `split`.
pub fn tokenize(buffer: []const u8, delimiter_bytes: []const u8) TokenIterator {
return TokenIterator{
pub fn tokenize(comptime T: type, buffer: []const T, delimiter_bytes: []const T) TokenIterator(T) {
return .{
.index = 0,
.buffer = buffer,
.delimiter_bytes = delimiter_bytes,
@@ -1584,51 +1584,71 @@ pub fn tokenize(buffer: []const u8, delimiter_bytes: []const u8) TokenIterator {
}
test "mem.tokenize" {
var it = tokenize(" abc def ghi ", " ");
var it = tokenize(u8, " abc def ghi ", " ");
try testing.expect(eql(u8, it.next().?, "abc"));
try testing.expect(eql(u8, it.next().?, "def"));
try testing.expect(eql(u8, it.next().?, "ghi"));
try testing.expect(it.next() == null);
it = tokenize("..\\bob", "\\");
it = tokenize(u8, "..\\bob", "\\");
try testing.expect(eql(u8, it.next().?, ".."));
try testing.expect(eql(u8, "..", "..\\bob"[0..it.index]));
try testing.expect(eql(u8, it.next().?, "bob"));
try testing.expect(it.next() == null);
it = tokenize("//a/b", "/");
it = tokenize(u8, "//a/b", "/");
try testing.expect(eql(u8, it.next().?, "a"));
try testing.expect(eql(u8, it.next().?, "b"));
try testing.expect(eql(u8, "//a/b", "//a/b"[0..it.index]));
try testing.expect(it.next() == null);
it = tokenize("|", "|");
it = tokenize(u8, "|", "|");
try testing.expect(it.next() == null);
it = tokenize("", "|");
it = tokenize(u8, "", "|");
try testing.expect(it.next() == null);
it = tokenize("hello", "");
it = tokenize(u8, "hello", "");
try testing.expect(eql(u8, it.next().?, "hello"));
try testing.expect(it.next() == null);
it = tokenize("hello", " ");
it = tokenize(u8, "hello", " ");
try testing.expect(eql(u8, it.next().?, "hello"));
try testing.expect(it.next() == null);
var it16 = tokenize(
u16,
std.unicode.utf8ToUtf16LeStringLiteral("hello"),
std.unicode.utf8ToUtf16LeStringLiteral(" "),
);
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("hello")));
try testing.expect(it16.next() == null);
}
test "mem.tokenize (multibyte)" {
var it = tokenize("a|b,c/d e", " /,|");
var it = tokenize(u8, "a|b,c/d e", " /,|");
try testing.expect(eql(u8, it.next().?, "a"));
try testing.expect(eql(u8, it.next().?, "b"));
try testing.expect(eql(u8, it.next().?, "c"));
try testing.expect(eql(u8, it.next().?, "d"));
try testing.expect(eql(u8, it.next().?, "e"));
try testing.expect(it.next() == null);
var it16 = tokenize(
u16,
std.unicode.utf8ToUtf16LeStringLiteral("a|b,c/d e"),
std.unicode.utf8ToUtf16LeStringLiteral(" /,|"),
);
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("e")));
try testing.expect(it16.next() == null);
}
test "mem.tokenize (reset)" {
var it = tokenize(" abc def ghi ", " ");
var it = tokenize(u8, " abc def ghi ", " ");
try testing.expect(eql(u8, it.next().?, "abc"));
try testing.expect(eql(u8, it.next().?, "def"));
try testing.expect(eql(u8, it.next().?, "ghi"));
@@ -1649,9 +1669,9 @@ test "mem.tokenize (reset)" {
/// the iterator will return `buffer`, null, in that order.
/// The delimiter length must not be zero.
/// See also the related function `tokenize`.
pub fn split(buffer: []const u8, delimiter: []const u8) SplitIterator {
pub fn split(comptime T: type, buffer: []const T, delimiter: []const T) SplitIterator(T) {
assert(delimiter.len != 0);
return SplitIterator{
return .{
.index = 0,
.buffer = buffer,
.delimiter = delimiter,
@@ -1661,35 +1681,55 @@ pub fn split(buffer: []const u8, delimiter: []const u8) SplitIterator {
pub const separate = @compileError("deprecated: renamed to split (behavior remains unchanged)");
test "mem.split" {
var it = split("abc|def||ghi", "|");
var it = split(u8, "abc|def||ghi", "|");
try testing.expect(eql(u8, it.next().?, "abc"));
try testing.expect(eql(u8, it.next().?, "def"));
try testing.expect(eql(u8, it.next().?, ""));
try testing.expect(eql(u8, it.next().?, "ghi"));
try testing.expect(it.next() == null);
it = split("", "|");
it = split(u8, "", "|");
try testing.expect(eql(u8, it.next().?, ""));
try testing.expect(it.next() == null);
it = split("|", "|");
it = split(u8, "|", "|");
try testing.expect(eql(u8, it.next().?, ""));
try testing.expect(eql(u8, it.next().?, ""));
try testing.expect(it.next() == null);
it = split("hello", " ");
it = split(u8, "hello", " ");
try testing.expect(eql(u8, it.next().?, "hello"));
try testing.expect(it.next() == null);
var it16 = split(
u16,
std.unicode.utf8ToUtf16LeStringLiteral("hello"),
std.unicode.utf8ToUtf16LeStringLiteral(" "),
);
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("hello")));
try testing.expect(it16.next() == null);
}
test "mem.split (multibyte)" {
var it = split("a, b ,, c, d, e", ", ");
var it = split(u8, "a, b ,, c, d, e", ", ");
try testing.expect(eql(u8, it.next().?, "a"));
try testing.expect(eql(u8, it.next().?, "b ,"));
try testing.expect(eql(u8, it.next().?, "c"));
try testing.expect(eql(u8, it.next().?, "d"));
try testing.expect(eql(u8, it.next().?, "e"));
try testing.expect(it.next() == null);
var it16 = split(
u16,
std.unicode.utf8ToUtf16LeStringLiteral("a, b ,, c, d, e"),
std.unicode.utf8ToUtf16LeStringLiteral(", "),
);
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("a")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("b ,")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("c")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("d")));
try testing.expect(eql(u16, it16.next().?, std.unicode.utf8ToUtf16LeStringLiteral("e")));
try testing.expect(it16.next() == null);
}
pub fn startsWith(comptime T: type, haystack: []const T, needle: []const T) bool {
@@ -1710,75 +1750,83 @@ test "mem.endsWith" {
try testing.expect(!endsWith(u8, "Bob", "Bo"));
}
pub const TokenIterator = struct {
buffer: []const u8,
delimiter_bytes: []const u8,
index: usize,
pub fn TokenIterator(comptime T: type) type {
return struct {
buffer: []const T,
delimiter_bytes: []const T,
index: usize,
/// Returns a slice of the next token, or null if tokenization is complete.
pub fn next(self: *TokenIterator) ?[]const u8 {
// move to beginning of token
while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
const start = self.index;
if (start == self.buffer.len) {
return null;
}
const Self = @This();
// move to end of token
while (self.index < self.buffer.len and !self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
const end = self.index;
return self.buffer[start..end];
}
/// Returns a slice of the remaining bytes. Does not affect iterator state.
pub fn rest(self: TokenIterator) []const u8 {
// move to beginning of token
var index: usize = self.index;
while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
return self.buffer[index..];
}
/// Resets the iterator to the initial token.
pub fn reset(self: *TokenIterator) void {
self.index = 0;
}
fn isSplitByte(self: TokenIterator, byte: u8) bool {
for (self.delimiter_bytes) |delimiter_byte| {
if (byte == delimiter_byte) {
return true;
/// Returns a slice of the next token, or null if tokenization is complete.
pub fn next(self: *Self) ?[]const T {
// move to beginning of token
while (self.index < self.buffer.len and self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
const start = self.index;
if (start == self.buffer.len) {
return null;
}
// move to end of token
while (self.index < self.buffer.len and !self.isSplitByte(self.buffer[self.index])) : (self.index += 1) {}
const end = self.index;
return self.buffer[start..end];
}
return false;
}
};
pub const SplitIterator = struct {
buffer: []const u8,
index: ?usize,
delimiter: []const u8,
/// Returns a slice of the remaining bytes. Does not affect iterator state.
pub fn rest(self: Self) []const T {
// move to beginning of token
var index: usize = self.index;
while (index < self.buffer.len and self.isSplitByte(self.buffer[index])) : (index += 1) {}
return self.buffer[index..];
}
/// Returns a slice of the next field, or null if splitting is complete.
pub fn next(self: *SplitIterator) ?[]const u8 {
const start = self.index orelse return null;
const end = if (indexOfPos(u8, self.buffer, start, self.delimiter)) |delim_start| blk: {
self.index = delim_start + self.delimiter.len;
break :blk delim_start;
} else blk: {
self.index = null;
break :blk self.buffer.len;
};
return self.buffer[start..end];
}
/// Resets the iterator to the initial token.
pub fn reset(self: *Self) void {
self.index = 0;
}
/// Returns a slice of the remaining bytes. Does not affect iterator state.
pub fn rest(self: SplitIterator) []const u8 {
const end = self.buffer.len;
const start = self.index orelse end;
return self.buffer[start..end];
}
};
fn isSplitByte(self: Self, byte: T) bool {
for (self.delimiter_bytes) |delimiter_byte| {
if (byte == delimiter_byte) {
return true;
}
}
return false;
}
};
}
pub fn SplitIterator(comptime T: type) type {
return struct {
buffer: []const T,
index: ?usize,
delimiter: []const T,
const Self = @This();
/// Returns a slice of the next field, or null if splitting is complete.
pub fn next(self: *Self) ?[]const T {
const start = self.index orelse return null;
const end = if (indexOfPos(T, self.buffer, start, self.delimiter)) |delim_start| blk: {
self.index = delim_start + self.delimiter.len;
break :blk delim_start;
} else blk: {
self.index = null;
break :blk self.buffer.len;
};
return self.buffer[start..end];
}
/// Returns a slice of the remaining bytes. Does not affect iterator state.
pub fn rest(self: Self) []const T {
const end = self.buffer.len;
const start = self.index orelse end;
return self.buffer[start..end];
}
};
}
/// Naively combines a series of slices with a separator.
/// Allocates memory for the result, which must be freed by the caller.
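
Stepping outside the diff: the now-generic TokenIterator keeps the rest and reset helpers shown in the hunks above. A small behavioral sketch (again assuming this commit's version of the standard library):

const std = @import("std");

test "TokenIterator rest/reset sketch" {
    var it = std.mem.tokenize(u8, "a b c", " ");
    _ = it.next(); // consumes "a"
    // rest() returns everything after the consumed tokens without advancing the iterator.
    try std.testing.expectEqualStrings("b c", it.rest());
    // reset() rewinds to the start of the buffer.
    it.reset();
    try std.testing.expectEqualStrings("a", it.next().?);
}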