diff --git a/lib/std/hash/benchmark.zig b/lib/std/hash/benchmark.zig
index 699de5ceb4..322adeb61c 100644
--- a/lib/std/hash/benchmark.zig
+++ b/lib/std/hash/benchmark.zig
@@ -18,6 +18,7 @@ const Hash = struct {
     name: []const u8,
     has_iterative_api: bool = true,
     has_crypto_api: bool = false,
+    has_anytype_api: ?[]const comptime_int = null,
     init_u8s: ?[]const u8 = null,
     init_u64: ?u64 = null,
 };
@@ -27,11 +28,13 @@ const hashes = [_]Hash{
         .ty = hash.XxHash64,
         .name = "xxhash64",
         .init_u64 = 0,
+        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
     },
     Hash{
         .ty = hash.XxHash32,
         .name = "xxhash32",
         .init_u64 = 0,
+        .has_anytype_api = @as([]const comptime_int, &[_]comptime_int{ 8, 16, 32, 48, 64, 80, 96, 112, 128 }),
     },
     Hash{
         .ty = hash.Wyhash,
@@ -99,14 +102,14 @@ const Result = struct {
 };
 
 const block_size: usize = 8 * 8192;
-const alignment: usize = 64;
 
 pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Allocator) !Result {
-    const blocks_count = bytes / block_size;
-    var blocks = try allocator.alloc(u8, block_size + alignment * (blocks_count - 1));
+    var blocks = try allocator.alloc(u8, bytes);
     defer allocator.free(blocks);
     random.bytes(blocks);
 
+    const block_count = bytes / block_size;
+
     var h = blk: {
         if (H.init_u8s) |init| {
             break :blk H.ty.init(init[0..H.ty.key_length]);
@@ -118,17 +121,17 @@ pub fn benchmarkHash(comptime H: anytype, bytes: usize, allocator: std.mem.Alloc
     };
 
     var timer = try Timer.start();
-    const start = timer.lap();
-    for (0..blocks_count) |i| {
-        h.update(blocks[i * alignment ..][0..block_size]);
+    for (0..block_count) |i| {
+        h.update(blocks[i * block_size ..][0..block_size]);
     }
     const final = if (H.has_crypto_api) @as(u64, @truncate(h.finalInt())) else h.final();
     std.mem.doNotOptimizeAway(final);
-    const end = timer.read();
+    const elapsed_ns = timer.read();
 
-    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
-    const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const size_float: f64 = @floatFromInt(block_size * block_count);
+    const throughput: u64 = @intFromFloat(size_float / elapsed_s);
 
     return Result{
         .hash = final,
@@ -144,7 +147,6 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
     const key_count = bytes / key_size;
 
     var timer = try Timer.start();
-    const start = timer.lap();
 
     var sum: u64 = 0;
     for (0..key_count) |i| {
@@ -164,10 +166,11 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
         };
         sum +%= final;
     }
-    const end = timer.read();
+    const elapsed_ns = timer.read();
 
-    const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s;
-    const throughput = @as(u64, @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s));
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const size_float: f64 = @floatFromInt(key_count * key_size);
+    const throughput: u64 = @intFromFloat(size_float / elapsed_s);
 
     std.mem.doNotOptimizeAway(sum);
 
@@ -177,6 +180,143 @@ pub fn benchmarkHashSmallKeys(comptime H: anytype, key_size: usize, bytes: usize
     };
 }
 
+// The array and array-pointer benchmarks for xxhash are very sensitive to inlining.
+// If you see strange performance changes, consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not just due to the optimiser inlining the benchmark differently.
+pub fn benchmarkHashSmallKeysArrayPtr(
+    comptime H: anytype,
+    comptime key_size: usize,
+    bytes: usize,
+    allocator: std.mem.Allocator,
+) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    for (0..key_count) |i| {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hash(init, small_key);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hash(init, small_key);
+            }
+            break :blk H.ty.hash(small_key);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .hash = sum,
+        .throughput = throughput,
+    };
+}
+
+// The array and array-pointer benchmarks for xxhash are very sensitive to inlining.
+// If you see strange performance changes, consider using `.never_inline` or `.always_inline`
+// to ensure the changes are not just due to the optimiser inlining the benchmark differently.
+pub fn benchmarkHashSmallKeysArray(
+    comptime H: anytype,
+    comptime key_size: usize,
+    bytes: usize,
+    allocator: std.mem.Allocator,
+) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var i: usize = 0;
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    while (i < key_count) : (i += 1) {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hash(init, small_key.*);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hash(init, small_key.*);
+            }
+            break :blk H.ty.hash(small_key.*);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .hash = sum,
+        .throughput = throughput,
+    };
+}
+
+pub fn benchmarkHashSmallApi(comptime H: anytype, key_size: usize, bytes: usize, allocator: std.mem.Allocator) !Result {
+    var blocks = try allocator.alloc(u8, bytes);
+    defer allocator.free(blocks);
+    random.bytes(blocks);
+
+    const key_count = bytes / key_size;
+
+    var timer = try Timer.start();
+
+    var sum: u64 = 0;
+    for (0..key_count) |i| {
+        const small_key = blocks[i * key_size ..][0..key_size];
+        const final: u64 = blk: {
+            if (H.init_u8s) |init| {
+                if (H.has_crypto_api) {
+                    break :blk @truncate(H.ty.toInt(small_key, init[0..H.ty.key_length]));
+                } else {
+                    break :blk H.ty.hashSmall(init, small_key);
+                }
+            }
+            if (H.init_u64) |init| {
+                break :blk H.ty.hashSmall(init, small_key);
+            }
+            break :blk H.ty.hashSmall(small_key);
+        };
+        sum +%= final;
+    }
+    const elapsed_ns = timer.read();
+
+    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / time.ns_per_s;
+    const throughput: u64 = @intFromFloat(@as(f64, @floatFromInt(bytes)) / elapsed_s);
+
+    std.mem.doNotOptimizeAway(sum);
+
+    return Result{
+        .throughput = throughput,
+        .hash = sum,
+    };
+}
+
 fn usage() void {
     std.debug.print(
         \\throughput_test [options]
@@ -205,9 +345,12 @@ pub fn main() !void {
     var filter: ?[]u8 = "";
     var count: usize = mode(128 * MiB);
-    var key_size: usize = 32;
+    var key_size: ?usize = null;
     var seed: u32 = 0;
     var test_iterative_only = false;
+    var test_arrays = false;
+
+    const default_small_key_size = 32;
 
     var i: usize = 1;
     while (i < args.len) : (i += 1) {
@@ -248,12 +391,14 @@ pub fn main() !void {
             }
             key_size = try std.fmt.parseUnsigned(usize, args[i], 10);
 
-            if (key_size > block_size) {
+            if (key_size.? > block_size) {
                 try stdout.print("key_size cannot exceed block size of {}\n", .{block_size});
                 std.os.exit(1);
             }
         } else if (std.mem.eql(u8, args[i], "--iterative-only")) {
            test_iterative_only = true;
+        } else if (std.mem.eql(u8, args[i], "--include-array")) {
+            test_arrays = true;
         } else if (std.mem.eql(u8, args[i], "--help")) {
             usage();
             return;
@@ -268,7 +413,7 @@ pub fn main() !void {
     const allocator = gpa.allocator();
 
     inline for (hashes) |H| {
-        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) {
+        if (filter == null or std.mem.indexOf(u8, H.name, filter.?) != null) hash: {
             if (!test_iterative_only or H.has_iterative_api) {
                 try stdout.print("{s}\n", .{H.name});
@@ -281,9 +426,69 @@ pub fn main() !void {
             }
 
             if (!test_iterative_only) {
-                prng.seed(seed);
-                const result_small = try benchmarkHashSmallKeys(H, key_size, count, allocator);
-                try stdout.print("   small keys: {:5} MiB/s [{x:0<16}]\n", .{ result_small.throughput / (1 * MiB), result_small.hash });
+                if (key_size) |size| {
+                    prng.seed(seed);
+                    const result_small = try benchmarkHashSmallKeys(H, size, count, allocator);
+                    try stdout.print("   small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+                        size,
+                        result_small.throughput / (1 * MiB),
+                        result_small.throughput / size,
+                        result_small.hash,
+                    });
+
+                    if (!test_arrays) break :hash;
+                    if (H.has_anytype_api) |sizes| {
+                        inline for (sizes) |exact_size| {
+                            if (size == exact_size) {
+                                prng.seed(seed);
+                                const result_array = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+                                prng.seed(seed);
+                                const result_ptr = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+                                try stdout.print("        array: {:5} MiB/s [{x:0<16}]\n", .{
+                                    result_array.throughput / (1 * MiB),
+                                    result_array.hash,
+                                });
+                                try stdout.print("    array ptr: {:5} MiB/s [{x:0<16}]\n", .{
+                                    result_ptr.throughput / (1 * MiB),
+                                    result_ptr.hash,
+                                });
+                            }
+                        }
+                    }
+                } else {
+                    prng.seed(seed);
+                    const result_small = try benchmarkHashSmallKeys(H, default_small_key_size, count, allocator);
+                    try stdout.print("   small keys: {:3}B {:5} MiB/s {} Hashes/s [{x:0<16}]\n", .{
+                        default_small_key_size,
+                        result_small.throughput / (1 * MiB),
+                        result_small.throughput / default_small_key_size,
+                        result_small.hash,
+                    });
+
+                    if (!test_arrays) break :hash;
+                    if (H.has_anytype_api) |sizes| {
+                        try stdout.print("        array:\n", .{});
+                        inline for (sizes) |exact_size| {
+                            prng.seed(seed);
+                            const result = try benchmarkHashSmallKeysArray(H, exact_size, count, allocator);
+                            try stdout.print("         {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+                                exact_size,
+                                result.throughput / (1 * MiB),
+                                result.hash,
+                            });
+                        }
+                        try stdout.print("    array ptr: \n", .{});
+                        inline for (sizes) |exact_size| {
+                            prng.seed(seed);
+                            const result = try benchmarkHashSmallKeysArrayPtr(H, exact_size, count, allocator);
+                            try stdout.print("         {d: >3}B {:5} MiB/s [{x:0<16}]\n", .{
+                                exact_size,
+                                result.throughput / (1 * MiB),
+                                result.hash,
+                            });
+                        }
+                    }
+                }
             }
         }
     }
diff --git a/lib/std/hash/xxhash.zig b/lib/std/hash/xxhash.zig
index f1d1da429d..8e25b9ce7b 100644
--- a/lib/std/hash/xxhash.zig
+++ b/lib/std/hash/xxhash.zig
@@ -5,11 +5,7 @@ const expectEqual = std.testing.expectEqual;
 const rotl = std.math.rotl;
 
 pub const XxHash64 = struct {
-    acc1: u64,
-    acc2: u64,
-    acc3: u64,
-    acc4: u64,
-
+    accumulator: Accumulator,
     seed: u64,
     buf: [32]u8,
     buf_len: usize,
@@ -21,20 +17,174 @@ pub const XxHash64 = struct {
     const prime_4 = 0x85EBCA77C2B2AE63; // 0b1000010111101011110010100111011111000010101100101010111001100011
     const prime_5 = 0x27D4EB2F165667C5; // 0b0010011111010100111010110010111100010110010101100110011111000101
 
+    const Accumulator = struct {
+        acc1: u64,
+        acc2: u64,
+        acc3: u64,
+        acc4: u64,
+
+        fn init(seed: u64) Accumulator {
+            return .{
+                .acc1 = seed +% prime_1 +% prime_2,
+                .acc2 = seed +% prime_2,
+                .acc3 = seed,
+                .acc4 = seed -% prime_1,
+            };
+        }
+
+        fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
+            var i: usize = 0;
+
+            if (unroll_count > 0) {
+                const unrolled_bytes = unroll_count * 32;
+                while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
+                    inline for (0..unroll_count) |j| {
+                        self.processStripe(input[i + j * 32 ..][0..32]);
+                    }
+                }
+            }
+
+            while (i + 32 <= input.len) : (i += 32) {
+                self.processStripe(input[i..][0..32]);
+            }
+
+            return i;
+        }
+
+        fn processStripe(self: *Accumulator, buf: *const [32]u8) void {
+            self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
+            self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
+            self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
+            self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
+        }
+
+        fn merge(self: Accumulator) u64 {
+            var acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
+                rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
+            acc = mergeAccumulator(acc, self.acc1);
+            acc = mergeAccumulator(acc, self.acc2);
+            acc = mergeAccumulator(acc, self.acc3);
+            acc = mergeAccumulator(acc, self.acc4);
+            return acc;
+        }
+
+        fn mergeAccumulator(acc: u64, other: u64) u64 {
+            const a = acc ^ round(0, other);
+            const b = a *% prime_1;
+            return b +% prime_4;
+        }
+    };
+
+    fn finalize(
+        unfinished: u64,
+        byte_count: usize,
+        partial: anytype,
+    ) u64 {
+        std.debug.assert(partial.len < 32);
+        var acc = unfinished +% @as(u64, byte_count) +% @as(u64, partial.len);
+
+        switch (partial.len) {
+            inline 0, 1, 2, 3 => |count| {
+                inline for (0..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 4, 5, 6, 7 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                inline for (4..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 8, 9, 10, 11 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                inline for (8..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 12, 13, 14, 15 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize4(acc, partial[8..12]);
+                inline for (12..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 16, 17, 18, 19 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                inline for (16..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 20, 21, 22, 23 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize4(acc, partial[16..20]);
+                inline for (20..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 24, 25, 26, 27 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize8(acc, partial[16..24]);
+                inline for (24..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 28, 29, 30, 31 => |count| {
+                acc = finalize8(acc, partial[0..8]);
+                acc = finalize8(acc, partial[8..16]);
+                acc = finalize8(acc, partial[16..24]);
+                acc = finalize4(acc, partial[24..28]);
+                inline for (28..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            else => unreachable,
+        }
+    }
+
+    fn finalize8(v: u64, bytes: *const [8]u8) u64 {
+        var acc = v;
+        const lane = mem.readIntLittle(u64, bytes);
+        acc ^= round(0, lane);
+        acc = rotl(u64, acc, 27) *% prime_1;
+        acc +%= prime_4;
+        return acc;
+    }
+
+    fn finalize4(v: u64, bytes: *const [4]u8) u64 {
+        var acc = v;
+        const lane = @as(u64, mem.readIntLittle(u32, bytes));
+        acc ^= lane *% prime_1;
+        acc = rotl(u64, acc, 23) *% prime_2;
+        acc +%= prime_3;
+        return acc;
+    }
+
+    fn finalize1(v: u64, byte: u8) u64 {
+        var acc = v;
+        const lane = @as(u64, byte);
+        acc ^= lane *% prime_5;
+        acc = rotl(u64, acc, 11) *% prime_1;
+        return acc;
+    }
+
+    fn avalanche(value: u64) u64 {
+        var result = value ^ (value >> 33);
+        result *%= prime_2;
+        result ^= result >> 29;
+        result *%= prime_3;
+        result ^= result >> 32;
+
+        return result;
+    }
+
     pub fn init(seed: u64) XxHash64 {
         return XxHash64{
+            .accumulator = Accumulator.init(seed),
             .seed = seed,
-            .acc1 = seed +% prime_1 +% prime_2,
-            .acc2 = seed +% prime_2,
-            .acc3 = seed,
-            .acc4 = seed -% prime_1,
             .buf = undefined,
             .buf_len = 0,
             .byte_count = 0,
         };
     }
 
-    pub fn update(self: *XxHash64, input: []const u8) void {
+    pub fn update(self: *XxHash64, input: anytype) void {
+        validateType(@TypeOf(input));
+
         if (input.len < 32 - self.buf_len) {
             @memcpy(self.buf[self.buf_len..][0..input.len], input);
             self.buf_len += input.len;
@@ -46,99 +196,54 @@ pub const XxHash64 = struct {
 
         if (self.buf_len > 0) {
             i = 32 - self.buf_len;
             @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
-            self.processStripe(&self.buf);
-            self.buf_len = 0;
+            self.accumulator.processStripe(&self.buf);
+            self.byte_count += self.buf_len;
         }
 
-        while (i + 32 <= input.len) : (i += 32) {
-            self.processStripe(input[i..][0..32]);
-        }
+        i += self.accumulator.updateEmpty(input[i..], 32);
+        self.byte_count += i;
 
         const remaining_bytes = input[i..];
         @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
         self.buf_len = remaining_bytes.len;
     }
 
-    inline fn processStripe(self: *XxHash64, buf: *const [32]u8) void {
-        self.acc1 = round(self.acc1, mem.readIntLittle(u64, buf[0..8]));
-        self.acc2 = round(self.acc2, mem.readIntLittle(u64, buf[8..16]));
-        self.acc3 = round(self.acc3, mem.readIntLittle(u64, buf[16..24]));
-        self.acc4 = round(self.acc4, mem.readIntLittle(u64, buf[24..32]));
-        self.byte_count += 32;
-    }
-
-    inline fn round(acc: u64, lane: u64) u64 {
+    fn round(acc: u64, lane: u64) u64 {
         const a = acc +% (lane *% prime_2);
         const b = rotl(u64, a, 31);
         return b *% prime_1;
     }
 
     pub fn final(self: *XxHash64) u64 {
-        var acc: u64 = undefined;
+        const unfinished = if (self.byte_count < 32)
+            self.seed +% prime_5
+        else
+            self.accumulator.merge();
 
-        if (self.byte_count < 32) {
-            acc = self.seed +% prime_5;
+        return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
+    }
+
+    const Size = enum {
+        small,
+        large,
+        unknown,
+    };
+
+    pub fn hash(seed: u64, input: anytype) u64 {
+        validateType(@TypeOf(input));
+
+        if (input.len < 32) {
+            return finalize(seed +% prime_5, 0, input);
         } else {
-            acc = rotl(u64, self.acc1, 1) +% rotl(u64, self.acc2, 7) +%
-                rotl(u64, self.acc3, 12) +% rotl(u64, self.acc4, 18);
-            acc = mergeAccumulator(acc, self.acc1);
-            acc = mergeAccumulator(acc, self.acc2);
-            acc = mergeAccumulator(acc, self.acc3);
-            acc = mergeAccumulator(acc, self.acc4);
+            var hasher = Accumulator.init(seed);
+            const i = hasher.updateEmpty(input, 0);
+            return finalize(hasher.merge(), i, input[i..]);
         }
-
-        acc = acc +% @as(u64, self.byte_count) +% @as(u64, self.buf_len);
-
-        var pos: usize = 0;
-        while (pos + 8 <= self.buf_len) : (pos += 8) {
-            const lane = mem.readIntLittle(u64, self.buf[pos..][0..8]);
-            acc ^= round(0, lane);
-            acc = rotl(u64, acc, 27) *% prime_1;
-            acc +%= prime_4;
-        }
-
-        if (pos + 4 <= self.buf_len) {
-            const lane = @as(u64, mem.readIntLittle(u32, self.buf[pos..][0..4]));
-            acc ^= lane *% prime_1;
-            acc = rotl(u64, acc, 23) *% prime_2;
-            acc +%= prime_3;
-            pos += 4;
-        }
-
-        while (pos < self.buf_len) : (pos += 1) {
-            const lane = @as(u64, self.buf[pos]);
-            acc ^= lane *% prime_5;
-            acc = rotl(u64, acc, 11) *% prime_1;
-        }
-
-        acc ^= acc >> 33;
-        acc *%= prime_2;
-        acc ^= acc >> 29;
-        acc *%= prime_3;
-        acc ^= acc >> 32;
-
-        return acc;
-    }
-
-    inline fn mergeAccumulator(acc: u64, other: u64) u64 {
-        const a = acc ^ round(0, other);
-        const b = a *% prime_1;
-        return b +% prime_4;
-    }
-
-    pub fn hash(seed: u64, input: []const u8) u64 {
-        var hasher = XxHash64.init(seed);
-        hasher.update(input);
-        return hasher.final();
     }
 };
 
 pub const XxHash32 = struct {
-    acc1: u32,
-    acc2: u32,
-    acc3: u32,
-    acc4: u32,
-
+    accumulator: Accumulator,
     seed: u32,
     buf: [16]u8,
     buf_len: usize,
@@ -150,13 +255,57 @@ pub const XxHash32 = struct {
     const prime_4 = 0x27D4EB2F; // 0b00100111110101001110101100101111
     const prime_5 = 0x165667B1; // 0b00010110010101100110011110110001
 
+    const Accumulator = struct {
+        acc1: u32,
+        acc2: u32,
+        acc3: u32,
+        acc4: u32,
+
+        fn init(seed: u32) Accumulator {
+            return .{
+                .acc1 = seed +% prime_1 +% prime_2,
+                .acc2 = seed +% prime_2,
+                .acc3 = seed,
+                .acc4 = seed -% prime_1,
+            };
+        }
+
+        fn updateEmpty(self: *Accumulator, input: anytype, comptime unroll_count: usize) usize {
+            var i: usize = 0;
+
+            if (unroll_count > 0) {
+                const unrolled_bytes = unroll_count * 16;
+                while (i + unrolled_bytes <= input.len) : (i += unrolled_bytes) {
+                    inline for (0..unroll_count) |j| {
+                        self.processStripe(input[i + j * 16 ..][0..16]);
+                    }
+                }
+            }
+
+            while (i + 16 <= input.len) : (i += 16) {
+                self.processStripe(input[i..][0..16]);
+            }
+
+            return i;
+        }
+
+        fn processStripe(self: *Accumulator, buf: *const [16]u8) void {
+            self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
+            self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
+            self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
+            self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
+        }
+
+        fn merge(self: Accumulator) u32 {
+            return rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
+                rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
+        }
+    };
+
     pub fn init(seed: u32) XxHash32 {
         return XxHash32{
+            .accumulator = Accumulator.init(seed),
             .seed = seed,
-            .acc1 = seed +% prime_1 +% prime_2,
-            .acc2 = seed +% prime_2,
-            .acc3 = seed,
-            .acc4 = seed -% prime_1,
             .buf = undefined,
             .buf_len = 0,
             .byte_count = 0,
@@ -164,6 +313,8 @@ pub const XxHash32 = struct {
     }
 
     pub fn update(self: *XxHash32, input: []const u8) void {
+        validateType(@TypeOf(input));
+
         if (input.len < 16 - self.buf_len) {
             @memcpy(self.buf[self.buf_len..][0..input.len], input);
             self.buf_len += input.len;
@@ -175,59 +326,85 @@ pub const XxHash32 = struct {
         if (self.buf_len > 0) {
             i = 16 - self.buf_len;
             @memcpy(self.buf[self.buf_len..][0..i], input[0..i]);
-            self.processStripe(&self.buf);
+            self.accumulator.processStripe(&self.buf);
+            self.byte_count += self.buf_len;
             self.buf_len = 0;
         }
 
-        while (i + 16 <= input.len) : (i += 16) {
-            self.processStripe(input[i..][0..16]);
-        }
+        i += self.accumulator.updateEmpty(input[i..], 16);
+        self.byte_count += i;
 
         const remaining_bytes = input[i..];
         @memcpy(self.buf[0..remaining_bytes.len], remaining_bytes);
         self.buf_len = remaining_bytes.len;
     }
 
-    inline fn processStripe(self: *XxHash32, buf: *const [16]u8) void {
-        self.acc1 = round(self.acc1, mem.readIntLittle(u32, buf[0..4]));
-        self.acc2 = round(self.acc2, mem.readIntLittle(u32, buf[4..8]));
-        self.acc3 = round(self.acc3, mem.readIntLittle(u32, buf[8..12]));
-        self.acc4 = round(self.acc4, mem.readIntLittle(u32, buf[12..16]));
-        self.byte_count += 16;
-    }
-
-    inline fn round(acc: u32, lane: u32) u32 {
+    fn round(acc: u32, lane: u32) u32 {
         const a = acc +% (lane *% prime_2);
         const b = rotl(u32, a, 13);
         return b *% prime_1;
     }
 
     pub fn final(self: *XxHash32) u32 {
-        var acc: u32 = undefined;
+        const unfinished = if (self.byte_count < 16)
+            self.seed +% prime_5
+        else
+            self.accumulator.merge();
 
-        if (self.byte_count < 16) {
-            acc = self.seed +% prime_5;
-        } else {
-            acc = rotl(u32, self.acc1, 1) +% rotl(u32, self.acc2, 7) +%
-                rotl(u32, self.acc3, 12) +% rotl(u32, self.acc4, 18);
+        return finalize(unfinished, self.byte_count, self.buf[0..self.buf_len]);
+    }
+
+    fn finalize(unfinished: u32, byte_count: usize, partial: anytype) u32 {
+        std.debug.assert(partial.len < 16);
+        var acc = unfinished +% @as(u32, @intCast(byte_count)) +% @as(u32, @intCast(partial.len));
+
+        switch (partial.len) {
+            inline 0, 1, 2, 3 => |count| {
+                inline for (0..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 4, 5, 6, 7 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                inline for (4..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 8, 9, 10, 11 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                acc = finalize4(acc, partial[4..8]);
+                inline for (8..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            inline 12, 13, 14, 15 => |count| {
+                acc = finalize4(acc, partial[0..4]);
+                acc = finalize4(acc, partial[4..8]);
+                acc = finalize4(acc, partial[8..12]);
+                inline for (12..count) |i| acc = finalize1(acc, partial[i]);
+                return avalanche(acc);
+            },
+            else => unreachable,
         }
-        acc = acc +% @as(u32, @intCast(self.byte_count)) +% @as(u32, @intCast(self.buf_len));
 
+        return avalanche(acc);
+    }
 
-        var pos: usize = 0;
-        while (pos + 4 <= self.buf_len) : (pos += 4) {
-            const lane = mem.readIntLittle(u32, self.buf[pos..][0..4]);
-            acc +%= lane *% prime_3;
-            acc = rotl(u32, acc, 17) *% prime_4;
-        }
+    fn finalize4(v: u32, bytes: *const [4]u8) u32 {
+        var acc = v;
+        const lane = mem.readIntLittle(u32, bytes);
+        acc +%= lane *% prime_3;
+        acc = rotl(u32, acc, 17) *% prime_4;
+        return acc;
+    }
 
-        while (pos < self.buf_len) : (pos += 1) {
-            const lane = @as(u32, self.buf[pos]);
-            acc +%= lane *% prime_5;
-            acc = rotl(u32, acc, 11) *% prime_1;
-        }
+    fn finalize1(v: u32, byte: u8) u32 {
+        var acc = v;
+        const lane = @as(u32, byte);
+        acc +%= lane *% prime_5;
+        acc = rotl(u32, acc, 11) *% prime_1;
+        return acc;
+    }
 
-        acc ^= acc >> 15;
+    fn avalanche(value: u32) u32 {
+        var acc = value ^ value >> 15;
         acc *%= prime_2;
         acc ^= acc >> 13;
         acc *%= prime_3;
@@ -236,33 +413,58 @@ pub const XxHash32 = struct {
         return acc;
     }
 
-    pub fn hash(seed: u32, input: []const u8) u32 {
-        var hasher = XxHash32.init(seed);
-        hasher.update(input);
-        return hasher.final();
+    pub fn hash(seed: u32, input: anytype) u32 {
+        validateType(@TypeOf(input));
+
+        if (input.len < 16) {
+            return finalize(seed +% prime_5, 0, input);
+        } else {
+            var hasher = Accumulator.init(seed);
+            const i = hasher.updateEmpty(input, 0);
+            return finalize(hasher.merge(), i, input[i..]);
+        }
     }
 };
 
-test "xxhash64" {
-    const hash = XxHash64.hash;
+fn validateType(comptime T: type) void {
+    comptime {
+        if (!((std.meta.trait.isSlice(T) or
+            std.meta.trait.is(.Array)(T) or
+            std.meta.trait.isPtrTo(.Array)(T)) and
+            std.meta.Elem(T) == u8))
+        {
+            @compileError("expected a slice, array or pointer to array of u8, got " ++ @typeName(T));
+        }
+    }
+}
 
-    try expectEqual(hash(0, ""), 0xef46db3751d8e999);
-    try expectEqual(hash(0, "a"), 0xd24ec4f1a98c6e5b);
-    try expectEqual(hash(0, "abc"), 0x44bc2cf5ad770999);
-    try expectEqual(hash(0, "message digest"), 0x066ed728fceeb3be);
-    try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0xcfe1f278fa89835c);
-    try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0xaaa46907d3047814);
-    try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0xe04a477f19ee145d);
+fn testExpect(comptime H: type, seed: anytype, input: []const u8, expected: u64) !void {
+    try expectEqual(expected, H.hash(seed, input));
+
+    var hasher = H.init(seed);
+    hasher.update(input);
+    try expectEqual(expected, hasher.final());
+}
+
+test "xxhash64" {
+    const H = XxHash64;
+    try testExpect(H, 0, "", 0xef46db3751d8e999);
+    try testExpect(H, 0, "a", 0xd24ec4f1a98c6e5b);
+    try testExpect(H, 0, "abc", 0x44bc2cf5ad770999);
+    try testExpect(H, 0, "message digest", 0x066ed728fceeb3be);
+    try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0xcfe1f278fa89835c);
+    try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0xaaa46907d3047814);
+    try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0xe04a477f19ee145d);
 }
 
 test "xxhash32" {
-    const hash = XxHash32.hash;
+    const H = XxHash32;
 
-    try expectEqual(hash(0, ""), 0x02cc5d05);
-    try expectEqual(hash(0, "a"), 0x550d7456);
-    try expectEqual(hash(0, "abc"), 0x32d153ff);
-    try expectEqual(hash(0, "message digest"), 0x7c948494);
-    try expectEqual(hash(0, "abcdefghijklmnopqrstuvwxyz"), 0x63a14d5f);
-    try expectEqual(hash(0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"), 0x9c285e64);
-    try expectEqual(hash(0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890"), 0x9c05f475);
+    try testExpect(H, 0, "", 0x02cc5d05);
+    try testExpect(H, 0, "a", 0x550d7456);
+    try testExpect(H, 0, "abc", 0x32d153ff);
+    try testExpect(H, 0, "message digest", 0x7c948494);
+    try testExpect(H, 0, "abcdefghijklmnopqrstuvwxyz", 0x63a14d5f);
+    try testExpect(H, 0, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 0x9c285e64);
+    try testExpect(H, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890", 0x9c05f475);
 }
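A minimal sketch of what the anytype-based `hash` API above accepts once this patch is applied; it is not part of the diff, and the test name and byte values are illustrative only. The `validateType` check admits a slice, an array by value, or a pointer to an array, as long as the element type is u8, so all three views of the same bytes should produce the same digest.

const std = @import("std");
const XxHash64 = std.hash.XxHash64;

test "xxhash64 anytype inputs (illustrative)" {
    const bytes = [_]u8{ 1, 2, 3, 4, 5, 6, 7, 8 };

    // Slice of u8 (the pre-existing calling convention).
    const from_slice = XxHash64.hash(0, @as([]const u8, &bytes));
    // Array passed by value, with a comptime-known length.
    const from_array = XxHash64.hash(0, bytes);
    // Pointer to an array, as used by the array-ptr benchmark.
    const from_ptr = XxHash64.hash(0, &bytes);

    try std.testing.expectEqual(from_slice, from_array);
    try std.testing.expectEqual(from_slice, from_ptr);
}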