Merge pull request #17161 from tiehuis/vectorize-index-of-scalar
std.mem: add vectorized indexOfScalarPos and indexOfSentinel
commit 376242e586

 lib/std/mem.zig | 166
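
Before the diff itself, a quick orientation: the PR does not change the public signatures of `std.mem.indexOfScalarPos` or `std.mem.indexOfSentinel`; it only adds a SIMD fast path inside them. A minimal usage sketch (not part of the commit; it assumes nothing beyond the signatures visible in the hunks below):

const std = @import("std");

test "usage sketch: indexOfScalarPos and indexOfSentinel" {
    // indexOfScalarPos: scan a slice for a scalar, starting at a given index.
    const data = [_]u8{ 1, 2, 3, 2, 1 };
    try std.testing.expectEqual(@as(?usize, 3), std.mem.indexOfScalarPos(u8, &data, 2, 2));

    // indexOfSentinel: length of a sentinel-terminated pointer, e.g. a C string.
    const c_str: [*:0]const u8 = "hello";
    try std.testing.expectEqual(@as(usize, 5), std.mem.indexOfSentinel(u8, 0, c_str));
}

Both calls go through the new vectorized paths whenever the element type is an integer or float with a power-of-two bit size and the backend supports vectors; otherwise the scalar loops kept in the diff are used.
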
@@ -953,14 +953,105 @@ test "len" {
     try testing.expect(len(c_ptr) == 2);
 }
 
-pub fn indexOfSentinel(comptime Elem: type, comptime sentinel: Elem, ptr: [*:sentinel]const Elem) usize {
-    var i: usize = 0;
-    while (ptr[i] != sentinel) {
-        i += 1;
-    }
-    return i;
-}
+const backend_supports_vectors = switch (builtin.zig_backend) {
+    .stage2_llvm, .stage2_c => true,
+    else => false,
+};
+
+pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]const T) usize {
+    var i: usize = 0;
+
+    if (backend_supports_vectors and
+        !@inComptime() and
+        (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T)))
+    {
+        switch (@import("builtin").cpu.arch) {
+            // The below branch assumes that reading past the end of the buffer is valid, as long
+            // as we don't read into a new page. This should be the case for most architectures
+            // which use paged memory, however should be confirmed before adding a new arch below.
+            .aarch64, .x86, .x86_64 => if (comptime std.simd.suggestVectorSize(T)) |block_len| {
+                comptime std.debug.assert(std.mem.page_size % block_len == 0);
+                const Block = @Vector(block_len, T);
+                const mask: Block = @splat(sentinel);
+
+                // First block may be unaligned
+                const start_addr = @intFromPtr(&p[i]);
+                const offset_in_page = start_addr & (std.mem.page_size - 1);
+                if (offset_in_page < std.mem.page_size - block_len) {
+                    // Will not read past the end of a page, full block.
+                    const block: Block = p[i..][0..block_len].*;
+                    const matches = block == mask;
+                    if (@reduce(.Or, matches)) {
+                        return i + std.simd.firstTrue(matches).?;
+                    }
+
+                    i += (std.mem.alignForward(usize, start_addr, @alignOf(Block)) - start_addr) / @sizeOf(T);
+                } else {
+                    // Would read over a page boundary. Per-byte at a time until aligned or found.
+                    // 0.39% chance this branch is taken for 4K pages at 16b block length.
+                    //
+                    // An alternate strategy is to do read a full block (the last in the page) and
+                    // mask the entries before the pointer.
+                    while ((@intFromPtr(&p[i]) & (@alignOf(Block) - 1)) != 0) : (i += 1) {
+                        if (p[i] == sentinel) return i;
+                    }
+                }
+
+                std.debug.assert(std.mem.isAligned(@intFromPtr(&p[i]), @alignOf(Block)));
+                while (true) {
+                    const block: *const Block = @ptrCast(@alignCast(p[i..][0..block_len]));
+                    const matches = block.* == mask;
+                    if (@reduce(.Or, matches)) {
+                        return i + std.simd.firstTrue(matches).?;
+                    }
+                    i += block_len;
+                }
+            },
+            else => {},
+        }
+    }
+
+    while (p[i] != sentinel) {
+        i += 1;
+    }
+    return i;
+}
+
+test "indexOfSentinel vector paths" {
+    const Types = [_]type{ u8, u16, u32, u64 };
+    const allocator = std.testing.allocator;
+
+    inline for (Types) |T| {
+        const block_len = comptime std.simd.suggestVectorSize(T) orelse continue;
+
+        // Allocate three pages so we guarantee a page-crossing address with a full page after
+        const memory = try allocator.alloc(T, 3 * std.mem.page_size / @sizeOf(T));
+        defer allocator.free(memory);
+        @memset(memory, 0xaa);
+
+        // Find starting page-alignment = 0
+        var start: usize = 0;
+        const start_addr = @intFromPtr(&memory);
+        start += (std.mem.alignForward(usize, start_addr, std.mem.page_size) - start_addr) / @sizeOf(T);
+        try testing.expect(start < std.mem.page_size / @sizeOf(T));
+
+        // Validate all sub-block alignments
+        const search_len = std.mem.page_size / @sizeOf(T);
+        memory[start + search_len] = 0;
+        for (0..block_len) |offset| {
+            try testing.expectEqual(search_len - offset, indexOfSentinel(T, 0, @ptrCast(&memory[start + offset])));
+        }
+        memory[start + search_len] = 0xaa;
+
+        // Validate page boundary crossing
+        const start_page_boundary = start + (std.mem.page_size / @sizeOf(T));
+        memory[start_page_boundary + block_len] = 0;
+        for (0..block_len) |offset| {
+            try testing.expectEqual(2 * block_len - offset, indexOfSentinel(T, 0, @ptrCast(&memory[start_page_boundary - block_len + offset])));
+        }
+    }
+}
 
 /// Returns true if all elements in a slice are equal to the scalar value provided
 pub fn allEqual(comptime T: type, slice: []const T, scalar: T) bool {
     for (slice) |item| {
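
The fast path in indexOfSentinel above hinges on one safety argument: a full vector load may read past the sentinel, which is only acceptable while the load stays inside the current page. A standalone sketch of that page-offset arithmetic, with hypothetical numbers (4 KiB pages and a 16-byte block, matching the "0.39%" comment; `page_size`, `block_size`, and `addr` below are local stand-ins, not values the function computes):

const std = @import("std");

test "sketch: page-crossing check behind the indexOfSentinel fast path" {
    const page_size: usize = 4096; // assumed 4K pages, as in the 0.39% comment
    const block_size: usize = 16; // bytes read by one 128-bit vector load

    // An unaligned load of `block_size` bytes at `addr` stays inside one page
    // exactly when the offset within the page leaves room for the whole block.
    const addr: usize = 0x1000_0FF4; // hypothetical load address near the end of a page
    const offset_in_page = addr & (page_size - 1);
    try std.testing.expect(offset_in_page > page_size - block_size); // would cross: take the scalar prologue

    // Only the last `block_size` offsets of a page can cross, hence 16 / 4096 = 0.39%.
    const crossing_fraction = @as(f64, @floatFromInt(block_size)) / @as(f64, @floatFromInt(page_size));
    try std.testing.expect(crossing_fraction < 0.004);
}

The diff expresses the same bound in element units (`block_len`) rather than bytes; for u8, the element type the comment's 16-byte figure refers to, the two are identical.
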
@@ -1016,12 +1107,79 @@ pub fn lastIndexOfScalar(comptime T: type, slice: []const T, value: T) ?usize {
 
 pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize, value: T) ?usize {
     if (start_index >= slice.len) return null;
-    for (slice[start_index..], start_index..) |c, i| {
-        if (c == value) return i;
+
+    var i: usize = start_index;
+    if (backend_supports_vectors and
+        !@inComptime() and
+        (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T)))
+    {
+        if (comptime std.simd.suggestVectorSize(T)) |block_len| {
+            // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
+            // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
+            //
+            // Use `comptime std.simd.suggestVectorSize(T)` to get the same alignment as used in this function
+            // however this usually isn't necessary unless your arch has a performance penalty due to this.
+            //
+            // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
+            // line so explicit alignment prologues may be worth exploration.
+
+            // Unrolling here is ~10% improvement. We can then do one bounds check every 2 blocks
+            // instead of one which adds up.
+            const Block = @Vector(block_len, T);
+            if (i + 2 * block_len < slice.len) {
+                const mask: Block = @splat(value);
+                while (true) {
+                    inline for (0..2) |_| {
+                        const block: Block = slice[i..][0..block_len].*;
+                        const matches = block == mask;
+                        if (@reduce(.Or, matches)) {
+                            return i + std.simd.firstTrue(matches).?;
+                        }
+                        i += block_len;
+                    }
+                    if (i + 2 * block_len >= slice.len) break;
+                }
+            }
+
+            // {block_len, block_len / 2} check
+            inline for (0..2) |j| {
+                const block_x_len = block_len / (1 << j);
+                comptime if (block_x_len < 4) break;
+
+                const BlockX = @Vector(block_x_len, T);
+                if (i + block_x_len < slice.len) {
+                    const mask: BlockX = @splat(value);
+                    const block: BlockX = slice[i..][0..block_x_len].*;
+                    const matches = block == mask;
+                    if (@reduce(.Or, matches)) {
+                        return i + std.simd.firstTrue(matches).?;
+                    }
+                    i += block_x_len;
+                }
+            }
+        }
+    }
+
+    for (slice[i..], i..) |c, j| {
+        if (c == value) return j;
     }
     return null;
 }
 
+test "indexOfScalarPos" {
+    const Types = [_]type{ u8, u16, u32, u64 };
+
+    inline for (Types) |T| {
+        var memory: [64 / @sizeOf(T)]T = undefined;
+        @memset(&memory, 0xaa);
+        memory[memory.len - 1] = 0;
+
+        for (0..memory.len) |i| {
+            try testing.expectEqual(memory.len - i - 1, indexOfScalarPos(T, memory[i..], 0, 0).?);
+        }
+    }
+}
+
 pub fn indexOfAny(comptime T: type, slice: []const T, values: []const T) ?usize {
     return indexOfAnyPos(T, slice, 0, values);
 }
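
Both hunks rely on the same three-step vector idiom: splat the needle into every lane, compare a whole block at once, then reduce the lane-wise booleans to decide whether anything matched. A self-contained sketch of just that idiom, using a fixed 8-lane block instead of `std.simd.suggestVectorSize` (the suggested length is target-dependent and the helper may go by a different name in later std versions):

const std = @import("std");

test "sketch: match-a-block vector idiom" {
    const block_len = 8; // fixed lane count for the sketch; the PR derives this from the target
    const Block = @Vector(block_len, u8);

    const haystack = [block_len]u8{ 9, 9, 9, 7, 9, 7, 9, 9 };
    const needle: u8 = 7;

    const block: Block = haystack; // a fixed-size array coerces to the vector type
    const mask: Block = @splat(needle); // every lane holds the needle
    const matches = block == mask; // element-wise compare yields @Vector(block_len, bool)

    // Any lane set? If so, the index of the first set lane is the match position.
    try std.testing.expect(@reduce(.Or, matches));
    const first = std.simd.firstTrue(matches).?; // lane index of the first match
    try std.testing.expectEqual(@as(usize, 3), @as(usize, first));
}

In the committed code the same pattern runs over target-sized blocks, unrolled two at a time in indexOfScalarPos so one bounds check covers two blocks.
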