mirror of https://github.com/ziglang/zig.git, synced 2025-12-29 09:33:18 +00:00
Merge pull request #18318 from castholm/simd-segfault
Rename `simd.suggestVectorSize` to clarify intent and fix related segfault
This commit is contained in:
commit 6a32d58876
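For callers, the rename is mechanical: the old names are kept as `@compileError` declarations pointing at the replacements (see the std.simd hunks below), so stale call sites fail loudly instead of silently changing meaning. A minimal sketch of the renamed query as it is used throughout this diff; the surrounding constants are illustrative, not lines from the change:

    const std = @import("std");

    // Number of lanes (element count) suggested for u8 on the current target,
    // or 1 when no vector length is suggested at all.
    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
    const Chunk = @Vector(chunk_len, u8);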
@@ -158,11 +158,7 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
         /// clmulSoft128_64 is faster on platforms with no native 128-bit registers.
         const clmulSoft = switch (builtin.cpu.arch) {
             .wasm32, .wasm64 => clmulSoft128_64,
-            else => impl: {
-                const vector_size = std.simd.suggestVectorSize(u128) orelse 0;
-                if (vector_size < 128) break :impl clmulSoft128_64;
-                break :impl clmulSoft128;
-            },
+            else => if (std.simd.suggestVectorLength(u128) != null) clmulSoft128 else clmulSoft128_64,
         };

         // Software carryless multiplication of two 64-bit integers using native 128-bit registers.
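The old block read the result as a width in bits (`vector_size < 128`), but the function reports a count of u128 elements, so the comparison was effectively always true and the wide path was effectively never chosen; the renamed function makes that distinction explicit, and the replacement one-liner only asks whether any vector length is suggested for u128 at all. A sketch of the query this now hinges on; the concrete lane counts mentioned are assumptions, not guarantees:

    const std = @import("std");

    // For u128 this might be 2 on a target with 256-bit vector registers and null
    // where no vector of u128 elements is worthwhile; only non-null picks clmulSoft128.
    const u128_lanes = std.simd.suggestVectorLength(u128);
    const use_wide_path = u128_lanes != null;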
@@ -84,7 +84,7 @@ pub const HeadersParser = struct {
     /// If the amount returned is less than `bytes.len`, you may assume that the parser is in a content state and the
     /// first byte of content is located at `bytes[result]`.
     pub fn findHeadersEnd(r: *HeadersParser, bytes: []const u8) u32 {
-        const vector_len: comptime_int = @max(std.simd.suggestVectorSize(u8) orelse 1, 8);
+        const vector_len: comptime_int = @max(std.simd.suggestVectorLength(u8) orelse 1, 8);
         const len: u32 = @intCast(bytes.len);
         var index: u32 = 0;

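The parser wants at least 8 lanes no matter what the target reports: `orelse 1` covers targets with no suggestion, and `@max(..., 8)` lifts that to 8. A minimal restatement of the clamp; the example lane counts are illustrative:

    const std = @import("std");

    // Scalar-only target: null -> 1 -> @max(1, 8) == 8.
    // SIMD target: the suggested lane count for u8 (say 16 or 32) passes through unchanged.
    const vector_len: comptime_int = @max(std.simd.suggestVectorLength(u8) orelse 1, 8);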
@@ -1032,15 +1032,16 @@ pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]co
             // The below branch assumes that reading past the end of the buffer is valid, as long
             // as we don't read into a new page. This should be the case for most architectures
             // which use paged memory, however should be confirmed before adding a new arch below.
-            .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorSize(T)) |block_len| {
-                comptime std.debug.assert(std.mem.page_size % block_len == 0);
+            .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorLength(T)) |block_len| {
                 const Block = @Vector(block_len, T);
                 const mask: Block = @splat(sentinel);

+                comptime std.debug.assert(std.mem.page_size % @sizeOf(Block) == 0);
+
                 // First block may be unaligned
                 const start_addr = @intFromPtr(&p[i]);
                 const offset_in_page = start_addr & (std.mem.page_size - 1);
-                if (offset_in_page < std.mem.page_size - block_len) {
+                if (offset_in_page <= std.mem.page_size - @sizeOf(Block)) {
                     // Will not read past the end of a page, full block.
                     const block: Block = p[i..][0..block_len].*;
                     const matches = block == mask;
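This hunk is the segfault fix: `block_len` counts elements, not bytes, so for any T wider than u8 the old `offset_in_page < page_size - block_len` test could admit a full vector load that runs past the end of the page. Worked numbers, assuming 4096-byte pages and a 32-byte block (8 lanes of u32):

    old check: offset_in_page <  4096 - 8   => offsets up to 4087 take the full-block path
    new check: offset_in_page <= 4096 - 32  => offsets up to 4064 take the full-block path

    At offset 4080 the old check passes, yet the 32-byte load touches bytes 4080..4111
    and crosses into the next page, which may be unmapped.

Measuring the block in bytes via `@sizeOf(Block)` and using `<=` keeps the load inside the current page, and the relocated assert requires the page size to be divisible by the block's byte size for the same reason.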
@@ -1085,7 +1086,7 @@ test "indexOfSentinel vector paths" {
     const allocator = std.testing.allocator;

     inline for (Types) |T| {
-        const block_len = std.simd.suggestVectorSize(T) orelse continue;
+        const block_len = std.simd.suggestVectorLength(T) orelse continue;

         // Allocate three pages so we guarantee a page-crossing address with a full page after
         const memory = try allocator.alloc(T, 3 * std.mem.page_size / @sizeOf(T));
@@ -1176,11 +1177,11 @@ pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize,
         !@inComptime() and
         (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T)))
     {
-        if (std.simd.suggestVectorSize(T)) |block_len| {
+        if (std.simd.suggestVectorLength(T)) |block_len| {
             // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
             // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
             //
-            // Use `std.simd.suggestVectorSize(T)` to get the same alignment as used in this function
+            // Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function
             // however this usually isn't necessary unless your arch has a performance penalty due to this.
             //
             // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
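The comment's advice amounts to giving your data the same alignment as the vector blocks this function loads. A small sketch of what a caller could do; the element type and buffer size are arbitrary examples, and this is rarely needed unless the target penalizes unaligned loads:

    const std = @import("std");

    const block_len = std.simd.suggestVectorLength(u32) orelse 1;
    // Storage aligned like the vector type the search loads, so even the first
    // block load is aligned.
    var data: [4096]u32 align(@alignOf(@Vector(block_len, u32))) = undefined;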
@@ -6,7 +6,9 @@
 const std = @import("std");
 const builtin = @import("builtin");

-pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
+pub const suggestVectorSizeForCpu = @compileError("deprecated; use 'suggestVectorLengthForCpu'");
+
+pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
     // This is guesswork, if you have better suggestions can add it or edit the current here
     // This can run in comptime only, but stage 1 fails at it, stage 2 can understand it
     const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
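Keeping the old name alive as a `@compileError` constant is the deprecation pattern used throughout this change: the declaration still resolves, so the diagnostic can point at the replacement instead of reporting an unknown identifier. Illustrative effect at a call site (not a line from this diff):

    // const n = std.simd.suggestVectorSizeForCpu(u8, builtin.cpu);
    // => error: deprecated; use 'suggestVectorLengthForCpu'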
@@ -53,24 +55,26 @@ pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?
     return @divExact(vector_bit_size, element_bit_size);
 }

-/// Suggests a target-dependant vector size for a given type, or null if scalars are recommended.
+pub const suggestVectorSize = @compileError("deprecated; use 'suggestVectorLength'");
+
+/// Suggests a target-dependant vector length for a given type, or null if scalars are recommended.
 /// Not yet implemented for every CPU architecture.
-pub fn suggestVectorSize(comptime T: type) ?comptime_int {
-    return suggestVectorSizeForCpu(T, builtin.cpu);
+pub fn suggestVectorLength(comptime T: type) ?comptime_int {
+    return suggestVectorLengthForCpu(T, builtin.cpu);
 }

-test "suggestVectorSizeForCpu works with signed and unsigned values" {
+test "suggestVectorLengthForCpu works with signed and unsigned values" {
     comptime var cpu = std.Target.Cpu.baseline(std.Target.Cpu.Arch.x86_64);
     comptime cpu.features.addFeature(@intFromEnum(std.Target.x86.Feature.avx512f));
     comptime cpu.features.populateDependencies(&std.Target.x86.all_features);
-    const expected_size: usize = switch (builtin.zig_backend) {
+    const expected_len: usize = switch (builtin.zig_backend) {
         .stage2_x86_64 => 8,
         else => 16,
     };
-    const signed_integer_size = suggestVectorSizeForCpu(i32, cpu).?;
-    const unsigned_integer_size = suggestVectorSizeForCpu(u32, cpu).?;
-    try std.testing.expectEqual(expected_size, unsigned_integer_size);
-    try std.testing.expectEqual(expected_size, signed_integer_size);
+    const signed_integer_len = suggestVectorLengthForCpu(i32, cpu).?;
+    const unsigned_integer_len = suggestVectorLengthForCpu(u32, cpu).?;
+    try std.testing.expectEqual(expected_len, unsigned_integer_len);
+    try std.testing.expectEqual(expected_len, signed_integer_len);
 }

 fn vectorLength(comptime VectorType: type) comptime_int {
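The expected values in the updated test follow from lanes = vector bit width / element bit width: with avx512f the query assumes 512-bit vectors, so i32/u32 get 512 / 32 = 16 lanes, while the stage2_x86_64 case expects 8, consistent with a 256-bit width (256 / 32 = 8), presumably because that backend does not yet use 512-bit vectors. A trivial check of that arithmetic, under exactly those assumed widths:

    const std = @import("std");

    comptime {
        std.debug.assert(512 / 32 == 16); // avx512f expectation in the test above
        std.debug.assert(256 / 32 == 8); // stage2_x86_64 expectation
    }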
@@ -232,7 +236,7 @@ test "vector patterns" {
         }
     }

-/// Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the size of a and b.
+/// Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the length of a and b.
 pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a, b))) @TypeOf(a, b) {
     const len = vectorLength(@TypeOf(a, b));

@@ -240,7 +244,7 @@ pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a,
 }

 /// Elements are shifted rightwards (towards higher indices). New elements are added to the left, and the rightmost elements are cut off
-/// so that the size of the vector stays the same.
+/// so that the length of the vector stays the same.
 pub fn shiftElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
     // It may be possible to implement shifts and rotates with a runtime-friendly slice of two joined vectors, as the length of the
     // slice would be comptime-known. This would permit vector shifts and rotates by a non-comptime-known amount.
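A small usage sketch of `shiftElementsRight` matching the reworded doc comment; the values are illustrative:

    const std = @import("std");

    test "shiftElementsRight example" {
        const v: @Vector(4, u8) = .{ 1, 2, 3, 4 };
        // Every element moves one index higher; index 0 is filled with the
        // shift_in value (0 here) and the old last element falls off the end.
        const shifted = std.simd.shiftElementsRight(v, 1, 0);
        try std.testing.expect(@reduce(.And, shifted == @Vector(4, u8){ 0, 1, 2, 3 }));
    }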
@@ -202,7 +202,7 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {
 pub fn utf8ValidateSlice(input: []const u8) bool {
     var remaining = input;

-    const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
     const Chunk = @Vector(chunk_len, u8);

     // Fast path. Check for and skip ASCII characters at the start of the input.
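All of the unicode hunks touch the same pattern: consume the input one vector-sized chunk at a time while it is pure ASCII, then fall back to the scalar decoder. A simplified sketch of that fast path, not the exact loop from the file, with a made-up helper name:

    const std = @import("std");

    fn skipAscii(input: []const u8) []const u8 {
        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
        const Chunk = @Vector(chunk_len, u8);
        var remaining = input;
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const high_bits: Chunk = @splat(0x80);
            // Any byte with its high bit set is not ASCII; stop and let the
            // scalar path handle the rest.
            if (@reduce(.Or, (chunk & high_bits) == high_bits)) break;
            remaining = remaining[chunk_len..];
        }
        return remaining;
    }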
@@ -758,7 +758,7 @@ pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8

     var remaining = utf16le;
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -801,7 +801,7 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]

     var remaining = utf16le;
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -842,7 +842,7 @@ pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {

     var remaining = utf16le;
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -941,7 +941,7 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1
     var remaining = utf8;
     // Need support for std.simd.interlace
     if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
         const Chunk = @Vector(chunk_len, u8);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -986,7 +986,7 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
     var remaining = utf8;
     // Need support for std.simd.interlace
     if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
         const Chunk = @Vector(chunk_len, u8);

         // Fast path. Check for and encode ASCII characters at the start of the input.