mirror of https://github.com/ziglang/zig.git, synced 2025-12-29 09:33:18 +00:00
Merge pull request #18318 from castholm/simd-segfault
Rename `simd.suggestVectorSize` to clarify intent and fix related segfault
This commit is contained in:
commit 6a32d58876
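For callers, the rename is mechanical: the old names are kept as `@compileError` declarations pointing at the replacements (see the std.simd hunks below), so stale call sites fail loudly instead of silently changing meaning. A minimal sketch of the renamed query as it is used throughout this diff; the surrounding constants are illustrative, not lines from the change:

    const std = @import("std");

    // Number of lanes (element count) suggested for u8 on the current target,
    // or 1 when no vector length is suggested at all.
    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
    const Chunk = @Vector(chunk_len, u8);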
@@ -158,11 +158,7 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
         /// clmulSoft128_64 is faster on platforms with no native 128-bit registers.
         const clmulSoft = switch (builtin.cpu.arch) {
             .wasm32, .wasm64 => clmulSoft128_64,
-            else => impl: {
-                const vector_size = std.simd.suggestVectorSize(u128) orelse 0;
-                if (vector_size < 128) break :impl clmulSoft128_64;
-                break :impl clmulSoft128;
-            },
+            else => if (std.simd.suggestVectorLength(u128) != null) clmulSoft128 else clmulSoft128_64,
         };

         // Software carryless multiplication of two 64-bit integers using native 128-bit registers.
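The old block read the result as a width in bits (`vector_size < 128`), but the function reports a count of u128 elements, so the comparison was effectively always true and the wide path was effectively never chosen; the renamed function makes that distinction explicit, and the replacement one-liner only asks whether any vector length is suggested for u128 at all. A sketch of the query this now hinges on; the concrete lane counts mentioned are assumptions, not guarantees:

    const std = @import("std");

    // For u128 this might be 2 on a target with 256-bit vector registers and null
    // where no vector of u128 elements is worthwhile; only non-null picks clmulSoft128.
    const u128_lanes = std.simd.suggestVectorLength(u128);
    const use_wide_path = u128_lanes != null;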
@@ -84,7 +84,7 @@ pub const HeadersParser = struct {
     /// If the amount returned is less than `bytes.len`, you may assume that the parser is in a content state and the
     /// first byte of content is located at `bytes[result]`.
     pub fn findHeadersEnd(r: *HeadersParser, bytes: []const u8) u32 {
-        const vector_len: comptime_int = @max(std.simd.suggestVectorSize(u8) orelse 1, 8);
+        const vector_len: comptime_int = @max(std.simd.suggestVectorLength(u8) orelse 1, 8);
         const len: u32 = @intCast(bytes.len);
         var index: u32 = 0;

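The parser wants at least 8 lanes no matter what the target reports: `orelse 1` covers targets with no suggestion, and `@max(..., 8)` lifts that to 8. A minimal restatement of the clamp; the example lane counts are illustrative:

    const std = @import("std");

    // Scalar-only target: null -> 1 -> @max(1, 8) == 8.
    // SIMD target: the suggested lane count for u8 (say 16 or 32) passes through unchanged.
    const vector_len: comptime_int = @max(std.simd.suggestVectorLength(u8) orelse 1, 8);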
@@ -1032,15 +1032,16 @@ pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]co
             // The below branch assumes that reading past the end of the buffer is valid, as long
             // as we don't read into a new page. This should be the case for most architectures
             // which use paged memory, however should be confirmed before adding a new arch below.
-            .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorSize(T)) |block_len| {
-                comptime std.debug.assert(std.mem.page_size % block_len == 0);
+            .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorLength(T)) |block_len| {
                 const Block = @Vector(block_len, T);
                 const mask: Block = @splat(sentinel);

+                comptime std.debug.assert(std.mem.page_size % @sizeOf(Block) == 0);
+
                 // First block may be unaligned
                 const start_addr = @intFromPtr(&p[i]);
                 const offset_in_page = start_addr & (std.mem.page_size - 1);
-                if (offset_in_page < std.mem.page_size - block_len) {
+                if (offset_in_page <= std.mem.page_size - @sizeOf(Block)) {
                     // Will not read past the end of a page, full block.
                     const block: Block = p[i..][0..block_len].*;
                     const matches = block == mask;
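This hunk is the segfault fix: `block_len` counts elements, not bytes, so for any T wider than u8 the old `offset_in_page < page_size - block_len` test could admit a full vector load that runs past the end of the page. Worked numbers, assuming 4096-byte pages and a 32-byte block (8 lanes of u32):

    old check: offset_in_page <  4096 - 8   => offsets up to 4087 take the full-block path
    new check: offset_in_page <= 4096 - 32  => offsets up to 4064 take the full-block path

    At offset 4080 the old check passes, yet the 32-byte load touches bytes 4080..4111
    and crosses into the next page, which may be unmapped.

Measuring the block in bytes via `@sizeOf(Block)` and using `<=` keeps the load inside the current page, and the relocated assert requires the page size to be divisible by the block's byte size for the same reason.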
@@ -1085,7 +1086,7 @@ test "indexOfSentinel vector paths" {
     const allocator = std.testing.allocator;

     inline for (Types) |T| {
-        const block_len = std.simd.suggestVectorSize(T) orelse continue;
+        const block_len = std.simd.suggestVectorLength(T) orelse continue;

         // Allocate three pages so we guarantee a page-crossing address with a full page after
         const memory = try allocator.alloc(T, 3 * std.mem.page_size / @sizeOf(T));
@@ -1176,11 +1177,11 @@ pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize,
         !@inComptime() and
         (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T)))
     {
-        if (std.simd.suggestVectorSize(T)) |block_len| {
+        if (std.simd.suggestVectorLength(T)) |block_len| {
             // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
             // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
             //
-            // Use `std.simd.suggestVectorSize(T)` to get the same alignment as used in this function
+            // Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function
             // however this usually isn't necessary unless your arch has a performance penalty due to this.
             //
             // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
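The comment's advice amounts to giving your data the same alignment as the vector blocks this function loads. A small sketch of what a caller could do; the element type and buffer size are arbitrary examples, and this is rarely needed unless the target penalizes unaligned loads:

    const std = @import("std");

    const block_len = std.simd.suggestVectorLength(u32) orelse 1;
    // Storage aligned like the vector type the search loads, so even the first
    // block load is aligned.
    var data: [4096]u32 align(@alignOf(@Vector(block_len, u32))) = undefined;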
@@ -6,7 +6,9 @@
 const std = @import("std");
 const builtin = @import("builtin");

-pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
+pub const suggestVectorSizeForCpu = @compileError("deprecated; use 'suggestVectorLengthForCpu'");
+
+pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
     // This is guesswork, if you have better suggestions can add it or edit the current here
     // This can run in comptime only, but stage 1 fails at it, stage 2 can understand it
     const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
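Keeping the old name alive as a `@compileError` constant is the deprecation pattern used throughout this change: the declaration still resolves, so the diagnostic can point at the replacement instead of reporting an unknown identifier. Illustrative effect at a call site (not a line from this diff):

    // const n = std.simd.suggestVectorSizeForCpu(u8, builtin.cpu);
    // => error: deprecated; use 'suggestVectorLengthForCpu'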
@@ -53,24 +55,26 @@ pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?
     return @divExact(vector_bit_size, element_bit_size);
 }

-/// Suggests a target-dependant vector size for a given type, or null if scalars are recommended.
+pub const suggestVectorSize = @compileError("deprecated; use 'suggestVectorLength'");
+
+/// Suggests a target-dependant vector length for a given type, or null if scalars are recommended.
 /// Not yet implemented for every CPU architecture.
-pub fn suggestVectorSize(comptime T: type) ?comptime_int {
-    return suggestVectorSizeForCpu(T, builtin.cpu);
+pub fn suggestVectorLength(comptime T: type) ?comptime_int {
+    return suggestVectorLengthForCpu(T, builtin.cpu);
 }

-test "suggestVectorSizeForCpu works with signed and unsigned values" {
+test "suggestVectorLengthForCpu works with signed and unsigned values" {
     comptime var cpu = std.Target.Cpu.baseline(std.Target.Cpu.Arch.x86_64);
     comptime cpu.features.addFeature(@intFromEnum(std.Target.x86.Feature.avx512f));
     comptime cpu.features.populateDependencies(&std.Target.x86.all_features);
-    const expected_size: usize = switch (builtin.zig_backend) {
+    const expected_len: usize = switch (builtin.zig_backend) {
         .stage2_x86_64 => 8,
         else => 16,
     };
-    const signed_integer_size = suggestVectorSizeForCpu(i32, cpu).?;
-    const unsigned_integer_size = suggestVectorSizeForCpu(u32, cpu).?;
-    try std.testing.expectEqual(expected_size, unsigned_integer_size);
-    try std.testing.expectEqual(expected_size, signed_integer_size);
+    const signed_integer_len = suggestVectorLengthForCpu(i32, cpu).?;
+    const unsigned_integer_len = suggestVectorLengthForCpu(u32, cpu).?;
+    try std.testing.expectEqual(expected_len, unsigned_integer_len);
+    try std.testing.expectEqual(expected_len, signed_integer_len);
 }

 fn vectorLength(comptime VectorType: type) comptime_int {
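The expected values in the updated test follow from lanes = vector bit width / element bit width: with avx512f the query assumes 512-bit vectors, so i32/u32 get 512 / 32 = 16 lanes, while the stage2_x86_64 case expects 8, consistent with a 256-bit width (256 / 32 = 8), presumably because that backend does not yet use 512-bit vectors. A trivial check of that arithmetic, under exactly those assumed widths:

    const std = @import("std");

    comptime {
        std.debug.assert(512 / 32 == 16); // avx512f expectation in the test above
        std.debug.assert(256 / 32 == 8); // stage2_x86_64 expectation
    }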
@@ -232,7 +236,7 @@ test "vector patterns" {
         }
     }

-/// Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the size of a and b.
+/// Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the length of a and b.
 pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a, b))) @TypeOf(a, b) {
     const len = vectorLength(@TypeOf(a, b));

@@ -240,7 +244,7 @@ pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a,
 }

 /// Elements are shifted rightwards (towards higher indices). New elements are added to the left, and the rightmost elements are cut off
-/// so that the size of the vector stays the same.
+/// so that the length of the vector stays the same.
 pub fn shiftElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
     // It may be possible to implement shifts and rotates with a runtime-friendly slice of two joined vectors, as the length of the
     // slice would be comptime-known. This would permit vector shifts and rotates by a non-comptime-known amount.
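A small usage sketch of `shiftElementsRight` matching the reworded doc comment; the values are illustrative:

    const std = @import("std");

    test "shiftElementsRight example" {
        const v: @Vector(4, u8) = .{ 1, 2, 3, 4 };
        // Every element moves one index higher; index 0 is filled with the
        // shift_in value (0 here) and the old last element falls off the end.
        const shifted = std.simd.shiftElementsRight(v, 1, 0);
        try std.testing.expect(@reduce(.And, shifted == @Vector(4, u8){ 0, 1, 2, 3 }));
    }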
@@ -202,7 +202,7 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {
 pub fn utf8ValidateSlice(input: []const u8) bool {
     var remaining = input;

-    const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
     const Chunk = @Vector(chunk_len, u8);

     // Fast path. Check for and skip ASCII characters at the start of the input.
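All of the unicode hunks touch the same pattern: consume the input one vector-sized chunk at a time while it is pure ASCII, then fall back to the scalar decoder. A simplified sketch of that fast path, not the exact loop from the file, with a made-up helper name:

    const std = @import("std");

    fn skipAscii(input: []const u8) []const u8 {
        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
        const Chunk = @Vector(chunk_len, u8);
        var remaining = input;
        while (remaining.len >= chunk_len) {
            const chunk: Chunk = remaining[0..chunk_len].*;
            const high_bits: Chunk = @splat(0x80);
            // Any byte with its high bit set is not ASCII; stop and let the
            // scalar path handle the rest.
            if (@reduce(.Or, (chunk & high_bits) == high_bits)) break;
            remaining = remaining[chunk_len..];
        }
        return remaining;
    }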
@@ -758,7 +758,7 @@ pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8

     var remaining = utf16le;
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -801,7 +801,7 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]

     var remaining = utf16le;
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -842,7 +842,7 @@ pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {

     var remaining = utf16le;
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -941,7 +941,7 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1
     var remaining = utf8;
     // Need support for std.simd.interlace
     if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
         const Chunk = @Vector(chunk_len, u8);

         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -986,7 +986,7 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
     var remaining = utf8;
     // Need support for std.simd.interlace
     if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
         const Chunk = @Vector(chunk_len, u8);

         // Fast path. Check for and encode ASCII characters at the start of the input.