From 781c3a985c2c6e31c57165c02582aa79c286e431 Mon Sep 17 00:00:00 2001
From: Carl Åstholm
Date: Tue, 19 Dec 2023 21:14:48 +0100
Subject: [PATCH 1/3] Prevent reading over a page boundary in `mem.indexOfSentinel`

The size of the slice element was not correctly taken into account when
determining whether a read would cross a page boundary.
---
 lib/std/mem.zig | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/std/mem.zig b/lib/std/mem.zig
index 8e325e0b8e..a1f23668e5 100644
--- a/lib/std/mem.zig
+++ b/lib/std/mem.zig
@@ -967,14 +967,15 @@ pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]co
             // as we don't read into a new page. This should be the case for most architectures
             // which use paged memory, however should be confirmed before adding a new arch below.
             .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorSize(T)) |block_len| {
-                comptime std.debug.assert(std.mem.page_size % block_len == 0);
                 const Block = @Vector(block_len, T);
                 const mask: Block = @splat(sentinel);
 
+                comptime std.debug.assert(std.mem.page_size % @sizeOf(Block) == 0);
+
                 // First block may be unaligned
                 const start_addr = @intFromPtr(&p[i]);
                 const offset_in_page = start_addr & (std.mem.page_size - 1);
-                if (offset_in_page < std.mem.page_size - block_len) {
+                if (offset_in_page < std.mem.page_size - @sizeOf(Block)) {
                     // Will not read past the end of a page, full block.
                     const block: Block = p[i..][0..block_len].*;
                     const matches = block == mask;

From 59ac0d1eed561483b4d97145eafa7d0763fb8ed8 Mon Sep 17 00:00:00 2001
From: Carl Åstholm
Date: Tue, 19 Dec 2023 22:21:03 +0100
Subject: [PATCH 2/3] Deprecate `suggestVectorSize` in favor of `suggestVectorLength`

The function returns the vector length, not the byte size of the vector
or the bit size of individual elements. This distinction is very
important and some usages of this function in the stdlib operated under
these incorrect assumptions.
---
 lib/std/crypto/ghash_polyval.zig |  6 +-----
 lib/std/http/protocol.zig        |  2 +-
 lib/std/mem.zig                  |  8 ++++----
 lib/std/simd.zig                 | 28 ++++++++++++++++------------
 lib/std/unicode.zig              | 12 ++++++------
 5 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/lib/std/crypto/ghash_polyval.zig b/lib/std/crypto/ghash_polyval.zig
index 6ccbf1f228..b89e4b1c59 100644
--- a/lib/std/crypto/ghash_polyval.zig
+++ b/lib/std/crypto/ghash_polyval.zig
@@ -158,11 +158,7 @@ fn Hash(comptime endian: std.builtin.Endian, comptime shift_key: bool) type {
         /// clmulSoft128_64 is faster on platforms with no native 128-bit registers.
         const clmulSoft = switch (builtin.cpu.arch) {
             .wasm32, .wasm64 => clmulSoft128_64,
-            else => impl: {
-                const vector_size = std.simd.suggestVectorSize(u128) orelse 0;
-                if (vector_size < 128) break :impl clmulSoft128_64;
-                break :impl clmulSoft128;
-            },
+            else => if (std.simd.suggestVectorLength(u128) != null) clmulSoft128 else clmulSoft128_64,
         };
 
         // Software carryless multiplication of two 64-bit integers using native 128-bit registers.
diff --git a/lib/std/http/protocol.zig b/lib/std/http/protocol.zig
index e757095803..227b376ed3 100644
--- a/lib/std/http/protocol.zig
+++ b/lib/std/http/protocol.zig
@@ -84,7 +84,7 @@ pub const HeadersParser = struct {
     /// If the amount returned is less than `bytes.len`, you may assume that the parser is in a content state and the
     /// first byte of content is located at `bytes[result]`.
     pub fn findHeadersEnd(r: *HeadersParser, bytes: []const u8) u32 {
-        const vector_len: comptime_int = @max(std.simd.suggestVectorSize(u8) orelse 1, 8);
+        const vector_len: comptime_int = @max(std.simd.suggestVectorLength(u8) orelse 1, 8);
         const len: u32 = @intCast(bytes.len);
         var index: u32 = 0;
 
diff --git a/lib/std/mem.zig b/lib/std/mem.zig
index a1f23668e5..b3224243c6 100644
--- a/lib/std/mem.zig
+++ b/lib/std/mem.zig
@@ -966,7 +966,7 @@ pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]co
             // The below branch assumes that reading past the end of the buffer is valid, as long
             // as we don't read into a new page. This should be the case for most architectures
             // which use paged memory, however should be confirmed before adding a new arch below.
-            .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorSize(T)) |block_len| {
+            .aarch64, .x86, .x86_64 => if (std.simd.suggestVectorLength(T)) |block_len| {
                 const Block = @Vector(block_len, T);
                 const mask: Block = @splat(sentinel);
 
@@ -1020,7 +1020,7 @@ test "indexOfSentinel vector paths" {
     const allocator = std.testing.allocator;
 
     inline for (Types) |T| {
-        const block_len = std.simd.suggestVectorSize(T) orelse continue;
+        const block_len = std.simd.suggestVectorLength(T) orelse continue;
 
         // Allocate three pages so we guarantee a page-crossing address with a full page after
         const memory = try allocator.alloc(T, 3 * std.mem.page_size / @sizeOf(T));
@@ -1111,11 +1111,11 @@ pub fn indexOfScalarPos(comptime T: type, slice: []const T, start_index: usize,
         !@inComptime() and
         (@typeInfo(T) == .Int or @typeInfo(T) == .Float) and std.math.isPowerOfTwo(@bitSizeOf(T)))
     {
-        if (std.simd.suggestVectorSize(T)) |block_len| {
+        if (std.simd.suggestVectorLength(T)) |block_len| {
             // For Intel Nehalem (2009) and AMD Bulldozer (2012) or later, unaligned loads on aligned data result
             // in the same execution as aligned loads. We ignore older arch's here and don't bother pre-aligning.
             //
-            // Use `std.simd.suggestVectorSize(T)` to get the same alignment as used in this function
+            // Use `std.simd.suggestVectorLength(T)` to get the same alignment as used in this function
             // however this usually isn't necessary unless your arch has a performance penalty due to this.
             //
             // This may differ for other arch's. Arm for example costs a cycle when loading across a cache
diff --git a/lib/std/simd.zig b/lib/std/simd.zig
index 72b71938cb..c3ae2d8dbe 100644
--- a/lib/std/simd.zig
+++ b/lib/std/simd.zig
@@ -6,7 +6,9 @@
 const std = @import("std");
 const builtin = @import("builtin");
 
-pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
+pub const suggestVectorSizeForCpu = @compileError("deprecated; use 'suggestVectorLengthForCpu'");
+
+pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?comptime_int {
     // This is guesswork, if you have better suggestions can add it or edit the current here
     // This can run in comptime only, but stage 1 fails at it, stage 2 can understand it
     const element_bit_size = @max(8, std.math.ceilPowerOfTwo(u16, @bitSizeOf(T)) catch unreachable);
@@ -53,24 +55,26 @@ pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?
     return @divExact(vector_bit_size, element_bit_size);
 }
 
-/// Suggests a target-dependant vector size for a given type, or null if scalars are recommended.
+pub const suggestVectorSize = @compileError("deprecated; use 'suggestVectorLength'");
+
+/// Suggests a target-dependent vector length for a given type, or null if scalars are recommended.
 /// Not yet implemented for every CPU architecture.
-pub fn suggestVectorSize(comptime T: type) ?comptime_int {
-    return suggestVectorSizeForCpu(T, builtin.cpu);
+pub fn suggestVectorLength(comptime T: type) ?comptime_int {
+    return suggestVectorLengthForCpu(T, builtin.cpu);
 }
 
-test "suggestVectorSizeForCpu works with signed and unsigned values" {
+test "suggestVectorLengthForCpu works with signed and unsigned values" {
     comptime var cpu = std.Target.Cpu.baseline(std.Target.Cpu.Arch.x86_64);
     comptime cpu.features.addFeature(@intFromEnum(std.Target.x86.Feature.avx512f));
     comptime cpu.features.populateDependencies(&std.Target.x86.all_features);
-    const expected_size: usize = switch (builtin.zig_backend) {
+    const expected_len: usize = switch (builtin.zig_backend) {
         .stage2_x86_64 => 8,
         else => 16,
     };
-    const signed_integer_size = suggestVectorSizeForCpu(i32, cpu).?;
-    const unsigned_integer_size = suggestVectorSizeForCpu(u32, cpu).?;
-    try std.testing.expectEqual(expected_size, unsigned_integer_size);
-    try std.testing.expectEqual(expected_size, signed_integer_size);
+    const signed_integer_len = suggestVectorLengthForCpu(i32, cpu).?;
+    const unsigned_integer_len = suggestVectorLengthForCpu(u32, cpu).?;
+    try std.testing.expectEqual(expected_len, unsigned_integer_len);
+    try std.testing.expectEqual(expected_len, signed_integer_len);
 }
 
 fn vectorLength(comptime VectorType: type) comptime_int {
@@ -232,7 +236,7 @@ test "vector patterns" {
     }
 }
 
-/// Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the size of a and b.
+/// Joins two vectors, shifts them leftwards (towards lower indices) and extracts the leftmost elements into a vector the length of a and b.
 pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a, b))) @TypeOf(a, b) {
     const len = vectorLength(@TypeOf(a, b));
 
@@ -240,7 +244,7 @@ pub fn mergeShift(a: anytype, b: anytype, comptime shift: VectorCount(@TypeOf(a,
 }
 
 /// Elements are shifted rightwards (towards higher indices). New elements are added to the left, and the rightmost elements are cut off
-/// so that the size of the vector stays the same.
+/// so that the length of the vector stays the same.
 pub fn shiftElementsRight(vec: anytype, comptime amount: VectorCount(@TypeOf(vec)), shift_in: std.meta.Child(@TypeOf(vec))) @TypeOf(vec) {
     // It may be possible to implement shifts and rotates with a runtime-friendly slice of two joined vectors, as the length of the
     // slice would be comptime-known. This would permit vector shifts and rotates by a non-comptime-known amount.
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 1c0c6b32ac..9a8fc7ce29 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -202,7 +202,7 @@ pub fn utf8CountCodepoints(s: []const u8) !usize {
 pub fn utf8ValidateSlice(input: []const u8) bool {
     var remaining = input;
 
-    const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+    const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
     const Chunk = @Vector(chunk_len, u8);
 
     // Fast path. Check for and skip ASCII characters at the start of the input.
@@ -758,7 +758,7 @@ pub fn utf16leToUtf8Alloc(allocator: mem.Allocator, utf16le: []const u16) ![]u8
     var remaining = utf16le;
 
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -801,7 +801,7 @@ pub fn utf16leToUtf8AllocZ(allocator: mem.Allocator, utf16le: []const u16) ![:0]
     var remaining = utf16le;
 
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -842,7 +842,7 @@ pub fn utf16leToUtf8(utf8: []u8, utf16le: []const u16) !usize {
     var remaining = utf16le;
 
     if (builtin.zig_backend != .stage2_x86_64) {
-        const chunk_len = std.simd.suggestVectorSize(u16) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u16) orelse 1;
         const Chunk = @Vector(chunk_len, u16);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -941,7 +941,7 @@ pub fn utf8ToUtf16LeWithNull(allocator: mem.Allocator, utf8: []const u8) ![:0]u1
     var remaining = utf8;
     // Need support for std.simd.interlace
     if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
         const Chunk = @Vector(chunk_len, u8);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.
@@ -986,7 +986,7 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
     var remaining = utf8;
     // Need support for std.simd.interlace
    if (builtin.zig_backend != .stage2_x86_64 and comptime !builtin.cpu.arch.isMIPS()) {
-        const chunk_len = std.simd.suggestVectorSize(u8) orelse 1;
+        const chunk_len = std.simd.suggestVectorLength(u8) orelse 1;
         const Chunk = @Vector(chunk_len, u8);
 
         // Fast path. Check for and encode ASCII characters at the start of the input.

From 564b1da2144a94fd3cf4c66614968d8b669bc26f Mon Sep 17 00:00:00 2001
From: Carl Åstholm
Date: Mon, 1 Jan 2024 16:25:17 +0100
Subject: [PATCH 3/3] Change `<` to `<=` in `indexOfSentinel` SIMD path

---
 lib/std/mem.zig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/std/mem.zig b/lib/std/mem.zig
index b3224243c6..2ed8b57acf 100644
--- a/lib/std/mem.zig
+++ b/lib/std/mem.zig
@@ -975,7 +975,7 @@ pub fn indexOfSentinel(comptime T: type, comptime sentinel: T, p: [*:sentinel]co
                 // First block may be unaligned
                 const start_addr = @intFromPtr(&p[i]);
                 const offset_in_page = start_addr & (std.mem.page_size - 1);
-                if (offset_in_page < std.mem.page_size - @sizeOf(Block)) {
+                if (offset_in_page <= std.mem.page_size - @sizeOf(Block)) {
                     // Will not read past the end of a page, full block.
                     const block: Block = p[i..][0..block_len].*;
                     const matches = block == mask;
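
A standalone sketch (editorial; not part of the patch series): the
page-boundary arithmetic that patches 1 and 3 correct can be checked in
isolation. The Zig test below assumes a 4096-byte page and a 4-element u32
vector purely for illustration; the real values come from std.mem.page_size
and std.simd.suggestVectorLength.

const std = @import("std");

test "a whole-vector read stays in its page iff offset <= page_size - @sizeOf(Block)" {
    const page_size = 4096; // assumed page size
    const block_len = 4; // as if suggestVectorLength(u32) had returned 4
    const Block = @Vector(block_len, u32);

    // `block_len` counts elements, so one Block spans 4 * 4 = 16 bytes.
    try std.testing.expect(@sizeOf(Block) == 16);

    // The pre-patch check compared against the element count, admitting
    // start offsets whose 16-byte read crosses into the next page.
    const bad_offset = 4084; // a 4-aligned offset near the end of the page
    try std.testing.expect(bad_offset < page_size - block_len); // old check passes
    try std.testing.expect(bad_offset + @sizeOf(Block) > page_size); // yet the read crosses

    // Patch 1 compares against the vector's byte size instead; patch 3 then
    // relaxes `<` to `<=`, since a read that ends exactly on the boundary
    // never touches the next page.
    const edge_offset = page_size - @sizeOf(Block); // 4080
    try std.testing.expect(edge_offset + @sizeOf(Block) == page_size); // still in bounds
    try std.testing.expect(!(edge_offset < page_size - @sizeOf(Block))); // `<` rejects it
    try std.testing.expect(edge_offset <= page_size - @sizeOf(Block)); // `<=` accepts it
}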
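
On the rename in patch 2, a second hypothetical sketch: the function returns
a number of elements, and reading that number as a byte or bit size inverts
decisions like the one removed from ghash_polyval.zig, where `vector_size <
128` treated the u128 vector length as a bit width and so effectively never
selected clmulSoft128. The values below assume a target whose preferred
vector register is 128 bits wide; real values vary per CPU.

const std = @import("std");

test "a vector length counts elements, not bytes or bits" {
    const len_u8 = 16; // stand-in for std.simd.suggestVectorLength(u8)
    const len_u128 = 1; // stand-in for std.simd.suggestVectorLength(u128)

    // The same 128-bit register, described by very different lengths.
    try std.testing.expect(@sizeOf(@Vector(len_u8, u8)) == 16);
    try std.testing.expect(@sizeOf(@Vector(len_u128, u128)) == 16);

    // The removed ghash check compared this length against 128, which holds
    // even when a native 128-bit register is available.
    try std.testing.expect(len_u128 < 128);
}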