From 47c932f8960e361282269202dc8ad7e63e3a06eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= <alex@alexrp.com>
Date: Thu, 18 Sep 2025 16:38:57 +0200
Subject: [PATCH] std.simd: suggest more sensible vector sizes across the board

---
 lib/std/simd.zig | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/lib/std/simd.zig b/lib/std/simd.zig
index dace996cdc..25f454ef24 100644
--- a/lib/std/simd.zig
+++ b/lib/std/simd.zig
@@ -23,20 +23,24 @@ pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu)
         } else if (cpu.arch.isArm()) {
             if (cpu.has(.arm, .neon)) break :blk 128;
         } else if (cpu.arch.isAARCH64()) {
-            // SVE allows up to 2048 bits in the specification, as of 2022 the most powerful machine has implemented 512-bit
-            // I think is safer to just be on 128 until is more common
-            // TODO: Check on this return when bigger values are more common
-            if (cpu.has(.aarch64, .sve)) break :blk 128;
+            // SVE vector width is implementation-defined; shipping hardware varies:
+            // NVIDIA Grace supports 128-bit, AWS Graviton3 supports 256-bit,
+            // and Fujitsu A64FX supports 512-bit SVE.
+            // 256-bit seems like a good default for now.
+            if (cpu.has(.aarch64, .sve)) break :blk 256;
             if (cpu.has(.aarch64, .neon)) break :blk 128;
-        } else if (cpu.arch.isPowerPC()) {
-            if (cpu.has(.powerpc, .altivec)) break :blk 128;
+        } else if (cpu.arch == .hexagon) {
+            if (cpu.has(.hexagon, .hvx_length64b)) break :blk 512;
+            if (cpu.has(.hexagon, .hvx)) break :blk 1024;
+        } else if (cpu.arch.isLoongArch()) {
+            if (cpu.has(.loongarch, .lasx)) break :blk 256;
+            if (cpu.has(.loongarch, .lsx)) break :blk 128;
         } else if (cpu.arch.isMIPS()) {
             if (cpu.has(.mips, .msa)) break :blk 128;
-            // TODO: Test MIPS capability to handle bigger vectors
-            //       In theory MDMX and by extension mips3d have 32 registers of 64 bits which can use in parallel
-            //       for multiple processing, but I don't know what's optimal here, if using
-            //       the 2048 bits or using just 64 per vector or something in between
             if (cpu.has(.mips, .mips3d)) break :blk 64;
+        } else if (cpu.arch.isPowerPC()) {
+            if (cpu.has(.powerpc, .vsx)) break :blk 128;
+            if (cpu.has(.powerpc, .altivec)) break :blk 128;
         } else if (cpu.arch.isRISCV()) {
             // In RISC-V Vector Registers are length agnostic so there's no good way to determine the best size.
             // The usual vector length in most RISC-V cpus is 256 bits, however it can get to multiple kB.
@@ -60,12 +64,12 @@ pub fn suggestVectorLengthForCpu(comptime T: type, comptime cpu: std.Target.Cpu)
 
                 break :blk 256;
             }
+        } else if (cpu.arch == .s390x) {
+            if (cpu.has(.s390x, .vector)) break :blk 128;
         } else if (cpu.arch.isSPARC()) {
-            // TODO: Test Sparc capability to handle bigger vectors
-            //       In theory Sparc have 32 registers of 64 bits which can use in parallel
-            //       for multiple processing, but I don't know what's optimal here, if using
-            //       the 2048 bits or using just 64 per vector or something in between
             if (cpu.hasAny(.sparc, &.{ .vis, .vis2, .vis3 })) break :blk 64;
+        } else if (cpu.arch == .ve) {
+            if (cpu.has(.ve, .vpu)) break :blk 2048;
         } else if (cpu.arch.isWasm()) {
             if (cpu.has(.wasm, .simd128)) break :blk 128;
         }