From d1d892c83ca7beaf235147341b7e68d3619dd829 Mon Sep 17 00:00:00 2001
From: Dan Ellis Echavarria <19101das@gmail.com>
Date: Sun, 17 Jul 2022 02:20:35 -0500
Subject: [PATCH] SIMD size suggestions: suggestions code now compiles, added
 more architectures The idea behind this is using the register capabilities in
 safe amounts,  there is still some consideration to be done. + Fixed compile
 error using std.Target.<arch>.featureSetHas + X86 MMX and "3DNOW" 64 bits
 register usage considered for vector size + Added ARM Neon recommened usage
 of 128 bits (The size of the register) + Added AARCH64 Neon and SVE for 128
 bits. SVE could use in theory up to   2048 bits, but found only evidence of
 functional 512 bits on a super        computer, decided on using 128 bits as
 a safety + Added Altivec recommendation of using the 128 bits long register +
 Using MIPS msa 2x64bits capabilities, usage of 64 bits registers for MDMX  
 systems, need testing on how using bigger values affect performance + Using V
 extension on RISC-V, which is extendable via instructions, decided on 128
 bits   as a value to not use all registers + in SPARC the 64 bits registers
 are used, a max of 32 registers   are to be used for configurable simd
 instructions, decided on using the size of the register, need testing on
 performance hit on using a bigger sized register vector size

---
 lib/std/simd.zig | 60 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 19 deletions(-)
diff --git a/lib/std/simd.zig b/lib/std/simd.zig
index 1a71bd7069..b2655758c0 100644
--- a/lib/std/simd.zig
+++ b/lib/std/simd.zig
@@ -6,27 +6,49 @@
 const std = @import("std");
 const builtin = @import("builtin");
 
-pub fn suggestVectorSizeForCpu(comptime T: type, cpu: std.Target.Cpu) ?usize {
-    switch (cpu.arch) {
-        .x86_64 => {
-            // Note: This is mostly just guesswork. It'd be great if someone more qualified were to take a
-            // proper look at this.
-
+pub fn suggestVectorSizeForCpu(comptime T: type, comptime cpu: std.Target.Cpu) ?usize {
+    // This is guesswork, if you have better suggestions can add it or edit the current here
+    // This can run in comptime only, but stage 1 fails at it, stage 2 can understand it
+    const element_bit_size = @maximum(8, std.math.ceilPowerOfTwo(T, @bitSizeOf(T)) catch unreachable);
+    const vector_bit_size: u16 = blk: {
+        if (cpu.arch.isX86()) {
             if (T == bool and std.Target.x86.featureSetHas(.prefer_mask_registers)) return 64;
+            if (std.Target.x86.featureSetHas(cpu.features, .avx512f) and !std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .prefer_128_bit })) break :blk 512;
+            if (std.Target.x86.featureSetHasAny(cpu.features, .{ .prefer_256_bit, .avx2 }) and !std.Target.x86.featureSetHas(cpu.features, .prefer_128_bit)) break :blk 256;
+            if (std.Target.x86.featureSetHas(cpu.features, .sse)) break :blk 128;
+            if (std.Target.x86.featureSetHasAny(cpu.features, .{ .mmx, .@"3dnow" })) break :blk 64;
+        } else if (cpu.arch.isARM()) {
+            if (std.Target.arm.featureSetHas(cpu.features, .neon)) break :blk 128;
+        } else if (cpu.arch.isAARCH64()) {
+            // SVE allows up to 2048 bits in the specification, as of 2022 the most powerful machine has implemented 512-bit
+            // I think is safer to just be on 128 until is more common
+            // TODO: Check on this return when bigger values are more common
+            if (std.Target.aarch64.featureSetHas(cpu.features, .sve)) break :blk 128;
+            if (std.Target.aarch64.featureSetHas(cpu.features, .neon)) break :blk 128;
+        } else if (cpu.arch.isPPC() or cpu.arch.isPPC64()) {
+            if (std.Target.powerpc.featureSetHas(cpu.features, .altivec)) break :blk 128;
+        } else if (cpu.arch.isMIPS()) {
+            if (std.Target.mips.featureSetHas(cpu.features, .msa)) break :blk 128;
+            // TODO: Test MIPS capability to handle bigger vectors
+            //       In theory MDMX and by extension mips3d have 32 registers of 64 bits which can use in parallel
+            //       for multiple processing, but I don't know what's optimal here, if using
+            //       the 2048 bits or using just 64 per vector or something in between
+            if (std.Target.mips.featureSetHas(cpu.features, std.Target.mips.Feature.mips3d)) break :blk 64;
+        } else if (cpu.arch.isRISCV()) {
+            // in risc-v the Vector Extension allows configurable vector sizes, but a standard size of 128 is a safe estimate
+            if (std.Target.riscv.featureSetHas(cpu.features, .v)) break :blk 128;
+        } else if (cpu.arch.isSPARC()) {
+            // TODO: Test Sparc capability to handle bigger vectors
+            //       In theory Sparc have 32 registers of 64 bits which can use in parallel
+            //       for multiple processing, but I don't know what's optimal here, if using
+            //       the 2048 bits or using just 64 per vector or something in between
+            if (std.Target.sparc.featureSetHasAny(cpu.features, .{ .vis, .vis2, .vis3 })) break :blk 64;
+        }
+        return null;
+    };
+    if (vector_bit_size <= element_bit_size) return null;
 
-            const vector_bit_size = blk: {
-                if (std.Target.x86.featureSetHas(.avx512f)) break :blk 512;
-                if (std.Target.x86.featureSetHas(.prefer_256_bit)) break :blk 256;
-                if (std.Target.x86.featureSetHas(.prefer_128_bit)) break :blk 128;
-                return null;
-            };
-            const element_bit_size = std.math.max(8, std.math.ceilPowerOfTwo(T, @bitSizeOf(T)));
-            return @divExact(vector_bit_size, element_bit_size);
-        },
-        else => {
-            return null;
-        },
-    }
+    return @divExact(vector_bit_size, element_bit_size);
 }
 
 /// Suggests a target-dependant vector size for a given type, or null if scalars are recommended.