12 changed files with 502 additions and 4031 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,3 @@
 zig-out
 .zig-cache
 mkdocs.yaml
-zig-pkg
--- a/build.zig
+++ b/build.zig
@ -4,27 +4,20 @@ pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

-    const zig_wgpu = b.dependency("zig_wgpu", .{
-        .target = target,
-        .optimize = optimize,
-    });
-
    // 1. Define the module so other projects can import it
-    const mod = b.addModule("dimal", .{
-        .root_source_file = b.path("src/lib.zig"),
+    _ = b.addModule("dimal", .{
+        .root_source_file = b.path("src/main.zig"),
    });
-    mod.addImport("gpu", zig_wgpu.module("zig-wgpu"));

    const exe_tests = b.addTest(.{
        .root_module = b.createModule(.{
-            .root_source_file = b.path("src/test.zig"),
+            .root_source_file = b.path("src/main.zig"),
            .target = target,
            .optimize = optimize,
        }),
        .test_runner = .{ .path = b.path("test_runner.zig"), .mode = .simple },
    });

-    exe_tests.root_module.addImport("gpu", zig_wgpu.module("zig-wgpu"));
    const run_exe_tests = b.addRunArtifact(exe_tests);
    const test_step = b.step("test", "Run tests");
    test_step.dependOn(&run_exe_tests.step);
@ -37,7 +30,6 @@ pub fn build(b: *std.Build) void {
            .imports = &.{},
        }),
    });
-    bench_exe.root_module.addImport("gpu", zig_wgpu.module("zig-wgpu"));

    b.installArtifact(bench_exe);

--- a/build.zig.zon
+++ b/build.zig.zon
@ -1,14 +1,9 @@
 .{
    .name = .dimal,
-    .version = "0.3.0",
+    .version = "0.2.2",
    .fingerprint = 0x9453b1ff1e52d858,
    .minimum_zig_version = "0.16.0",
-    .dependencies = .{
-        .zig_wgpu = .{
-            .url = "git+https://git.bouvais.lu/adrien/zig-wgpu?ref=0.2.2#5f8da0940d77c40eacd39c268d09acbeaea0b2a5",
-            .hash = "zig_wgpu-0.2.0-xsLAy2-s0QPNwR2QNd8ZX2kWiVfV5oB92N3ga1V1Uwpu",
-        },
-    },
+    .dependencies = .{},
    .paths = .{
        "build.zig",
        "build.zig.zon",
--- a/src/Base.zig
+++ b/src/Base.zig
@ -3,7 +3,7 @@ const std = @import("std");
 // Adjust these imports to match your actual file names
 const Dimensions = @import("Dimensions.zig");
 const Scales = @import("Scales.zig");
-const Tensor = @import("TensorStatic.zig").Tensor;
+const Tensor = @import("Tensor.zig").TensorStatic;

 fn PhysicalConstant(comptime d: Dimensions.ArgOpts, comptime val: f64, comptime s: Scales.ArgOpts) type {
    return struct {
--- a/src/TensorStatic.zig
+++ b/src/TensorStatic.zig
--- a/src/TensorAlloc.zig
+++ b/src/TensorAlloc.zig
--- a/src/TensorGpu.zig
+++ b/src/TensorGpu.zig
--- a/src/benchmark.zig
+++ b/src/benchmark.zig
@ -1,6 +1,6 @@
 const std = @import("std");
 const Io = std.Io;
-const Tensor = @import("Tensor.zig").Tensor;
+const Tensor = @import("Tensor.zig").TensorStatic;

 var io: Io = undefined;
 pub fn main(init: std.process.Init) !void {
@ -10,8 +10,23 @@ pub fn main(init: std.process.Init) !void {

    io = init.io;

+    try vectorSIMDvsNative(f64, &stdout_writer.interface);
+    try stdout_writer.flush();
+    try vectorSIMDvsNative(f32, &stdout_writer.interface);
+    try stdout_writer.flush();
+    try vectorSIMDvsNative(i32, &stdout_writer.interface);
+    try stdout_writer.flush();
+    try vectorSIMDvsNative(i64, &stdout_writer.interface);
+    try stdout_writer.flush();
+    try vectorSIMDvsNative(i128, &stdout_writer.interface);
+    try stdout_writer.flush();
+
    try bench_Scalar(&stdout_writer.interface);
    try stdout_writer.flush();
+    try bench_vsNative(&stdout_writer.interface);
+    try stdout_writer.flush();
+    try bench_crossTypeVsNative(&stdout_writer.interface);
+    try stdout_writer.flush();
    try bench_Vector(&stdout_writer.interface);
    try stdout_writer.flush();
    try bench_HighDimTensor(&stdout_writer.interface);
@ -154,6 +169,245 @@ fn bench_Scalar(writer: *std.Io.Writer) !void {
    try writer.print("└──────────────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘\n", .{});
 }

+/// Measures the per-operation cost of single-element arithmetic and comparison
+/// done three ways — plain scalars, `@Vector(1, T)`, and a one-element
+/// `Tensor` — and prints one table row per (operation, element type) with the
+/// Tensor slowdown relative to both baselines.
+///
+/// On every iteration the operands are laundered through
+/// `std.mem.doNotOptimizeAway` and the result is handed to it as well. Without
+/// that the operands are loop-invariant and the results unused, so an optimized
+/// build is free to delete the timed loops and report meaningless numbers.
+///
+/// Writes the table to `writer` without flushing it; returns any error
+/// reported by `writer.print`.
+fn bench_vsNative(writer: *std.Io.Writer) !void {
+    const ITERS: usize = 100_000;
+    const SAMPLES: usize = 100;
+
+    // Maps an index onto a value in 1..100 of the requested element type.
+    const getValT = struct {
+        fn f(comptime TT: type, i: usize) TT {
+            const v = (i % 100) + 1;
+            return if (comptime @typeInfo(TT) == .float) @floatFromInt(v) else @intCast(v);
+        }
+    }.f;
+
+    const Types = .{ i32, i64, i128, f32, f64 };
+    const TNames = .{ "i32", "i64", "i128", "f32", "f64" };
+    // Expanded Ops to match bench_Scalar
+    const Ops = .{ "add", "sub", "mul", "div", "abs", "eq", "gt" };
+
+    try writer.print(
+        \\
+        \\ Scalar vs Native Overhead Analysis
+        \\
+        \\┌───────────┬──────┬───────────┬───────────┬───────────┬───────────────────────┐
+        \\│ Operation │ Type │ Native    │ @Vector   │ Tensor{{1}} │ Slowdown  Nat | Vec   │
+        \\├───────────┼──────┼───────────┼───────────┼───────────┼───────────────────────┤
+        \\
+    , .{});
+
+    inline for (Ops, 0..) |op_name, j| {
+        inline for (Types, 0..) |T, tidx| {
+            var native_total_ns: f64 = 0;
+            var vector_total_ns: f64 = 0;
+            var tensor_total_ns: f64 = 0;
+
+            const M = Tensor(T, .{}, .{}, &.{1});
+
+            for (0..SAMPLES) |_| {
+                // --- 1. Native scalars ---
+                // Operands are built before the timer starts so only the loop is measured.
+                var a = getValT(T, 10);
+                var b = getValT(T, 2);
+                const n_start = getTime();
+                for (0..ITERS) |_| {
+                    // Make the operands opaque so the operation cannot be folded or hoisted.
+                    std.mem.doNotOptimizeAway(&a);
+                    std.mem.doNotOptimizeAway(&b);
+                    std.mem.doNotOptimizeAway(if (comptime std.mem.eql(u8, op_name, "add"))
+                        if (comptime @typeInfo(T) == .int) a +| b else a + b
+                    else if (comptime std.mem.eql(u8, op_name, "sub"))
+                        if (comptime @typeInfo(T) == .int) a -| b else a - b
+                    else if (comptime std.mem.eql(u8, op_name, "mul"))
+                        if (comptime @typeInfo(T) == .int) a *| b else a * b
+                    else if (comptime std.mem.eql(u8, op_name, "div"))
+                        if (comptime @typeInfo(T) == .int) @divTrunc(a, b) else a / b
+                    else if (comptime std.mem.eql(u8, op_name, "abs"))
+                        if (comptime @typeInfo(T) == .int) @abs(a) else @as(T, @abs(a))
+                    else if (comptime std.mem.eql(u8, op_name, "eq"))
+                        a == b
+                    else if (comptime std.mem.eql(u8, op_name, "gt"))
+                        a > b
+                    else
+                        unreachable);
+                }
+                const n_end = getTime();
+                native_total_ns += @as(f64, @floatFromInt(n_start.durationTo(n_end).toNanoseconds()));
+
+                // --- 2. One-lane @Vector ---
+                var va = @Vector(1, T){getValT(T, 10)};
+                var vb = @Vector(1, T){getValT(T, 2)};
+                const v_start = getTime();
+                for (0..ITERS) |_| {
+                    std.mem.doNotOptimizeAway(&va);
+                    std.mem.doNotOptimizeAway(&vb);
+                    std.mem.doNotOptimizeAway(if (comptime std.mem.eql(u8, op_name, "add"))
+                        if (comptime @typeInfo(T) == .int) va +| vb else va + vb
+                    else if (comptime std.mem.eql(u8, op_name, "sub"))
+                        if (comptime @typeInfo(T) == .int) va -| vb else va - vb
+                    else if (comptime std.mem.eql(u8, op_name, "mul"))
+                        if (comptime @typeInfo(T) == .int) va *| vb else va * vb
+                    else if (comptime std.mem.eql(u8, op_name, "div"))
+                        if (comptime @typeInfo(T) == .int) @divTrunc(va, vb) else va / vb
+                    else if (comptime std.mem.eql(u8, op_name, "abs"))
+                        if (comptime @typeInfo(T) == .int) @as(T, @intCast(@abs(va[0]))) else @abs(va)
+                    else if (comptime std.mem.eql(u8, op_name, "eq"))
+                        va == vb
+                    else if (comptime std.mem.eql(u8, op_name, "gt"))
+                        va > vb
+                    else
+                        unreachable);
+                }
+                const v_end = getTime();
+                vector_total_ns += @as(f64, @floatFromInt(v_start.durationTo(v_end).toNanoseconds()));
+
+                // --- 3. One-element Tensor ---
+                var qa = M.splat(getValT(T, 10));
+                var qb = M.splat(getValT(T, 2));
+                const q_start = getTime();
+                for (0..ITERS) |_| {
+                    std.mem.doNotOptimizeAway(&qa);
+                    std.mem.doNotOptimizeAway(&qb);
+                    std.mem.doNotOptimizeAway(if (comptime std.mem.eql(u8, op_name, "add"))
+                        qa.add(qb)
+                    else if (comptime std.mem.eql(u8, op_name, "sub"))
+                        qa.sub(qb)
+                    else if (comptime std.mem.eql(u8, op_name, "mul"))
+                        qa.mul(qb)
+                    else if (comptime std.mem.eql(u8, op_name, "div"))
+                        qa.div(qb)
+                    else if (comptime std.mem.eql(u8, op_name, "abs"))
+                        qa.abs()
+                    else if (comptime std.mem.eql(u8, op_name, "eq"))
+                        qa.eq(qb)
+                    else if (comptime std.mem.eql(u8, op_name, "gt"))
+                        qa.gt(qb)
+                    else
+                        unreachable);
+                }
+                const q_end = getTime();
+                tensor_total_ns += @as(f64, @floatFromInt(q_start.durationTo(q_end).toNanoseconds()));
+            }
+
+            // Average nanoseconds per single operation for each variant.
+            const avg_n = (native_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
+            const avg_v = (vector_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
+            const avg_t = (tensor_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
+            const slowdown_nt = avg_t / avg_n;
+            const slowdown_vt = avg_t / avg_v;
+
+            try writer.print("│ {s:<9} │ {s:<4} │ {d:>7.2}ns │ {d:>7.2}ns │ {d:>7.2}ns │ {d:>8.2}x   {d:>8.2}x │\n", .{
+                op_name, TNames[tidx], avg_n, avg_v, avg_t, slowdown_nt, slowdown_vt,
+            });
+        }
+        // Separator between operation groups, omitted after the last one.
+        if (j != Ops.len - 1) try writer.print("├───────────┼──────┼───────────┼───────────┼───────────┼───────────────────────┤\n", .{});
+    }
+
+    try writer.print("└───────────┴──────┴───────────┴───────────┴───────────┴───────────────────────┘\n", .{});
+}
+
+/// Benchmarks `add`, `mul` and `div` between operands of two different element
+/// types, comparing a hand-written "cast T2 to T1, then operate" baseline with
+/// the same operation on one-element `Tensor` values, and prints one table row
+/// per (operation, T1, T2) combination.
+///
+/// For `div` the Tensor right-hand side is built with dimension `.T = 1`
+/// instead of `.L = 1`; `add` and `mul` use `.L = 1` on both sides.
+///
+/// Each result is passed to `std.mem.doNotOptimizeAway` and the constant
+/// right-hand operand is laundered through it, so neither the conversion nor
+/// the operation can be removed from the timed loops.
+///
+/// Writes the table to `writer` without flushing it; returns any error
+/// reported by `writer.print`.
+fn bench_crossTypeVsNative(writer: *std.Io.Writer) !void {
+    const ITERS: usize = 100_000;
+    const SAMPLES: usize = 5;
+
+    const getValT = struct {
+        fn f(comptime TT: type, i: usize) TT {
+            // Keep values safe and non-zero to avoid division by zero or overflows during cross-casting
+            const v = (i % 50) + 1;
+            return if (comptime @typeInfo(TT) == .float) @floatFromInt(v) else @intCast(v);
+        }
+    }.f;
+
+    // Helper for the Native baseline: explicitly casting T2 to T1 before the operation
+    const castTo = struct {
+        fn f(comptime DestT: type, comptime SrcT: type, val: SrcT) DestT {
+            if (comptime DestT == SrcT) return val;
+            const src_info = @typeInfo(SrcT);
+            const dest_info = @typeInfo(DestT);
+
+            if (dest_info == .int and src_info == .int) return @intCast(val);
+            if (dest_info == .float and src_info == .int) return @floatFromInt(val);
+            if (dest_info == .int and src_info == .float) return @intFromFloat(val);
+            if (dest_info == .float and src_info == .float) return @floatCast(val);
+            unreachable;
+        }
+    }.f;
+
+    const Types = .{ i16, i64, i128, f32, f64 };
+    const TNames = .{ "i16", "i64", "i128", "f32", "f64" };
+    const Ops = .{ "add", "mul", "div" };
+
+    try writer.print(
+        \\
+        \\ Cross-Type Overhead Analysis: Scalar vs Native
+        \\
+        \\┌─────────┬──────┬──────┬───────────┬───────────┬───────────┐
+        \\│ Op      │ T1   │ T2   │ Native    │ Scalar    │ Slowdown  │
+        \\├─────────┼──────┼──────┼───────────┼───────────┼───────────┤
+        \\
+    , .{});
+
+    inline for (Ops, 0..) |op_name, j| {
+        inline for (Types, 0..) |T1, t1_idx| {
+            inline for (Types, 0..) |T2, t2_idx| {
+                var native_total_ns: f64 = 0;
+                var quantity_total_ns: f64 = 0;
+
+                const M1 = Tensor(T1, .{ .L = 1 }, .{}, &.{1});
+                const M2 = Tensor(T2, .{ .L = 1 }, .{}, &.{1});
+                const S2 = Tensor(T2, .{ .T = 1 }, .{}, &.{1});
+
+                for (0..SAMPLES) |_| {
+                    // --- 1. Benchmark Native (Cast T2 to T1, then math) ---
+                    const n_start = getTime();
+                    for (0..ITERS) |i| {
+                        const a = getValT(T1, i);
+                        // Launder the constant operand so the cast is redone on every iteration.
+                        var b_raw = getValT(T2, 2);
+                        std.mem.doNotOptimizeAway(&b_raw);
+                        const b = castTo(T1, T2, b_raw);
+
+                        std.mem.doNotOptimizeAway(if (comptime std.mem.eql(u8, op_name, "add"))
+                            a + b
+                        else if (comptime std.mem.eql(u8, op_name, "mul"))
+                            a * b
+                        else if (comptime @typeInfo(T1) == .int)
+                            @divTrunc(a, b)
+                        else
+                            a / b);
+                    }
+                    const n_end = getTime();
+                    native_total_ns += @as(f64, @floatFromInt(n_start.durationTo(n_end).toNanoseconds()));
+
+                    // --- 2. Benchmark Tensor ---
+                    const q_start = getTime();
+                    for (0..ITERS) |i| {
+                        const qa = M1.splat(getValT(T1, i));
+                        // Same laundering as the native loop, to keep the comparison fair.
+                        var rhs_raw = getValT(T2, 2);
+                        std.mem.doNotOptimizeAway(&rhs_raw);
+                        const qb = if (comptime std.mem.eql(u8, op_name, "div"))
+                            S2.splat(rhs_raw)
+                        else
+                            M2.splat(rhs_raw);
+
+                        std.mem.doNotOptimizeAway(if (comptime std.mem.eql(u8, op_name, "add"))
+                            qa.add(qb)
+                        else if (comptime std.mem.eql(u8, op_name, "mul"))
+                            qa.mul(qb)
+                        else
+                            qa.div(qb));
+                    }
+                    const q_end = getTime();
+                    quantity_total_ns += @as(f64, @floatFromInt(q_start.durationTo(q_end).toNanoseconds()));
+                }
+
+                // Average nanoseconds per single operation for each variant.
+                const avg_n = (native_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
+                const avg_q = (quantity_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
+                const slowdown = avg_q / avg_n;
+
+                try writer.print("│ {s:<7} │ {s:<4} │ {s:<4} │ {d:>7.2}ns │ {d:>7.2}ns │ {d:>8.2}x │\n", .{
+                    op_name, TNames[t1_idx], TNames[t2_idx], avg_n, avg_q, slowdown,
+                });
+            }
+        }
+        // Separator between operation groups, omitted after the last one.
+        if (j != Ops.len - 1) {
+            try writer.print("├─────────┼──────┼──────┼───────────┼───────────┼───────────┤\n", .{});
+        }
+    }
+
+    try writer.print("└─────────┴──────┴──────┴───────────┴───────────┴───────────┘\n", .{});
+}
+
 fn bench_Vector(writer: *std.Io.Writer) !void {
    const ITERS: usize = 10_000;
    const SAMPLES: usize = 10;
@ -192,7 +446,7 @@ fn bench_Vector(writer: *std.Io.Writer) !void {
    const TNames = .{ "i32", "i64", "i128", "f32", "f64" };
    const Lengths = .{ 1, 3, 4, 16, 100 };
    // "cross" is only valid for len=3; other cells will show "  ---  "
-    const Ops = .{ "add", "div", "mulScalar", "dot", "product", "pow", "length" };
+    const Ops = .{ "add", "div", "mulScalar", "dot", "cross", "product", "pow", "length" };

    inline for (Ops, 0..) |op_name, o_idx| {
        inline for (Types, TNames) |T, tname| {
@ -228,6 +482,10 @@ fn bench_Vector(writer: *std.Io.Writer) !void {
                            } else if (comptime std.mem.eql(u8, op_name, "dot")) {
                                const v2 = V.splat(getVal(T, i +% 5, 63));
                                _ = v1.contract(v2, 0, 0);
+                            } else if (comptime std.mem.eql(u8, op_name, "cross")) {
+                                // len == 3 guaranteed by the guard above
+                                const v2 = V.splat(getVal(T, i +% 5, 63));
+                                _ = v1.cross(v2);
                            } else if (comptime std.mem.eql(u8, op_name, "product")) {
                                _ = v1.product();
                            } else if (comptime std.mem.eql(u8, op_name, "pow")) {
@ -350,3 +608,62 @@ fn bench_HighDimTensor(writer: *std.Io.Writer) !void {
    }
    try writer.print("└─────────────────┴──────┴──────────────┴──────────────┴──────────────┴──────────────┘\n", .{});
 }
+
+/// Compares `iterations * vector_len` scalar increments against `iterations`
+/// increments of a `@Vector(vector_len, T)` for a fixed list of vector lengths,
+/// and prints one table row per length with both timings and their ratio.
+///
+/// Integer element types use wrapping addition (`+%`); other types use `+`.
+/// The running value is passed to `std.mem.doNotOptimizeAway` on every
+/// iteration so that neither loop can be collapsed into a single addition.
+/// Durations are taken in nanoseconds and shown as fractional microseconds,
+/// because the shortest runs are too brief for whole-microsecond resolution.
+///
+/// Writes to `writer` and flushes it after each row; returns any error
+/// reported by `writer.print` or `writer.flush`.
+fn vectorSIMDvsNative(comptime T: type, writer: *std.Io.Writer) !void {
+    const iterations: u64 = 10_000;
+    const lens = [_]u32{ 1, 2, 3, 4, 5, 10, 100, 1_000, 10_000 };
+
+    try writer.print("\nSIMD Speedup Analysis: {s}\n", .{@typeName(T)});
+    try writer.print("┌────────────┬────────────┬────────────┬────────────┐\n", .{});
+    try writer.print("│ Vector Len │ Scalar (us)│ Vector (us)│ Speedup    │\n", .{});
+    try writer.print("├────────────┼────────────┼────────────┼────────────┤\n", .{});
+
+    inline for (lens) |vector_len| {
+        // --- Scalar Test ---
+        // One scalar add per lane per vector iteration, so both loops do the same amount of arithmetic.
+        var scalar_val: T = 10;
+        const start_scalar = getTime();
+
+        var i: u64 = 0;
+        while (i < iterations * vector_len) : (i += 1) {
+            if (comptime @typeInfo(T) == .int)
+                scalar_val = scalar_val +% 1
+            else
+                scalar_val = scalar_val + 1;
+            std.mem.doNotOptimizeAway(scalar_val);
+        }
+        const scalar_ns = start_scalar.durationTo(getTime()).toNanoseconds();
+
+        // --- Vector Test ---
+        var vector_val: @Vector(vector_len, T) = @splat(20);
+        const increment: @Vector(vector_len, T) = @splat(1);
+        const start_vector = getTime();
+
+        i = 0;
+        while (i < iterations) : (i += 1) {
+            if (comptime @typeInfo(T) == .int)
+                vector_val = vector_val +% increment
+            else
+                vector_val = vector_val + increment;
+            std.mem.doNotOptimizeAway(vector_val);
+        }
+        const vector_ns = start_vector.durationTo(getTime()).toNanoseconds();
+
+        // --- Results ---
+        const scalar_us = @as(f64, @floatFromInt(scalar_ns)) / 1_000.0;
+        const vector_us = @as(f64, @floatFromInt(vector_ns)) / 1_000.0;
+
+        // Speedup = ScalarTime / VectorTime.
+        // > 1.0 means SIMD is faster. A zero vector time is reported as 0 rather than dividing by it.
+        const speedup = if (vector_us > 0) scalar_us / vector_us else 0;
+
+        try writer.print("│ {d:<10} │ {d:>10.2} │ {d:>10.2} │ {d:>9.2}x │\n", .{
+            vector_len,
+            scalar_us,
+            vector_us,
+            speedup,
+        });
+        try writer.flush();
+    }
+    try writer.print("└────────────┴────────────┴────────────┴────────────┘\n", .{});
+}
--- a/src/lib.zig
+++ b/src/lib.zig
@ -1,9 +0,0 @@
-const std = @import("std");
-
-pub const TensorStatic = @import("TensorStatic.zig").Tensor;
-pub const TensorAlloc = @import("TensorAlloc.zig").Tensor;
-pub const TensorGpu = @import("TensorGpu.zig").Tensor;
-pub const Dimensions = @import("Dimensions.zig");
-pub const Scales = @import("Scales.zig");
-pub const Base = @import("Base.zig");
-pub const UnitParser = @import("UnitParser.zig");
--- a/src/main.zig
+++ b/src/main.zig
@ -0,0 +1,15 @@
+//! Root source file of the `dimal` module: re-exports the public API and
+//! references the implementation files from a `test` block.
+const std = @import("std");
+
+// Each implementation file is imported once here and reused below.
+const tensor_file = @import("Tensor.zig");
+const dimensions_file = @import("Dimensions.zig");
+const scales_file = @import("Scales.zig");
+const base_file = @import("Base.zig");
+const unit_parser_file = @import("UnitParser.zig");
+
+/// Public tensor type, taken from `TensorStatic` in `Tensor.zig`.
+pub const Tensor = tensor_file.TensorStatic;
+pub const Dimensions = dimensions_file;
+pub const Scales = scales_file;
+pub const Base = base_file;
+pub const UnitParser = unit_parser_file;
+
+// Reference every implementation file so that tests declared in them are
+// picked up when this file is used as the test root.
+test {
+    _ = tensor_file;
+    _ = dimensions_file;
+    _ = scales_file;
+    _ = base_file;
+    _ = unit_parser_file;
+}
--- a/src/shared.zig
+++ b/src/shared.zig
@ -4,12 +4,6 @@ const UnitScale = Scales.UnitScale;
 const Dimensions = @import("Dimensions.zig");
 const Dimension = Dimensions.Dimension;

-pub const TensorKind = enum { static, alloc, gpu };
-
-pub fn isTensor(comptime T: type) bool {
-    return comptime @typeInfo(T) == .@"struct" and @hasDecl(T, "ISTENSOR");
-}
-
 pub fn shapeTotal(shape: []const comptime_int) usize {
    var t: comptime_int = 1;
    for (shape) |s| t *= s;
--- a/src/test.zig
+++ b/src/test.zig
@ -1,9 +0,0 @@
-test {
-    _ = @import("TensorStatic.zig");
-    _ = @import("TensorAlloc.zig");
-    _ = @import("TensorGpu.zig");
-    _ = @import("Dimensions.zig");
-    _ = @import("Scales.zig");
-    _ = @import("Base.zig");
-    _ = @import("UnitParser.zig");
-}