From 8ff91d35ecbbb9f8b25cda4b49afb5211190f4f6 Mon Sep 17 00:00:00 2001 From: adrien Date: Fri, 15 May 2026 09:44:53 +0200 Subject: [PATCH] Basic GpuAlloc + Mat --- src/gpu.zig | 399 ++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 258 ++++-------------------------- src/reference.zig | 239 +++++++++++++++++++++++++++ src/wgpu.zig | 137 ---------------- 4 files changed, 672 insertions(+), 361 deletions(-) create mode 100644 src/gpu.zig create mode 100644 src/reference.zig delete mode 100644 src/wgpu.zig diff --git a/src/gpu.zig b/src/gpu.zig new file mode 100644 index 0000000..614d140 --- /dev/null +++ b/src/gpu.zig @@ -0,0 +1,399 @@ +/// gpu.zig — WebGPU compute module +/// +/// Usage: +/// var gloc = try GpuAllocator.init(); +/// defer gloc.deinit(); +/// +/// const a = try Mat.load(&gloc, &data_a, 4, 4); +/// defer a.deinit(); +/// const b = try Mat.load(&gloc, &data_b, 4, 4); +/// defer b.deinit(); +/// +/// const c = try a.add(&gloc, b); +/// defer c.deinit(); +/// +/// var out: [16]f32 = undefined; +/// try c.read(&gloc, &out); +const std = @import("std"); +const c = @cImport(@cInclude("wgpu.h")); + +// ── Shaders ─────────────────────────────────────────────────────────────────── + +const SHADER_ADD = + \\@group(0) @binding(0) var<storage, read> a : array<f32>; + \\@group(0) @binding(1) var<storage, read> b : array<f32>; + \\@group(0) @binding(2) var<storage, read_write> out : array<f32>; + \\ + \\@compute @workgroup_size(64) + \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) { + \\ let i = gid.x; + \\ if (i < arrayLength(&out)) { + \\ out[i] = a[i] + b[i]; + \\ } + \\} +; + +const SHADER_SCALE = + \\struct Uniforms { scalar : f32 } + \\@group(0) @binding(0) var<storage, read> a : array<f32>; + \\@group(0) @binding(1) var<storage, read_write> out : array<f32>; + \\@group(0) @binding(2) var<uniform> u : Uniforms; + \\ + \\@compute @workgroup_size(64) + \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) { + \\ let i = gid.x; + \\ if (i < arrayLength(&out)) { + \\ out[i] = a[i] * u.scalar; + \\ } + \\} +; + +// ── Helpers 
─────────────────────────────────────────────────────────────────── + +fn sv(s: []const u8) c.WGPUStringView { + return .{ .data = s.ptr, .length = s.len }; +} + +fn ceilDiv(n: u32, d: u32) u32 { + return (n + d - 1) / d; +} + +// ── Callbacks ───────────────────────────────────────────────────────────────── + +const Ctx = struct { + adapter: c.WGPUAdapter = null, + device: c.WGPUDevice = null, +}; + +fn onAdapter( + status: c.WGPURequestAdapterStatus, + adapter: c.WGPUAdapter, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestAdapterStatus_Success) { + std.log.err("Adapter request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.adapter = adapter; +} + +fn onDevice( + status: c.WGPURequestDeviceStatus, + device: c.WGPUDevice, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestDeviceStatus_Success) { + std.log.err("Device request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.device = device; + std.debug.print("{?}", .{device}); +} + +fn onMapped( + status: c.WGPUMapAsyncStatus, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + const flag: *bool = @ptrCast(@alignCast(userdata1.?)); + flag.* = (status == c.WGPUMapAsyncStatus_Success); +} + +// ── Pipeline factory ────────────────────────────────────────────────────────── + +fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline { + var wgsl_src = c.WGPUShaderSourceWGSL{ + .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, + .code = sv(wgsl), + }; + const shader = c.wgpuDeviceCreateShaderModule(device, &.{ + .nextInChain = @ptrCast(&wgsl_src), + }) orelse return error.Shader; + defer c.wgpuShaderModuleRelease(shader); + + return c.wgpuDeviceCreateComputePipeline(device, &.{ + .compute = .{ 
.module = shader, .entryPoint = sv("main") }, + }) orelse error.Pipeline; +} + +// ── GpuAllocator ────────────────────────────────────────────────────────────── + +/// Owns the WebGPU context. One per application. +/// All Mat instances must not outlive their GpuAllocator. +pub const GpuAllocator = struct { + instance: c.WGPUInstance, + adapter: c.WGPUAdapter, + device: c.WGPUDevice, + queue: c.WGPUQueue, + + // Lazily created, cached for lifetime of allocator + _pip_add: c.WGPUComputePipeline = null, + _pip_scale: c.WGPUComputePipeline = null, + + pub fn init() !GpuAllocator { + const instance = c.wgpuCreateInstance( + &std.mem.zeroes(c.WGPUInstanceDescriptor), + ) orelse return error.NoInstance; + errdefer c.wgpuInstanceRelease(instance); + + var ctx = Ctx{}; + _ = c.wgpuInstanceRequestAdapter( + instance, + &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, + .{ .callback = onAdapter, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); + const adapter = ctx.adapter orelse return error.NoAdapter; + errdefer c.wgpuAdapterRelease(adapter); + + _ = c.wgpuAdapterRequestDevice( + adapter, + null, + .{ .callback = onDevice, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); + const device = ctx.device orelse return error.NoDevice; + + return .{ + .instance = instance, + .adapter = adapter, + .device = device, + .queue = c.wgpuDeviceGetQueue(device), + }; + } + + pub fn deinit(self: *GpuAllocator) void { + if (self._pip_add) |p| c.wgpuComputePipelineRelease(p); + if (self._pip_scale) |p| c.wgpuComputePipelineRelease(p); + c.wgpuQueueRelease(self.queue); + c.wgpuDeviceRelease(self.device); + c.wgpuAdapterRelease(self.adapter); + c.wgpuInstanceRelease(self.instance); + } + + // ── Internal ───────────────────────────────────────────────────────────── + + pub fn makeBuffer( + self: *GpuAllocator, + bytes: u64, + usage: c.WGPUBufferUsage, + ) !c.WGPUBuffer { + return c.wgpuDeviceCreateBuffer(self.device, &.{ + .usage = usage, + 
.size = bytes, + }) orelse error.BufferAlloc; + } + + fn pipAdd(self: *GpuAllocator) !c.WGPUComputePipeline { + if (self._pip_add == null) + self._pip_add = try buildPipeline(self.device, SHADER_ADD); + return self._pip_add.?; + } + + fn pipScale(self: *GpuAllocator) !c.WGPUComputePipeline { + if (self._pip_scale == null) + self._pip_scale = try buildPipeline(self.device, SHADER_SCALE); + return self._pip_scale.?; + } + + /// Poll until GPU work completes. Use after submit if you need CPU sync. + pub fn poll(self: *GpuAllocator) void { + _ = c.wgpuDevicePoll(self.device, 1, null); + } +}; + +/// GPU-resident f32 matrix. +/// Buffer usage: Storage | CopyDst | CopySrc (usable as input and output). +pub const Mat = struct { + buf: c.WGPUBuffer, + rows: u32, + cols: u32, + + // ── Lifecycle ───────────────────────────────────────────────────────────── + + /// Allocate GPU buffer and upload `data`. `data.len` must equal rows*cols. + pub fn load( + gloc: *GpuAllocator, + data: []const f32, + rows: u32, + cols: u32, + ) !Mat { + std.debug.assert(data.len == @as(usize, rows) * cols); + const bytes = data.len * @sizeOf(f32); + const buf = try gloc.makeBuffer( + bytes, + c.WGPUBufferUsage_Storage | + c.WGPUBufferUsage_CopyDst | + c.WGPUBufferUsage_CopySrc, + ); + c.wgpuQueueWriteBuffer(gloc.queue, buf, 0, data.ptr, bytes); + return .{ .buf = buf, .rows = rows, .cols = cols }; + } + + /// Allocate zeroed GPU buffer (no upload). 
+ pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat { + const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32); + const buf = try gloc.makeBuffer( + bytes, + c.WGPUBufferUsage_Storage | + c.WGPUBufferUsage_CopyDst | + c.WGPUBufferUsage_CopySrc, + ); + return .{ .buf = buf, .rows = rows, .cols = cols }; + } + + pub fn deinit(self: Mat) void { + c.wgpuBufferRelease(self.buf); + } + + pub fn len(self: Mat) u32 { + return self.rows * self.cols; + } + + pub fn byteSize(self: Mat) u64 { + return @as(u64, self.len()) * @sizeOf(f32); + } + + /// Element-wise add. Shapes must match. Returns new Mat (caller owns). + pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat { + std.debug.assert(self.rows == other.rows and self.cols == other.cols); + + const result = try Mat.zeros(gloc, self.rows, self.cols); + errdefer result.deinit(); + + const pipeline = try gloc.pipAdd(); + try dispatch2in1out(gloc, pipeline, self.buf, other.buf, result.buf, self.byteSize(), self.len()); + + return result; + } + + /// Element-wise multiply by scalar. Returns new Mat (caller owns). 
+ pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat { + const result = try Mat.zeros(gloc, self.rows, self.cols); + errdefer result.deinit(); + + const bytes = self.byteSize(); + const n = self.len(); + + // Upload scalar as uniform buffer + const uni_buf = try gloc.makeBuffer( + @sizeOf(f32), + c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst, + ); + defer c.wgpuBufferRelease(uni_buf); + c.wgpuQueueWriteBuffer(gloc.queue, uni_buf, 0, &scalar, @sizeOf(f32)); + + const pipeline = try gloc.pipScale(); + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const entries = [_]c.WGPUBindGroupEntry{ + .{ .binding = 0, .buffer = self.buf, .offset = 0, .size = bytes }, + .{ .binding = 1, .buffer = result.buf, .offset = 0, .size = bytes }, + .{ .binding = 2, .buffer = uni_buf, .offset = 0, .size = @sizeOf(f32) }, + }; + try submitPass(gloc, pipeline, &entries, n); + + return result; + } + + /// Read GPU buffer back to CPU. `out.len` must be >= rows*cols. 
+ pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void { + std.debug.assert(out.len >= self.len()); + const bytes = self.byteSize(); + + const staging = try gloc.makeBuffer( + bytes, + c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, + ); + defer c.wgpuBufferRelease(staging); + + // Copy result → staging + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse + return error.Encoder; + c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf, 0, staging, 0, bytes); + const cmd = c.wgpuCommandEncoderFinish(enc, null); + defer c.wgpuCommandEncoderRelease(enc); + defer c.wgpuCommandBufferRelease(cmd); + c.wgpuQueueSubmit(gloc.queue, 1, &cmd); + + // Map and copy to slice + var mapped = false; + _ = c.wgpuBufferMapAsync( + staging, + c.WGPUMapMode_Read, + 0, + bytes, + .{ .callback = onMapped, .userdata1 = &mapped }, + ); + while (!mapped) gloc.poll(); + + const ptr: [*]const f32 = @ptrCast(@alignCast( + c.wgpuBufferGetConstMappedRange(staging, 0, bytes), + )); + @memcpy(out[0..self.len()], ptr[0..self.len()]); + c.wgpuBufferUnmap(staging); + } +}; + +// ── Dispatch helpers ────────────────────────────────────────────────────────── + +/// Encode + submit a 2-input, 1-output compute pass (used by add). +fn dispatch2in1out( + gloc: *GpuAllocator, + pipeline: c.WGPUComputePipeline, + buf_a: c.WGPUBuffer, + buf_b: c.WGPUBuffer, + buf_out: c.WGPUBuffer, + bytes: u64, + n: u32, +) !void { + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const entries = [_]c.WGPUBindGroupEntry{ + .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = bytes }, + .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = bytes }, + .{ .binding = 2, .buffer = buf_out, .offset = 0, .size = bytes }, + }; + try submitPass(gloc, pipeline, &entries, n); +} + +/// Create bind group, encode pass, submit. workgroup_size=64. 
+fn submitPass( + gloc: *GpuAllocator, + pipeline: c.WGPUComputePipeline, + entries: []const c.WGPUBindGroupEntry, + n: u32, +) !void { + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{ + .layout = bgl, + .entries = entries.ptr, + .entryCount = entries.len, + }) orelse return error.BindGroup; + defer c.wgpuBindGroupRelease(bg); + + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse + return error.Encoder; + const pass = c.wgpuCommandEncoderBeginComputePass(enc, null); + c.wgpuComputePassEncoderSetPipeline(pass, pipeline); + c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null); + c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1); + c.wgpuComputePassEncoderEnd(pass); + c.wgpuComputePassEncoderRelease(pass); + + const cmd = c.wgpuCommandEncoderFinish(enc, null); + defer c.wgpuCommandEncoderRelease(enc); + defer c.wgpuCommandBufferRelease(cmd); + c.wgpuQueueSubmit(gloc.queue, 1, &cmd); +} diff --git a/src/main.zig b/src/main.zig index 6c80dfa..92d9de8 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,239 +1,49 @@ -// Minimal WebGPU compute in Zig: element-wise matrix addition -// Uses wgpu-native C bindings. 
-// Build: see ../build.zig -// -// Data flow: -// CPU (mat_a, mat_b) → GPU storage buffers → compute shader → GPU buf_c -// → staging buffer (mapped) → CPU read → print - const std = @import("std"); -const c = @cImport(@cInclude("wgpu.h")); +const gpu = @import("gpu.zig"); -// ── Config ──────────────────────────────────────────────────────────────────── -const ROWS: u32 = 4; -const COLS: u32 = 4; -const N = ROWS * COLS; // 16 elements -const BUF_BYTES = N * @sizeOf(f32); - -// ── WGSL Compute Shader ─────────────────────────────────────────────────────── -// workgroup_size(4,4) matches one full 4×4 matrix → dispatch(1,1,1) -const SHADER = - \\@group(0) @binding(0) var mat_a : array; - \\@group(0) @binding(1) var mat_b : array; - \\@group(0) @binding(2) var mat_c : array; - \\ - \\@compute @workgroup_size(4, 4) - \\fn main(@builtin(global_invocation_id) gid : vec3) { - \\ let idx = gid.y * 4u + gid.x; - \\ if (idx < arrayLength(&mat_c)) { - \\ mat_c[idx] = mat_a[idx] + mat_b[idx]; - \\ } - \\} -; - -// ── Callback state ──────────────────────────────────────────────────────────── -const Ctx = struct { - adapter: c.WGPUAdapter = null, - device: c.WGPUDevice = null, -}; - -fn onAdapter( - status: c.WGPURequestAdapterStatus, - adapter: c.WGPUAdapter, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestAdapterStatus_Success) { - std.log.err("Adapter request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.adapter = adapter; -} - -fn onDevice( - status: c.WGPURequestDeviceStatus, - device: c.WGPUDevice, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestDeviceStatus_Success) { - std.log.err("Device request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.device = device; - std.debug.print("{?}", .{device}); -} - -fn 
onMapped( - status: c.WGPUMapAsyncStatus, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - const flag: *bool = @ptrCast(@alignCast(userdata1.?)); - flag.* = (status == c.WGPUMapAsyncStatus_Success); -} - -fn sv(s: []const u8) c.WGPUStringView { - return .{ .data = s.ptr, .length = s.len }; -} - -// ── Main ────────────────────────────────────────────────────────────────────── pub fn main() !void { + var gloc = try gpu.GpuAllocator.init(); + defer gloc.deinit(); - // 1. Instance ────────────────────────────────────────────────────────────── - const instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse - return error.NoInstance; - defer c.wgpuInstanceRelease(instance); - - // 2. Adapter (async → poll) ──────────────────────────────────────────────── - var ctx = Ctx{}; - _ = c.wgpuInstanceRequestAdapter( - instance, - &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, - .{ .callback = onAdapter, .userdata1 = &ctx }, - ); - c.wgpuInstanceProcessEvents(instance); // drive callbacks - const adapter = ctx.adapter orelse return error.NoAdapter; - defer c.wgpuAdapterRelease(adapter); - - // 3. Device ──────────────────────────────────────────────────────────────── - _ = c.wgpuAdapterRequestDevice(adapter, null, .{ .callback = onDevice, .userdata1 = &ctx }); - c.wgpuInstanceProcessEvents(instance); - const device = ctx.device orelse return error.NoDevice; - defer c.wgpuDeviceRelease(device); - - const queue = c.wgpuDeviceGetQueue(device); - defer c.wgpuQueueRelease(queue); - - // 4. 
Input data ──────────────────────────────────────────────────────────── - // mat_a[i] = i (0 … 15) - // mat_b[i] = 15 − i → every element of mat_c should equal 15 - var mat_a: [N]f32 = undefined; - var mat_b: [N]f32 = undefined; - for (0..N) |i| { - mat_a[i] = @floatFromInt(i); - mat_b[i] = @floatFromInt(N - 1 - i); + // Input data: a[i] = i, b[i] = 15 - i → add should give all 15s + var data_a: [16]f32 = undefined; + var data_b: [16]f32 = undefined; + for (0..16) |i| { + data_a[i] = @floatFromInt(i); + data_b[i] = @floatFromInt(15 - i); } - // 5. GPU Buffers ─────────────────────────────────────────────────────────── - const buf_a = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferA; + const a = try gpu.Mat.load(&gloc, &data_a, 4, 4); + defer a.deinit(); + const b = try gpu.Mat.load(&gloc, &data_b, 4, 4); + defer b.deinit(); - const buf_b = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferB; + // a + b + const sum = try a.add(&gloc, b); + defer sum.deinit(); - // buf_c: GPU-only result; staging: CPU-readable copy - const buf_c = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopySrc, - .size = BUF_BYTES, - }) orelse return error.BufferC; + // sum * 2 + const scaled = try sum.scale(&gloc, 2.0); + defer scaled.deinit(); - const buf_staging = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferStaging; + // Read back + var out_sum: [16]f32 = undefined; + var out_scaled: [16]f32 = undefined; + try sum.read(&gloc, &out_sum); + try scaled.read(&gloc, &out_scaled); - defer c.wgpuBufferRelease(buf_a); - defer c.wgpuBufferRelease(buf_b); - defer c.wgpuBufferRelease(buf_c); - defer c.wgpuBufferRelease(buf_staging); + 
// Print + std.debug.print("\na + b (expect all 15):\n", .{}); + printMat(&out_sum, 4, 4); - // Upload inputs - c.wgpuQueueWriteBuffer(queue, buf_a, 0, &mat_a, BUF_BYTES); - c.wgpuQueueWriteBuffer(queue, buf_b, 0, &mat_b, BUF_BYTES); + std.debug.print("\n(a + b) * 2 (expect all 30):\n", .{}); + printMat(&out_scaled, 4, 4); +} - // 6. Shader module ───────────────────────────────────────────────────────── - // ✅ New API (0.20+) - var wgsl_src = c.WGPUShaderSourceWGSL{ - .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, - .code = sv(SHADER), - }; - const shader = c.wgpuDeviceCreateShaderModule(device, &.{ - .nextInChain = @ptrCast(&wgsl_src), - }) orelse return error.Shader; - - // 7. Compute pipeline (layout auto-inferred from shader) ─────────────────── - // ✅ - const pipeline = c.wgpuDeviceCreateComputePipeline(device, &.{ - .compute = .{ - .module = shader, - .entryPoint = sv("main"), - }, - }) orelse return error.Pipeline; - defer c.wgpuComputePipelineRelease(pipeline); - - // 8. Bind group ──────────────────────────────────────────────────────────── - const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); - defer c.wgpuBindGroupLayoutRelease(bgl); - - const entries = [_]c.WGPUBindGroupEntry{ - .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = BUF_BYTES }, - .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = BUF_BYTES }, - .{ .binding = 2, .buffer = buf_c, .offset = 0, .size = BUF_BYTES }, - }; - const bind_group = c.wgpuDeviceCreateBindGroup(device, &.{ - .layout = bgl, - .entries = &entries, - .entryCount = entries.len, - }) orelse return error.BindGroup; - defer c.wgpuBindGroupRelease(bind_group); - - // 9. 
Encode compute pass + buffer copy ──────────────────────────────────── - const encoder = c.wgpuDeviceCreateCommandEncoder(device, null) orelse - return error.Encoder; - - const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null); - c.wgpuComputePassEncoderSetPipeline(pass, pipeline); - c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null); - // dispatch(1,1,1): one workgroup of size (4,4) covers the whole 4×4 matrix - c.wgpuComputePassEncoderDispatchWorkgroups(pass, 1, 1, 1); - c.wgpuComputePassEncoderEnd(pass); - c.wgpuComputePassEncoderRelease(pass); - - // Copy result buffer → CPU-readable staging buffer - c.wgpuCommandEncoderCopyBufferToBuffer(encoder, buf_c, 0, buf_staging, 0, BUF_BYTES); - - const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null); - defer c.wgpuCommandEncoderRelease(encoder); - defer c.wgpuCommandBufferRelease(cmdbuf); - - // 10. Submit ─────────────────────────────────────────────────────────────── - c.wgpuQueueSubmit(queue, 1, &cmdbuf); - - // 11. Map staging buffer back to CPU ────────────────────────────────────── - var mapped = false; - _ = c.wgpuBufferMapAsync( - buf_staging, - c.WGPUMapMode_Read, - 0, - BUF_BYTES, - .{ .callback = onMapped, .userdata1 = &mapped }, - ); - // Poll the device until the async map completes - while (!mapped) _ = c.wgpuDevicePoll(device, 1, null); - - const ptr: [*]const f32 = @ptrCast(@alignCast( - c.wgpuBufferGetConstMappedRange(buf_staging, 0, BUF_BYTES), - )); - const result = ptr[0..N]; - - // 12. 
Print ──────────────────────────────────────────────── - std.debug.print("\nmat_a + mat_b ({d}×{d}):\n", .{ ROWS, COLS }); - for (0..ROWS) |r| { - for (0..COLS) |col| - std.debug.print("{d:6.0}", .{result[r * COLS + col]}); +fn printMat(data: []const f32, rows: u32, cols: u32) void { + for (0..rows) |r| { + for (0..cols) |col| + std.debug.print("{d:6.0}", .{data[r * cols + col]}); std.debug.print("\n", .{}); } - // Expected output: every cell = 15.0 - - c.wgpuBufferUnmap(buf_staging); } diff --git a/src/reference.zig b/src/reference.zig new file mode 100644 index 0000000..6c80dfa --- /dev/null +++ b/src/reference.zig @@ -0,0 +1,239 @@ +// Minimal WebGPU compute in Zig: element-wise matrix addition +// Uses wgpu-native C bindings. +// Build: see ../build.zig +// +// Data flow: +// CPU (mat_a, mat_b) → GPU storage buffers → compute shader → GPU buf_c +// → staging buffer (mapped) → CPU read → print + +const std = @import("std"); +const c = @cImport(@cInclude("wgpu.h")); + +// ── Config ──────────────────────────────────────────────────────────────────── +const ROWS: u32 = 4; +const COLS: u32 = 4; +const N = ROWS * COLS; // 16 elements +const BUF_BYTES = N * @sizeOf(f32); + +// ── WGSL Compute Shader ─────────────────────────────────────────────────────── +// workgroup_size(4,4) matches one full 4×4 matrix → dispatch(1,1,1) +const SHADER = + \\@group(0) @binding(0) var<storage, read> mat_a : array<f32>; + \\@group(0) @binding(1) var<storage, read> mat_b : array<f32>; + \\@group(0) @binding(2) var<storage, read_write> mat_c : array<f32>; + \\ + \\@compute @workgroup_size(4, 4) + \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) { + \\ let idx = gid.y * 4u + gid.x; + \\ if (idx < arrayLength(&mat_c)) { + \\ mat_c[idx] = mat_a[idx] + mat_b[idx]; + \\ } + \\} +; + +// ── Callback state ──────────────────────────────────────────────────────────── +const Ctx = struct { + adapter: c.WGPUAdapter = null, + device: c.WGPUDevice = null, +}; + +fn onAdapter( + status: c.WGPURequestAdapterStatus, + adapter: c.WGPUAdapter, + _: 
c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestAdapterStatus_Success) { + std.log.err("Adapter request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.adapter = adapter; +} + +fn onDevice( + status: c.WGPURequestDeviceStatus, + device: c.WGPUDevice, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestDeviceStatus_Success) { + std.log.err("Device request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.device = device; + std.debug.print("{?}", .{device}); +} + +fn onMapped( + status: c.WGPUMapAsyncStatus, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + const flag: *bool = @ptrCast(@alignCast(userdata1.?)); + flag.* = (status == c.WGPUMapAsyncStatus_Success); +} + +fn sv(s: []const u8) c.WGPUStringView { + return .{ .data = s.ptr, .length = s.len }; +} + +// ── Main ────────────────────────────────────────────────────────────────────── +pub fn main() !void { + + // 1. Instance ────────────────────────────────────────────────────────────── + const instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse + return error.NoInstance; + defer c.wgpuInstanceRelease(instance); + + // 2. Adapter (async → poll) ──────────────────────────────────────────────── + var ctx = Ctx{}; + _ = c.wgpuInstanceRequestAdapter( + instance, + &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, + .{ .callback = onAdapter, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); // drive callbacks + const adapter = ctx.adapter orelse return error.NoAdapter; + defer c.wgpuAdapterRelease(adapter); + + // 3. 
Device ──────────────────────────────────────────────────────────────── + _ = c.wgpuAdapterRequestDevice(adapter, null, .{ .callback = onDevice, .userdata1 = &ctx }); + c.wgpuInstanceProcessEvents(instance); + const device = ctx.device orelse return error.NoDevice; + defer c.wgpuDeviceRelease(device); + + const queue = c.wgpuDeviceGetQueue(device); + defer c.wgpuQueueRelease(queue); + + // 4. Input data ──────────────────────────────────────────────────────────── + // mat_a[i] = i (0 … 15) + // mat_b[i] = 15 − i → every element of mat_c should equal 15 + var mat_a: [N]f32 = undefined; + var mat_b: [N]f32 = undefined; + for (0..N) |i| { + mat_a[i] = @floatFromInt(i); + mat_b[i] = @floatFromInt(N - 1 - i); + } + + // 5. GPU Buffers ─────────────────────────────────────────────────────────── + const buf_a = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, + .size = BUF_BYTES, + }) orelse return error.BufferA; + + const buf_b = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, + .size = BUF_BYTES, + }) orelse return error.BufferB; + + // buf_c: GPU-only result; staging: CPU-readable copy + const buf_c = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopySrc, + .size = BUF_BYTES, + }) orelse return error.BufferC; + + const buf_staging = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, + .size = BUF_BYTES, + }) orelse return error.BufferStaging; + + defer c.wgpuBufferRelease(buf_a); + defer c.wgpuBufferRelease(buf_b); + defer c.wgpuBufferRelease(buf_c); + defer c.wgpuBufferRelease(buf_staging); + + // Upload inputs + c.wgpuQueueWriteBuffer(queue, buf_a, 0, &mat_a, BUF_BYTES); + c.wgpuQueueWriteBuffer(queue, buf_b, 0, &mat_b, BUF_BYTES); + + // 6. 
Shader module ───────────────────────────────────────────────────────── + // ✅ New API (0.20+) + var wgsl_src = c.WGPUShaderSourceWGSL{ + .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, + .code = sv(SHADER), + }; + const shader = c.wgpuDeviceCreateShaderModule(device, &.{ + .nextInChain = @ptrCast(&wgsl_src), + }) orelse return error.Shader; + + // 7. Compute pipeline (layout auto-inferred from shader) ─────────────────── + // ✅ + const pipeline = c.wgpuDeviceCreateComputePipeline(device, &.{ + .compute = .{ + .module = shader, + .entryPoint = sv("main"), + }, + }) orelse return error.Pipeline; + defer c.wgpuComputePipelineRelease(pipeline); + + // 8. Bind group ──────────────────────────────────────────────────────────── + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const entries = [_]c.WGPUBindGroupEntry{ + .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = BUF_BYTES }, + .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = BUF_BYTES }, + .{ .binding = 2, .buffer = buf_c, .offset = 0, .size = BUF_BYTES }, + }; + const bind_group = c.wgpuDeviceCreateBindGroup(device, &.{ + .layout = bgl, + .entries = &entries, + .entryCount = entries.len, + }) orelse return error.BindGroup; + defer c.wgpuBindGroupRelease(bind_group); + + // 9. 
Encode compute pass + buffer copy ──────────────────────────────────── + const encoder = c.wgpuDeviceCreateCommandEncoder(device, null) orelse + return error.Encoder; + + const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null); + c.wgpuComputePassEncoderSetPipeline(pass, pipeline); + c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null); + // dispatch(1,1,1): one workgroup of size (4,4) covers the whole 4×4 matrix + c.wgpuComputePassEncoderDispatchWorkgroups(pass, 1, 1, 1); + c.wgpuComputePassEncoderEnd(pass); + c.wgpuComputePassEncoderRelease(pass); + + // Copy result buffer → CPU-readable staging buffer + c.wgpuCommandEncoderCopyBufferToBuffer(encoder, buf_c, 0, buf_staging, 0, BUF_BYTES); + + const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null); + defer c.wgpuCommandEncoderRelease(encoder); + defer c.wgpuCommandBufferRelease(cmdbuf); + + // 10. Submit ─────────────────────────────────────────────────────────────── + c.wgpuQueueSubmit(queue, 1, &cmdbuf); + + // 11. Map staging buffer back to CPU ────────────────────────────────────── + var mapped = false; + _ = c.wgpuBufferMapAsync( + buf_staging, + c.WGPUMapMode_Read, + 0, + BUF_BYTES, + .{ .callback = onMapped, .userdata1 = &mapped }, + ); + // Poll the device until the async map completes + while (!mapped) _ = c.wgpuDevicePoll(device, 1, null); + + const ptr: [*]const f32 = @ptrCast(@alignCast( + c.wgpuBufferGetConstMappedRange(buf_staging, 0, BUF_BYTES), + )); + const result = ptr[0..N]; + + // 12. 
Print ──────────────────────────────────────────────────────────────── + std.debug.print("\nmat_a + mat_b ({d}×{d}):\n", .{ ROWS, COLS }); + for (0..ROWS) |r| { + for (0..COLS) |col| + std.debug.print("{d:6.0}", .{result[r * COLS + col]}); + std.debug.print("\n", .{}); + } + // Expected output: every cell = 15.0 + + c.wgpuBufferUnmap(buf_staging); +} diff --git a/src/wgpu.zig b/src/wgpu.zig deleted file mode 100644 index 52480ce..0000000 --- a/src/wgpu.zig +++ /dev/null @@ -1,137 +0,0 @@ -const std = @import("std"); -const c = @cImport(@cInclude("wgpu.h")); - -/// Replace enum_WGPURequestAdapterStatus -pub const RequestAdapterStatus = enum { - Success, - CallbackCancelled, - Unavailable, - Error, - Force32, -}; - -pub const BufferUsage = enum(u64) { - None = 0, - MapRead = 1, // CPU can read after GPU finishes - MapWrite = 2, - CopySrc = 4, // can copy from GPU to staging. - CopyDst = 8, // CPU can write to it - Index = 16, - Vertex = 32, - Uniform = 64, - Storage = 128, - Indirect = 256, - QueryResolve = 512, -}; - -const Ctx = struct { - adapter: c.WGPUAdapter = null, - device: c.WGPUDevice = null, -}; - -fn onAdapter( - status: c.WGPURequestAdapterStatus, - adapter: c.WGPUAdapter, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestAdapterStatus_Success) { - std.log.err("Adapter request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.adapter = adapter; -} - -fn onDevice( - status: c.WGPURequestDeviceStatus, - device: c.WGPUDevice, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestDeviceStatus_Success) { - std.log.err("Device request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.device = device; - std.debug.print("{?}", .{device}); -} - -fn onMapped( - status: c.WGPUMapAsyncStatus, - _: c.WGPUStringView, - 
userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - const flag: *bool = @ptrCast(@alignCast(userdata1.?)); - flag.* = (status == c.WGPUMapAsyncStatus_Success); -} - -fn sv(s: []const u8) c.WGPUStringView { - return .{ .data = s.ptr, .length = s.len }; -} - -const AllocatorGPU = @This(); - -allocator: std.mem.Allocator, -instance: *c.struct_WGPUInstanceImpl, -adapter: *c.struct_WGPUAdapterImpl, -device: c.struct_WGPUDeviceImpl, -queue: c.struct_WGPUQueueImpl, -ctx: Ctx, - -buffers: std.AutoHashMap(*c.struct_WGPUBufferImpl, void), - -pub fn init(allocator: std.mem.Allocator) !AllocatorGPU { - var self: AllocatorGPU = undefined; - self.allocator = allocator; - self.ctx = .{}; - self.buffers = try .init(allocator); - - // 1. Instance ────────────────────────────────────────────────────────────── - self.instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse - return error.NoInstance; - - // 2. Adapter (async → poll) ──────────────────────────────────────────────── - _ = c.wgpuInstanceRequestAdapter( - self.instance, - &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, - .{ .callback = onAdapter, .userdata1 = &self.ctx }, - ); - c.wgpuInstanceProcessEvents(self.instance); // drive callbacks - self.adapter = self.ctx.adapter orelse return error.NoAdapter; - - // 3. 
Device ──────────────────────────────────────────────────────────────── - _ = c.wgpuAdapterRequestDevice(self.adapter, null, .{ .callback = onDevice, .userdata1 = &self.ctx }); - c.wgpuInstanceProcessEvents(self.instance); - self.device = self.ctx.device orelse return error.NoDevice; - - self.queue = c.wgpuDeviceGetQueue(self.device); - - return self; -} - -pub fn deinit(self: AllocatorGPU) void { - c.wgpuInstanceRelease(self.instance); - defer c.wgpuAdapterRelease(self.adapter); - defer c.wgpuDeviceRelease(self.device); - defer c.wgpuQueueRelease(self.queue); -} - -pub fn addBuff( - self: AllocatorGPU, - comptime T: type, - comptime len: comptime_int, - comptime opt: struct {}, -) !void { - self.buffers.put( - c.wgpuDeviceCreateBuffer(self.device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = len * @bitSizeOf(T), - }) orelse return error.Buffer, - {}, - ); -}