7 changed files with 133 additions and 187 deletions
--- a/build.zig.zon
+++ b/build.zig.zon
@ -31,7 +31,45 @@
    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
    // Once all dependencies are fetched, `zig build` no longer requires
    // internet connectivity.
-    .dependencies = .{},
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL. If the contents of a URL change this will result in a hash mismatch
+        //    // which will prevent zig from using it.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+        //
+        //    // When this is set to `true`, a package is declared to be lazily
+        //    // fetched. This makes the dependency only get fetched if it is
+        //    // actually used.
+        //    .lazy = false,
+        //},
+    },
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package. Only files listed here will remain on disk
+    // when using the zig package manager. As a rule of thumb, one should list
+    // files required for compilation plus any license(s).
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
    .paths = .{
        "build.zig",
        "build.zig.zon",
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@ -4,19 +4,16 @@ const c = @import("c.zig").c;

 const GpuAllocator = @This();

-cpu_allocator: std.mem.Allocator,
 instance: c.WGPUInstance,
 adapter: c.WGPUAdapter,
 device: c.WGPUDevice,
 queue: c.WGPUQueue,

-tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
-
 // Lazily created, cached for lifetime of allocator
 _pip_add: c.WGPUComputePipeline = null,
 _pip_scale: c.WGPUComputePipeline = null,

-pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator {
+pub fn init() !GpuAllocator {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
@ -41,54 +38,22 @@ pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator {
    const device = ctx.device orelse return error.NoDevice;

    return .{
-        .cpu_allocator = cpu_allocator,
        .instance = instance,
        .adapter = adapter,
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
-        .tracked_buffers = .init(cpu_allocator),
    };
 }

 pub fn deinit(self: *GpuAllocator) void {
    if (self._pip_add) |p| c.wgpuComputePipelineRelease(p);
    if (self._pip_scale) |p| c.wgpuComputePipelineRelease(p);
-
-    var it = self.tracked_buffers.keyIterator();
-    while (it.next()) |buf_ptr| {
-        const buf = buf_ptr.*;
-        c.wgpuBufferDestroy(buf);
-        c.wgpuBufferRelease(buf);
-    }
-    self.tracked_buffers.deinit();
-
    c.wgpuQueueRelease(self.queue);
    c.wgpuDeviceRelease(self.device);
    c.wgpuAdapterRelease(self.adapter);
    c.wgpuInstanceRelease(self.instance);
 }

-pub fn registerBuffer(
-    self: *GpuAllocator,
-    bytes: u64,
-    usage: c.WGPUBufferUsage,
-) !c.WGPUBuffer {
-    const buf = c.wgpuDeviceCreateBuffer(self.device, &.{
-        .usage = usage,
-        .size = bytes,
-    }) orelse return error.BufferAlloc;
-
-    try self.tracked_buffers.put(buf, {});
-    return buf;
-}
-
-pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
-    if (self.tracked_buffers.remove(buf)) {
-        c.wgpuBufferDestroy(buf);
-        c.wgpuBufferRelease(buf);
-    }
-}
-
 // ── Internal ─────────────────────────────────────────────────────────────

 pub fn makeBuffer(
@ -152,6 +117,7 @@ fn onDevice(
    }
    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
    ctx.device = device;
+    std.log.debug("wgpu device acquired: {?}", .{device}); // diagnostic only; std.log terminates the line and honours the configured log level
 }

 fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@ -1,47 +0,0 @@
-const std = @import("std");
-const c = @import("c.zig").c;
-const GpuAllocator = @import("GpuAllocator.zig");
-
-const GpuBuffer = @This();
-
-raw: c.WGPUBuffer,
-size: u64,
-usage: c.WGPUBufferUsage,
-gloc: *GpuAllocator,
-
-/// Allocates the underlying WebGPU handle and registers it to the parent GpuAllocator
-pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffer {
-    const raw_handle = try gloc.registerBuffer(bytes, usage);
-    return .{
-        .raw = raw_handle,
-        .size = bytes,
-        .usage = usage,
-        .gloc = gloc,
-    };
-}
-
-/// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources
-pub fn deinit(self: GpuBuffer) void {
-    self.gloc.unregisterAndDestroyBuffer(self.raw);
-}
-
-/// Native mapAsync wrapper
-pub fn mapAsync(
-    self: GpuBuffer,
-    mode: c.WGPUMapMode,
-    offset: u64,
-    size: u64,
-    callback_info: c.WGPUBufferMapCallbackInfo,
-) void {
-    _ = c.wgpuBufferMapAsync(self.raw, mode, offset, size, callback_info);
-}
-
-/// Native getConstMappedRange wrapper
-pub fn getConstMappedRange(self: GpuBuffer, offset: u64, size: u64) ?*const anyopaque {
-    return c.wgpuBufferGetConstMappedRange(self.raw, offset, size);
-}
-
-/// Native unmap wrapper
-pub fn unmap(self: GpuBuffer) void {
-    c.wgpuBufferUnmap(self.raw);
-}
--- a/src/Mat.zig
+++ b/src/Mat.zig
@ -1,49 +1,51 @@
 const std = @import("std");
 const c = @import("c.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
-const GpuBuffer = @import("GpuBuffer.zig");

 const Mat = @This();

-buf: GpuBuffer,
-rows: usize,
-cols: usize,
+buf: c.WGPUBuffer,
+rows: u32,
+cols: u32,

+// ── Lifecycle ─────────────────────────────────────────────────────────────
+
+/// Create a `rows` x `cols` matrix on the GPU from `data`; asserts `data.len == rows * cols`. Release it with `deinit`.
 pub fn load(
     gloc: *GpuAllocator,
     data: []const f32,
-    rows: usize,
-    cols: usize,
+    rows: u32,
+    cols: u32,
 ) !Mat {
     std.debug.assert(data.len == @as(usize, rows) * cols);
     const bytes = data.len * @sizeOf(f32);
-
-    // Uses structural constructor initialization
-    const buf = try GpuBuffer.init(
-        gloc,
+    const buf = try gloc.makeBuffer( // buffer is sized from `data`, not from rows * cols
         bytes,
-        c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
+        c.WGPUBufferUsage_Storage | // bound as a storage buffer by the compute passes
+            c.WGPUBufferUsage_CopyDst | // destination of the wgpuQueueWriteBuffer upload below
+            c.WGPUBufferUsage_CopySrc, // source of the copy into the staging buffer in `read`
     );
-
-    c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes);
+    c.wgpuQueueWriteBuffer(gloc.queue, buf, 0, data.ptr, bytes); // upload all of `data` starting at buffer offset 0
     return .{ .buf = buf, .rows = rows, .cols = cols };
 }

-pub fn zeros(gloc: *GpuAllocator, rows: usize, cols: usize) !Mat {
+/// Allocate a `rows` x `cols` f32 buffer without uploading anything (zero contents presumably come from the backend's buffer initialisation — TODO confirm). Release it with `deinit`.
+pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat {
     const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
-    const buf = try GpuBuffer.init(
-        gloc,
+    const buf = try gloc.makeBuffer( // byte size was computed in u64 above, so it cannot wrap for u32 dimensions
         bytes,
-        c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
+        c.WGPUBufferUsage_Storage | // bound as a storage buffer by the compute passes
+            c.WGPUBufferUsage_CopyDst | // same usage set as `load`, although nothing is written here
+            c.WGPUBufferUsage_CopySrc, // source of the copy into the staging buffer in `read`
     );
     return .{ .buf = buf, .rows = rows, .cols = cols };
 }

 pub fn deinit(self: Mat) void {
-    self.buf.deinit(); // Automatically cleans tracking map & releases GPU memory
+    c.wgpuBufferRelease(self.buf); // drop this Mat's reference to its GPU buffer handle; no explicit wgpuBufferDestroy is issued
 }

-pub fn len(self: Mat) usize {
+pub fn len(self: Mat) u32 { // element count (rows * cols); NOTE(review): the product is formed in u32 and overflows if it exceeds maxInt(u32), while `load`/`zeros` size the buffer in wider arithmetic
     return self.rows * self.cols;
 }

@ -51,6 +53,7 @@ pub fn byteSize(self: Mat) u64 {
    return @as(u64, self.len()) * @sizeOf(f32);
 }

+/// Element-wise add. Shapes must match. Returns new Mat (caller owns).
 pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat {
    std.debug.assert(self.rows == other.rows and self.cols == other.cols);

@ -63,6 +66,7 @@ pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat {
    return result;
 }

+/// Multiply every element by `scalar` on the GPU and return the product as a new Mat (caller must `deinit` it).
 pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat {
     const result = try Mat.zeros(gloc, self.rows, self.cols);
     errdefer result.deinit();
@ -70,46 +74,49 @@ pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat {
     const bytes = self.byteSize();
     const n = self.len();

-    const uni_buf = try GpuBuffer.init(
-        gloc,
+    // Pass `scalar` to the shader through a 4-byte uniform buffer (binding 2); our reference is dropped on return
+    const uni_buf = try gloc.makeBuffer(
         @sizeOf(f32),
         c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst,
     );
-    defer uni_buf.deinit(); // Gracefully deinitializes locally
-
-    c.wgpuQueueWriteBuffer(gloc.queue, uni_buf.raw, 0, &scalar, @sizeOf(f32));
+    defer c.wgpuBufferRelease(uni_buf);
+    c.wgpuQueueWriteBuffer(gloc.queue, uni_buf, 0, &scalar, @sizeOf(f32));

     const pipeline = try gloc.pipScale();
     const entries = [_]c.WGPUBindGroupEntry{
-        .{ .binding = 0, .buffer = self.buf.raw, .offset = 0, .size = bytes },
-        .{ .binding = 1, .buffer = result.buf.raw, .offset = 0, .size = bytes },
-        .{ .binding = 2, .buffer = uni_buf.raw, .offset = 0, .size = @sizeOf(f32) },
+        .{ .binding = 0, .buffer = self.buf, .offset = 0, .size = bytes },
+        .{ .binding = 1, .buffer = result.buf, .offset = 0, .size = bytes },
+        .{ .binding = 2, .buffer = uni_buf, .offset = 0, .size = @sizeOf(f32) },
     };
     try submitPass(gloc, pipeline, &entries, n);

     return result;
 }

+/// Read GPU buffer back to CPU. `out.len` must be >= rows*cols.
 pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void {
    std.debug.assert(out.len >= self.len());
    const bytes = self.byteSize();

-    const staging = try GpuBuffer.init(
-        gloc,
+    const staging = try gloc.makeBuffer(
        bytes,
        c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
    );
-    defer staging.deinit();
+    defer c.wgpuBufferRelease(staging);

-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder;
-    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
+    // Copy result → staging
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse
+        return error.Encoder;
+    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf, 0, staging, 0, bytes);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandEncoderRelease(enc);
    defer c.wgpuCommandBufferRelease(cmd);
    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);

+    // Map and copy to slice
    var mapped = false;
-    staging.mapAsync(
+    _ = c.wgpuBufferMapAsync(
+        staging,
        c.WGPUMapMode_Read,
        0,
        bytes,
@ -118,10 +128,10 @@ pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void {
    while (!mapped) gloc.poll();

    const ptr: [*]const f32 = @ptrCast(@alignCast(
-        staging.getConstMappedRange(0, bytes),
+        c.wgpuBufferGetConstMappedRange(staging, 0, bytes),
    ));
    @memcpy(out[0..self.len()], ptr[0..self.len()]);
-    staging.unmap();
+    c.wgpuBufferUnmap(staging);
 }

 fn onMapped(
@ -140,19 +150,17 @@ fn onMapped(
 fn dispatch2in1out(
     gloc: *GpuAllocator,
     pipeline: c.WGPUComputePipeline,
-    buf_a: GpuBuffer,
-    buf_b: GpuBuffer,
-    buf_out: GpuBuffer,
+    buf_a: c.WGPUBuffer,
+    buf_b: c.WGPUBuffer,
+    buf_out: c.WGPUBuffer,
     bytes: u64,
-    n: usize,
+    n: u32,
 ) !void {
-    const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
-    defer c.wgpuBindGroupLayoutRelease(bgl);
-
+    // Bindings 0 and 1 are the inputs, 2 is the output; submitPass fetches the bind-group layout from `pipeline` itself.
     const entries = [_]c.WGPUBindGroupEntry{
-        .{ .binding = 0, .buffer = buf_a.raw, .offset = 0, .size = bytes },
-        .{ .binding = 1, .buffer = buf_b.raw, .offset = 0, .size = bytes },
-        .{ .binding = 2, .buffer = buf_out.raw, .offset = 0, .size = bytes },
+        .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = bytes },
+        .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = bytes },
+        .{ .binding = 2, .buffer = buf_out, .offset = 0, .size = bytes },
     };
     try submitPass(gloc, pipeline, &entries, n);
 }
@ -162,7 +172,7 @@ fn submitPass(
    gloc: *GpuAllocator,
    pipeline: c.WGPUComputePipeline,
    entries: []const c.WGPUBindGroupEntry,
-    n: usize,
+    n: u32,
 ) !void {
    const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
    defer c.wgpuBindGroupLayoutRelease(bgl);
@ -179,7 +189,7 @@ fn submitPass(
    const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
    c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
    c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
-    c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(ceilDiv(n, 256)), 1, 1);
+    c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1);
    c.wgpuComputePassEncoderEnd(pass);
    c.wgpuComputePassEncoderRelease(pass);

@ -189,6 +199,8 @@ fn submitPass(
    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
 }

-fn ceilDiv(n: usize, d: usize) usize {
-    return (n + d - 1) / d;
+/// Integer ceiling of `n / d`; `d` must be non-zero.
+/// Avoids forming `n + d - 1`, which overflows u32 once `n` exceeds maxInt(u32) - (d - 1).
+fn ceilDiv(n: u32, d: u32) u32 {
+    return n / d + @intFromBool(n % d != 0);
 }
--- a/src/main.zig
+++ b/src/main.zig
@ -2,65 +2,43 @@ const std = @import("std");
 const GpuAllocator = @import("GpuAllocator.zig");
 const Mat = @import("Mat.zig");

-pub fn main(init: std.process.Init) !void {
-    var gloc = try GpuAllocator.init(init.gpa);
+pub fn main() !void { // Demo: computes a + b and (a + b) * 2 for two 4x4 matrices through Mat, then prints both results
+    var gloc = try GpuAllocator.init();
    defer gloc.deinit();

-    // Define the sizes you want to benchmark
-    const sizes = [_]usize{ 1, 1024, 4096, 16384, 65536, 262144, 1024 * 1024, 4 * 1024 * 1024 };
-
-    // Print table header
-    std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
-    std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
-
-    const allocator = init.gpa;
-
-    for (sizes) |size| {
-        // Dynamically allocate buffers for the current size
-        var data_a = try allocator.alloc(f32, size);
-        defer allocator.free(data_a);
-        var data_b = try allocator.alloc(f32, size);
-        defer allocator.free(data_b);
-
-        // Populate data
-        for (0..size) |i| {
-            data_a[i] = @floatFromInt(i);
-            data_b[i] = @floatFromInt(size - 1 - i);
-        }
-
-        // Start timing the GPU operations
-        const start = std.Io.Clock.awake.now(init.io);
-
-        const a = try Mat.load(&gloc, data_a, size, 1);
-        defer a.deinit();
-        const b = try Mat.load(&gloc, data_b, size, 1);
-        defer b.deinit();
-
-        // a + b
-        const sum = try a.add(&gloc, b);
-        defer sum.deinit();
-
-        // sum * 2
-        const scaled = try sum.scale(&gloc, 2.0);
-        defer scaled.deinit();
-
-        // Read back (allocating dynamically for read-back buffers too)
-        const out_sum = try allocator.alloc(f32, size);
-        defer allocator.free(out_sum);
-        const out_scaled = try allocator.alloc(f32, size);
-        defer allocator.free(out_scaled);
-
-        try sum.read(&gloc, out_sum);
-        try scaled.read(&gloc, out_scaled);
-
-        const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
-        const ns = duration.toNanoseconds();
-        const ms = @as(f64, @floatFromInt(ns)) / 1_000_000.0;
-        const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
-
-        // Print table row
-        std.debug.print("| {d:12} | {d:8.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
+    // Input data: a[i] = i, b[i] = 15 - i  →  add should give all 15s
+    var data_a: [16]f32 = undefined;
+    var data_b: [16]f32 = undefined;
+    for (0..16) |i| {
+        data_a[i] = @floatFromInt(i);
+        data_b[i] = @floatFromInt(15 - i);
    }
+
+    const a = try Mat.load(&gloc, &data_a, 4, 4);
+    defer a.deinit();
+    const b = try Mat.load(&gloc, &data_b, 4, 4);
+    defer b.deinit();
+
+    // a + b
+    const sum = try a.add(&gloc, b);
+    defer sum.deinit();
+
+    // sum * 2
+    const scaled = try sum.scale(&gloc, 2.0);
+    defer scaled.deinit();
+
+    // Read both results back into host arrays
+    var out_sum: [16]f32 = undefined;
+    var out_scaled: [16]f32 = undefined;
+    try sum.read(&gloc, &out_sum);
+    try scaled.read(&gloc, &out_scaled);
+
+    // Print each result under a header stating its expected value
+    std.debug.print("\na + b  (expect all 15):\n", .{});
+    printMat(&out_sum, 4, 4);
+
+    std.debug.print("\n(a + b) * 2  (expect all 30):\n", .{});
+    printMat(&out_scaled, 4, 4);
 }

 fn printMat(data: []const f32, rows: u32, cols: u32) void {
--- a/src/reference.zig
+++ b/src/reference.zig
@ -65,6 +65,7 @@ fn onDevice(
    }
    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
    ctx.device = device;
+    std.log.debug("wgpu device acquired: {?}", .{device}); // diagnostic only; std.log terminates the line and honours the configured log level
 }

 fn onMapped(
--- a/src/shaders.zig
+++ b/src/shaders.zig
@ -3,7 +3,7 @@ pub const SHADER_ADD =
    \\@group(0) @binding(1) var<storage, read>       b : array<f32>;
    \\@group(0) @binding(2) var<storage, read_write> out : array<f32>;
    \\
-    \\@compute @workgroup_size(256)
+    \\@compute @workgroup_size(64)
    \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
    \\    let i = gid.x;
    \\    if (i < arrayLength(&out)) {
@ -18,7 +18,7 @@ pub const SHADER_SCALE =
    \\@group(0) @binding(1) var<storage, read_write> out : array<f32>;
    \\@group(0) @binding(2) var<uniform>             u   : Uniforms;
    \\
-    \\@compute @workgroup_size(256)
+    \\@compute @workgroup_size(64)
    \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
    \\    let i = gid.x;
    \\    if (i < arrayLength(&out)) {