diff --git a/src/GpuAllocator.zig b/src/GpuAllocator.zig
index 3ace41c..506a314 100644
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@@ -152,7 +152,6 @@ fn onDevice(
     }
     const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
     ctx.device = device;
-    std.debug.print("{?}", .{device});
 }
 
 fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
diff --git a/src/Mat.zig b/src/Mat.zig
index 0e186ca..1ddc2ee 100644
--- a/src/Mat.zig
+++ b/src/Mat.zig
@@ -6,14 +6,14 @@ const GpuBuffer = @import("GpuBuffer.zig");
 const Mat = @This();
 
 buf: GpuBuffer,
-rows: u32,
-cols: u32,
+rows: usize,
+cols: usize,
 
 pub fn load(
     gloc: *GpuAllocator,
     data: []const f32,
-    rows: u32,
-    cols: u32,
+    rows: usize,
+    cols: usize,
 ) !Mat {
     std.debug.assert(data.len == @as(usize, rows) * cols);
     const bytes = data.len * @sizeOf(f32);
@@ -29,7 +29,7 @@ pub fn load(
     return .{ .buf = buf, .rows = rows, .cols = cols };
 }
 
-pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat {
+pub fn zeros(gloc: *GpuAllocator, rows: usize, cols: usize) !Mat {
     const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
     const buf = try GpuBuffer.init(
         gloc,
@@ -43,7 +43,7 @@ pub fn deinit(self: Mat) void {
     self.buf.deinit(); // Automatically cleans tracking map & releases GPU memory
 }
 
-pub fn len(self: Mat) u32 {
+pub fn len(self: Mat) usize {
     return self.rows * self.cols;
 }
 
@@ -144,7 +144,7 @@ fn dispatch2in1out(
     buf_b: GpuBuffer,
     buf_out: GpuBuffer,
     bytes: u64,
-    n: u32,
+    n: usize,
 ) !void {
     const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
     defer c.wgpuBindGroupLayoutRelease(bgl);
@@ -162,7 +162,7 @@ fn submitPass(
     gloc: *GpuAllocator,
     pipeline: c.WGPUComputePipeline,
     entries: []const c.WGPUBindGroupEntry,
-    n: u32,
+    n: usize,
 ) !void {
     const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
     defer c.wgpuBindGroupLayoutRelease(bgl);
@@ -179,7 +179,7 @@ fn submitPass(
     const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
     c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
     c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
-    c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1);
+    c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(ceilDiv(n, 256)), 1, 1); // 256 mirrors @workgroup_size(256) in shaders.zig; change both together
     c.wgpuComputePassEncoderEnd(pass);
     c.wgpuComputePassEncoderRelease(pass);
 
@@ -189,6 +189,9 @@ fn submitPass(
     c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
 }
 
-fn ceilDiv(n: u32, d: u32) u32 {
-    return (n + d - 1) / d;
+/// Returns `n / d` rounded up to the next whole number; `d` must be non-zero.
+/// Built from the quotient and remainder so that, unlike `(n + d - 1) / d`,
+/// no intermediate sum can overflow when `n` is close to the `usize` maximum.
+fn ceilDiv(n: usize, d: usize) usize {
+    return n / d + @intFromBool(n % d != 0);
 }
diff --git a/src/main.zig b/src/main.zig
index 0fdd20e..8c38c89 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -6,39 +6,63 @@ pub fn main(init: std.process.Init) !void {
     var gloc = try GpuAllocator.init(init.gpa);
     defer gloc.deinit();
 
-    // Input data: a[i] = i, b[i] = 15 - i  →  add should give all 15s
-    var data_a: [16]f32 = undefined;
-    var data_b: [16]f32 = undefined;
-    for (0..16) |i| {
-        data_a[i] = @floatFromInt(i);
-        data_b[i] = @floatFromInt(15 - i);
+    // Element counts to benchmark; each one is loaded as a `size` x 1 matrix.
+    const sizes = [_]usize{ 1, 1024, 4096, 16384, 65536, 262144, 1024 * 1024, 4 * 1024 * 1024 };
+
+    // Table header; the row format below pads every cell to these column widths.
+    std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
+    std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
+
+    const allocator = init.gpa;
+
+    for (sizes) |size| {
+        // Host-side input buffers. The slices are only written through, never
+        // re-bound, so they are declared const.
+        const data_a = try allocator.alloc(f32, size);
+        defer allocator.free(data_a);
+        const data_b = try allocator.alloc(f32, size);
+        defer allocator.free(data_b);
+
+        // a[i] = i, b[i] = size - 1 - i  ->  add should give size - 1 everywhere
+        for (0..size) |i| {
+            data_a[i] = @floatFromInt(i);
+            data_b[i] = @floatFromInt(size - 1 - i);
+        }
+
+        // The timed region spans both Mat.load calls, add, scale, the read-back
+        // allocations and both reads, i.e. everything up to the second clock sample.
+        const start = std.Io.Clock.awake.now(init.io);
+
+        const a = try Mat.load(&gloc, data_a, size, 1);
+        defer a.deinit();
+        const b = try Mat.load(&gloc, data_b, size, 1);
+        defer b.deinit();
+
+        // a + b
+        const sum = try a.add(&gloc, b);
+        defer sum.deinit();
+
+        // sum * 2
+        const scaled = try sum.scale(&gloc, 2.0);
+        defer scaled.deinit();
+
+        // Host-side destinations for the read-back of both results.
+        const out_sum = try allocator.alloc(f32, size);
+        defer allocator.free(out_sum);
+        const out_scaled = try allocator.alloc(f32, size);
+        defer allocator.free(out_scaled);
+
+        try sum.read(&gloc, out_sum);
+        try scaled.read(&gloc, out_scaled);
+
+        const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
+        const ns = duration.toNanoseconds();
+        const ms = @as(f64, @floatFromInt(ns)) / 1_000_000.0;
+        const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
+
+        // Cell widths (13, 9, 9, 9) match the header text so the columns line up.
+        std.debug.print("| {d:13} | {d:9.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
     }
-
-    const a = try Mat.load(&gloc, &data_a, 4, 4);
-    defer a.deinit();
-    const b = try Mat.load(&gloc, &data_b, 4, 4);
-    defer b.deinit();
-
-    // a + b
-    const sum = try a.add(&gloc, b);
-    defer sum.deinit();
-
-    // sum * 2
-    const scaled = try sum.scale(&gloc, 2.0);
-    defer scaled.deinit();
-
-    // Read back
-    var out_sum: [16]f32 = undefined;
-    var out_scaled: [16]f32 = undefined;
-    try sum.read(&gloc, &out_sum);
-    try scaled.read(&gloc, &out_scaled);
-
-    // Print
-    std.debug.print("\na + b  (expect all 15):\n", .{});
-    printMat(&out_sum, 4, 4);
-
-    std.debug.print("\n(a + b) * 2  (expect all 30):\n", .{});
-    printMat(&out_scaled, 4, 4);
 }
 
 fn printMat(data: []const f32, rows: u32, cols: u32) void {
diff --git a/src/reference.zig b/src/reference.zig
index 6c80dfa..4392f24 100644
--- a/src/reference.zig
+++ b/src/reference.zig
@@ -65,7 +65,6 @@ fn onDevice(
     }
     const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
     ctx.device = device;
-    std.debug.print("{?}", .{device});
 }
 
 fn onMapped(
diff --git a/src/shaders.zig b/src/shaders.zig
index 730e26f..da489fd 100644
--- a/src/shaders.zig
+++ b/src/shaders.zig
@@ -3,7 +3,7 @@ pub const SHADER_ADD =
     \\@group(0) @binding(1) var<storage, read>       b : array<f32>;
     \\@group(0) @binding(2) var<storage, read_write> out : array<f32>;
     \\
-    \\@compute @workgroup_size(64)
+    \\@compute @workgroup_size(256)
     \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
     \\    let i = gid.x;
     \\    if (i < arrayLength(&out)) {
@@ -18,7 +18,7 @@ pub const SHADER_SCALE =
     \\@group(0) @binding(1) var<storage, read_write> out : array<f32>;
     \\@group(0) @binding(2) var<uniform>             u   : Uniforms;
     \\
-    \\@compute @workgroup_size(64)
+    \\@compute @workgroup_size(256)
     \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
     \\    let i = gid.x;
     \\    if (i < arrayLength(&out)) {