diff --git a/build.zig b/build.zig
index 158e945..3e81041 100644
--- a/build.zig
+++ b/build.zig
@@ -24,7 +24,7 @@ pub fn build(b: *std.Build) void {
 
     const exe = b.addExecutable(.{
         .root_module = b.createModule(.{
-            .root_source_file = b.path("src/main.zig"),
+            .root_source_file = b.path("src/bench.zig"),
             .link_libc = true,
             .target = target,
             .optimize = optimize,
@@ -57,5 +57,5 @@ pub fn build(b: *std.Build) void {
 
     const run = b.addRunArtifact(exe);
     run.step.dependOn(b.getInstallStep());
-    b.step("run", "Build and run").dependOn(&run.step);
+    b.step("bench", "Benchmark a simple add vector").dependOn(&run.step);
 }
diff --git a/src/Vec.zig b/src/Vec.zig
index 8c4370b..345cb94 100644
--- a/src/Vec.zig
+++ b/src/Vec.zig
@@ -185,6 +185,7 @@ fn submitPass(
     defer c.wgpuCommandEncoderRelease(enc);
     defer c.wgpuCommandBufferRelease(cmd);
     c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
+    _ = c.wgpuDevicePoll(gloc.device.device, 1, null); // Wait for it to be done
 }
 
 fn ceilDiv(n: usize, d: usize) usize {
diff --git a/src/bench.zig b/src/bench.zig
new file mode 100644
index 0000000..c1092ee
--- /dev/null
+++ b/src/bench.zig
@@ -0,0 +1,133 @@
+const std = @import("std");
+const GpuDevice = @import("GpuDevice.zig");
+const GpuAllocator = @import("GpuAllocator.zig");
+const GpuPipeline = @import("GpuPipeline.zig");
+const Vec = @import("Vec.zig");
+
+/// Benchmarks element-wise f32 vector addition on the GPU over a range of sizes.
+/// For every size, the minimum time over `iterations` runs is reported for three
+/// phases: upload (`Vec.initLoad` twice), compute (`run`), and readback (`read`).
+/// Output goes through `std.debug.print`; errors from any `try` call are propagated.
+pub fn main(init: std.process.Init) !void {
+    const device = try GpuDevice.init();
+    defer device.deinit();
+
+    var gloc = try GpuAllocator.init(init.gpa, device);
+    defer gloc.deinit();
+
+    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
+    defer add_pip.deinit();
+
+    const allocator = init.gpa;
+
+    // Unit conversions shared by the three phases.
+    const convert = struct {
+        /// Nanoseconds to milliseconds.
+        fn toMs(ns: u64) f64 {
+            return @as(f64, @floatFromInt(ns)) / 1_000_000.0;
+        }
+
+        /// Decimal GB/s for `bytes` moved in `ns` nanoseconds.
+        fn toGbPerSec(bytes: f64, ns: u64) f64 {
+            return (bytes / 1_000_000_000.0) / (@as(f64, @floatFromInt(ns)) / 1_000_000_000.0);
+        }
+    };
+
+    // Warm-up: one single-element add, upload through readback, before anything is timed.
+    {
+        var warmup_a = [_]f32{1.0};
+        var warmup_b = [_]f32{1.0};
+        const wa = try Vec.initLoad(&gloc, &warmup_a);
+        defer wa.deinit();
+        const wb = try Vec.initLoad(&gloc, &warmup_b);
+        defer wb.deinit();
+        const wsum = try wa.run(&gloc, wb, add_pip);
+        defer wsum.deinit();
+        const wout = try wsum.read(&gloc, allocator);
+        defer allocator.free(wout);
+    }
+
+    // Element counts to benchmark (each element is one f32).
+    const sizes = [_]usize{
+        1,
+        1024,
+        4096,
+        16384,
+        65536,
+        262144,
+        1024 * 1024,
+        4 * 1024 * 1024,
+        4 * 4 * 1024 * 1024,
+        4 * 4 * 4 * 1024 * 1024,
+        4 * 4 * 4 * 4 * 1024 * 1024,
+    };
+
+    const iterations = 5;
+
+    std.debug.print("\n| Size (MB) | Phase             | Time (ms)  |   GB/s   |\n", .{});
+    std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{});
+
+    for (sizes) |size| {
+        // Host input buffers are built once per size, outside the timed iterations.
+        const data_a = try allocator.alloc(f32, size);
+        defer allocator.free(data_a);
+        const data_b = try allocator.alloc(f32, size);
+        defer allocator.free(data_b);
+
+        for (data_a, data_b, 0..) |*x, *y, i| {
+            x.* = @floatFromInt(i);
+            y.* = @floatFromInt(size - 1 - i);
+        }
+
+        // Best (minimum) time per phase across the iterations.
+        var min_alloc_ns: u64 = std.math.maxInt(u64);
+        var min_transfer_ns: u64 = std.math.maxInt(u64);
+        var min_compute_ns: u64 = std.math.maxInt(u64);
+
+        for (0..iterations) |_| {
+            // Phase 1: upload both operands with `Vec.initLoad`. Device allocation and the
+            // host->device copy are timed together (presumably both happen inside initLoad).
+            const alloc_start = std.Io.Clock.awake.now(init.io);
+
+            const a = try Vec.initLoad(&gloc, data_a);
+            defer a.deinit();
+            const b = try Vec.initLoad(&gloc, data_b);
+            defer b.deinit();
+
+            const alloc_ns: u64 = @intCast(alloc_start.durationTo(std.Io.Clock.awake.now(init.io)).toNanoseconds());
+            min_alloc_ns = @min(min_alloc_ns, alloc_ns);
+
+            // Phase 2: the add pass. NOTE(review): the figure is only meaningful if `run` blocks
+            // until the GPU finishes (e.g. via a device poll after submit) - confirm in Vec.zig.
+            const compute_start = std.Io.Clock.awake.now(init.io);
+
+            const sum = try a.run(&gloc, b, add_pip);
+            defer sum.deinit();
+
+            const compute_ns: u64 = @intCast(compute_start.durationTo(std.Io.Clock.awake.now(init.io)).toNanoseconds());
+            min_compute_ns = @min(min_compute_ns, compute_ns);
+
+            // Phase 3: read the result back into host memory (device->host).
+            const transfer_start = std.Io.Clock.awake.now(init.io);
+
+            const out = try sum.read(&gloc, allocator);
+            defer allocator.free(out);
+
+            const transfer_ns: u64 = @intCast(transfer_start.durationTo(std.Io.Clock.awake.now(init.io)).toNanoseconds());
+            min_transfer_ns = @min(min_transfer_ns, transfer_ns);
+        }
+
+        const element_bytes = @as(f64, @floatFromInt(size)) * @as(f64, @floatFromInt(@sizeOf(f32)));
+        const mb = element_bytes / (1024.0 * 1024.0);
+
+        // Bytes counted per phase: upload 2 buffers, compute 2 reads + 1 write, readback 1 buffer.
+        const alloc_gb_s = convert.toGbPerSec(element_bytes * 2.0, min_alloc_ns);
+        const compute_gb_s = convert.toGbPerSec(element_bytes * 3.0, min_compute_ns);
+        const transfer_gb_s = convert.toGbPerSec(element_bytes * 1.0, min_transfer_ns);
+
+        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, convert.toMs(min_alloc_ns), alloc_gb_s });
+        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} |\n", .{ convert.toMs(min_compute_ns), compute_gb_s });
+        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ convert.toMs(min_transfer_ns), transfer_gb_s });
+        std.debug.print("|-----------|-------------------|------------|---------:|\n", .{});
+    }
+}
diff --git a/src/main.zig b/src/main.zig
deleted file mode 100644
index 82ff477..0000000
--- a/src/main.zig
+++ /dev/null
@@ -1,75 +0,0 @@
-const std = @import("std");
-const GpuDevice = @import("GpuDevice.zig");
-const GpuAllocator = @import("GpuAllocator.zig");
-const GpuPipeline = @import("GpuPipeline.zig");
-const Vec = @import("Vec.zig");
-
-pub fn main(init: std.process.Init) !void {
-    const device = try GpuDevice.init();
-    defer device.deinit();
-
-    var gloc = try GpuAllocator.init(init.gpa, device);
-    defer gloc.deinit();
-
-    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
-    defer add_pip.deinit();
-
-    // Define the sizes you want to benchmark
-    const sizes = [_]usize{
-        1,
-        1024,
-        4096,
-        16384,
-        65536,
-        262144,
-        1024 * 1024,
-        // 4 * 1024 * 1024,
-        // 4 * 4 * 1024 * 1024,
-        // 4 * 4 * 4 * 1024 * 1024,
-        // 4 * 4 * 4 * 4 * 1024 * 1024,
-        // 4 * 4 * 4 * 4 * 2 * 1024 * 1024,
-    };
-
-    // Print table header
-    std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
-    std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
-
-    const allocator = init.gpa;
-
-    for (sizes) |size| {
-        // Dynamically allocate buffers for the current size
-        var data_a = try allocator.alloc(f32, size);
-        defer allocator.free(data_a);
-        var data_b = try allocator.alloc(f32, size);
-        defer allocator.free(data_b);
-
-        // Populate data
-        for (0..size) |i| {
-            data_a[i] = @floatFromInt(i);
-            data_b[i] = @floatFromInt(size - 1 - i);
-        }
-
-        // Start timing the GPU operations
-        const start = std.Io.Clock.awake.now(init.io);
-
-        const a = try Vec.initLoad(&gloc, data_a);
-        defer a.deinit();
-        const b = try Vec.initLoad(&gloc, data_b);
-        defer b.deinit();
-
-        // a + b
-        const sum = try a.run(&gloc, b, add_pip);
-        defer sum.deinit();
-
-        const out = try sum.read(&gloc, allocator);
-        defer allocator.free(out);
-
-        const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
-        const ns = duration.toNanoseconds();
-        const ms = duration.toMilliseconds();
-        const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
-
-        // Print table row
-        std.debug.print("| {d:12} | {d:8.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
-    }
-}