diff --git a/build.zig b/build.zig
index 158e945..3e81041 100644
--- a/build.zig
+++ b/build.zig
@@ -24,7 +24,7 @@ pub fn build(b: *std.Build) void {
 
     const exe = b.addExecutable(.{
         .root_module = b.createModule(.{
-            .root_source_file = b.path("src/main.zig"),
+            .root_source_file = b.path("src/bench.zig"),
             .link_libc = true,
             .target = target,
             .optimize = optimize,
@@ -57,5 +57,5 @@ pub fn build(b: *std.Build) void {
 
     const run = b.addRunArtifact(exe);
     run.step.dependOn(b.getInstallStep());
-    b.step("run", "Build and run").dependOn(&run.step);
+    b.step("bench", "Benchmark a simple add vector").dependOn(&run.step);
 }
diff --git a/src/Vec.zig b/src/Vec.zig
index 8c4370b..345cb94 100644
--- a/src/Vec.zig
+++ b/src/Vec.zig
@@ -185,6 +185,7 @@ fn submitPass(
     defer c.wgpuCommandEncoderRelease(enc);
     defer c.wgpuCommandBufferRelease(cmd);
     c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
+    _ = c.wgpuDevicePoll(gloc.device.device, 1, null); // Wait for it to be done
 }
 
 fn ceilDiv(n: usize, d: usize) usize {
diff --git a/src/bench.zig b/src/bench.zig
new file mode 100644
index 0000000..c1092ee
--- /dev/null
+++ b/src/bench.zig
@@ -0,0 +1,139 @@
+//! Benchmark for the element-wise `add` compute pipeline.
+//!
+//! For every entry in `element_counts` the program times three phases
+//! separately and reports the fastest of `repetitions` runs per phase:
+//!   1. `Vec.initLoad` of both operands,
+//!   2. `Vec.run` with the add pipeline,
+//!   3. `Vec.read` of the result into a slice owned by the host allocator.
+//! The table is written with `std.debug.print`.
+const std = @import("std");
+const GpuDevice = @import("GpuDevice.zig");
+const GpuAllocator = @import("GpuAllocator.zig");
+const GpuPipeline = @import("GpuPipeline.zig");
+const Vec = @import("Vec.zig");
+
+/// Vector lengths, in `f32` elements, to benchmark.
+const element_counts = [_]usize{
+    1,
+    1024,
+    4096,
+    16384,
+    65536,
+    262144,
+    1024 * 1024,
+    4 * 1024 * 1024,
+    4 * 4 * 1024 * 1024,
+    4 * 4 * 4 * 1024 * 1024,
+    4 * 4 * 4 * 4 * 1024 * 1024,
+};
+
+/// Timed repetitions per vector length; the minimum of each phase is reported.
+const repetitions: usize = 5;
+
+/// Converts a duration in nanoseconds to milliseconds.
+fn nsToMs(ns: u64) f64 {
+    return @as(f64, @floatFromInt(ns)) / 1_000_000.0;
+}
+
+/// Returns the throughput in GB/s (10^9 bytes per second) for `bytes` bytes
+/// handled in `ns` nanoseconds. `ns` is not checked for zero.
+fn gigabytesPerSecond(bytes: f64, ns: u64) f64 {
+    return (bytes / 1_000_000_000.0) / (@as(f64, @floatFromInt(ns)) / 1_000_000_000.0);
+}
+
+/// Runs the benchmark and prints one three-row block per vector length.
+/// Any error from device setup, allocation, or a `Vec` call is returned.
+pub fn main(init: std.process.Init) !void {
+    const allocator = init.gpa;
+    const io = init.io;
+
+    const device = try GpuDevice.init();
+    defer device.deinit();
+
+    var gloc = try GpuAllocator.init(allocator, device);
+    defer gloc.deinit();
+
+    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
+    defer add_pip.deinit();
+
+    // Warm-up: run one single-element add through every phase before any
+    // timing starts. Nothing here is measured.
+    {
+        var one_a = [_]f32{1.0};
+        var one_b = [_]f32{1.0};
+        const wa = try Vec.initLoad(&gloc, &one_a);
+        defer wa.deinit();
+        const wb = try Vec.initLoad(&gloc, &one_b);
+        defer wb.deinit();
+        const wsum = try wa.run(&gloc, wb, add_pip);
+        defer wsum.deinit();
+        const wout = try wsum.read(&gloc, allocator);
+        defer allocator.free(wout);
+    }
+
+    // Cells are padded to the widths of the separator row (11/19/12/10).
+    std.debug.print("\n| Size (MB) | Phase             |  Time (ms) |     GB/s |\n", .{});
+    std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{});
+
+    for (element_counts) |count| {
+        // Host inputs are built once per size, outside the timed loop:
+        // a[i] = i and b[i] = count - 1 - i.
+        const host_a = try allocator.alloc(f32, count);
+        defer allocator.free(host_a);
+        const host_b = try allocator.alloc(f32, count);
+        defer allocator.free(host_b);
+
+        for (host_a, host_b, 0..) |*x, *y, i| {
+            x.* = @floatFromInt(i);
+            y.* = @floatFromInt(count - 1 - i);
+        }
+
+        // Fastest observed time of each phase across all repetitions.
+        var best_load_ns: u64 = std.math.maxInt(u64);
+        var best_compute_ns: u64 = std.math.maxInt(u64);
+        var best_read_ns: u64 = std.math.maxInt(u64);
+
+        for (0..repetitions) |_| {
+            // Phase 1: create both device vectors from the host data.
+            const load_begin = std.Io.Clock.awake.now(io);
+            const a = try Vec.initLoad(&gloc, host_a);
+            defer a.deinit();
+            const b = try Vec.initLoad(&gloc, host_b);
+            defer b.deinit();
+            const load_end = std.Io.Clock.awake.now(io);
+            const load_ns: u64 = @intCast(load_begin.durationTo(load_end).toNanoseconds());
+            best_load_ns = @min(best_load_ns, load_ns);
+
+            // Phase 2: run the add pipeline on the two vectors.
+            const compute_begin = std.Io.Clock.awake.now(io);
+            const sum = try a.run(&gloc, b, add_pip);
+            defer sum.deinit();
+            const compute_end = std.Io.Clock.awake.now(io);
+            const compute_ns: u64 = @intCast(compute_begin.durationTo(compute_end).toNanoseconds());
+            best_compute_ns = @min(best_compute_ns, compute_ns);
+
+            // Phase 3: read the result into a slice owned by `allocator`.
+            const read_begin = std.Io.Clock.awake.now(io);
+            const out = try sum.read(&gloc, allocator);
+            defer allocator.free(out);
+            const read_end = std.Io.Clock.awake.now(io);
+            const read_ns: u64 = @intCast(read_begin.durationTo(read_end).toNanoseconds());
+            best_read_ns = @min(best_read_ns, read_ns);
+        }
+
+        // `mb` is in units of 1024 * 1024 bytes; throughput uses 10^9 bytes.
+        const vector_bytes = @as(f64, @floatFromInt(count)) * @as(f64, @floatFromInt(@sizeOf(f32)));
+        const mb = vector_bytes / (1024.0 * 1024.0);
+
+        // Bytes attributed to each phase: two operand vectors loaded, two
+        // read plus one written by the add, one result vector read back.
+        const load_gb_s = gigabytesPerSecond(vector_bytes * 2.0, best_load_ns);
+        const compute_gb_s = gigabytesPerSecond(vector_bytes * 3.0, best_compute_ns);
+        const read_gb_s = gigabytesPerSecond(vector_bytes * 1.0, best_read_ns);
+
+        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, nsToMs(best_load_ns), load_gb_s });
+        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} |\n", .{ nsToMs(best_compute_ns), compute_gb_s });
+        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ nsToMs(best_read_ns), read_gb_s });
+        std.debug.print("|-----------|-------------------|------------|---------:|\n", .{});
+    }
+}
diff --git a/src/main.zig b/src/main.zig
deleted file mode 100644
index 82ff477..0000000
--- a/src/main.zig
+++ /dev/null
@@ -1,75 +0,0 @@
-const std = @import("std");
-const GpuDevice = @import("GpuDevice.zig");
-const GpuAllocator = @import("GpuAllocator.zig");
-const GpuPipeline = @import("GpuPipeline.zig");
-const Vec = @import("Vec.zig");
-
-pub fn main(init: std.process.Init) !void {
-    const device = try GpuDevice.init();
-    defer device.deinit();
-
-    var gloc = try GpuAllocator.init(init.gpa, device);
-    defer gloc.deinit();
-
-    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
-    defer add_pip.deinit();
-
-    // Define the sizes you want to benchmark
-    const sizes = [_]usize{
-        1,
-        1024,
-        4096,
-        16384,
-        65536,
-        262144,
-        1024 * 1024,
-        // 4 * 1024 * 1024,
-        // 4 * 4 * 1024 * 1024,
-        // 4 * 4 * 4 * 1024 * 1024,
-        // 4 * 4 * 4 * 4 * 1024 * 1024,
-        // 4 * 4 * 4 * 4 * 2 * 1024 * 1024,
-    };
-
-    // Print table header
-    std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
-    std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
-
-    const allocator = init.gpa;
-
-    for (sizes) |size| {
-        // Dynamically allocate buffers for the current size
-        var data_a = try allocator.alloc(f32, size);
-        defer allocator.free(data_a);
-        var data_b = try allocator.alloc(f32, size);
-        defer allocator.free(data_b);
-
-        // Populate data
-        for (0..size) |i| {
-            data_a[i] = @floatFromInt(i);
-            data_b[i] = @floatFromInt(size - 1 - i);
-        }
-
-        // Start timing the GPU operations
-        const start = std.Io.Clock.awake.now(init.io);
-
-        const a = try Vec.initLoad(&gloc, data_a);
-        defer a.deinit();
-        const b = try Vec.initLoad(&gloc, data_b);
-        defer b.deinit();
-
-        // a + b
-        const sum = try a.run(&gloc, b, add_pip);
-        defer sum.deinit();
-
-        const out = try sum.read(&gloc, allocator);
-        defer allocator.free(out);
-
-        const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
-        const ns = duration.toNanoseconds();
-        const ms = duration.toMilliseconds();
-        const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
-
-        // Print table row
-        std.debug.print("| {d:12} | {d:8.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
-    }
-}