diff --git a/src/GpuAllocator.zig b/src/GpuAllocator.zig
index 3ace41c..506a314 100644
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@@ -152,7 +152,6 @@ fn onDevice(
     }
     const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
     ctx.device = device;
-    std.debug.print("{?}", .{device});
 }
 
 fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
diff --git a/src/Mat.zig b/src/Mat.zig
index 0e186ca..1ddc2ee 100644
--- a/src/Mat.zig
+++ b/src/Mat.zig
@@ -6,14 +6,14 @@ const GpuBuffer = @import("GpuBuffer.zig");
 const Mat = @This();
 
 buf: GpuBuffer,
-rows: u32,
-cols: u32,
+rows: usize,
+cols: usize,
 
 pub fn load(
     gloc: *GpuAllocator,
     data: []const f32,
-    rows: u32,
-    cols: u32,
+    rows: usize,
+    cols: usize,
 ) !Mat {
     std.debug.assert(data.len == @as(usize, rows) * cols);
     const bytes = data.len * @sizeOf(f32);
@@ -29,7 +29,7 @@ pub fn load(
     return .{ .buf = buf, .rows = rows, .cols = cols };
 }
 
-pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat {
+pub fn zeros(gloc: *GpuAllocator, rows: usize, cols: usize) !Mat {
     const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
     const buf = try GpuBuffer.init(
         gloc,
@@ -43,7 +43,7 @@ pub fn deinit(self: Mat) void {
     self.buf.deinit(); // Automatically cleans tracking map & releases GPU memory
 }
 
-pub fn len(self: Mat) u32 {
+pub fn len(self: Mat) usize {
     return self.rows * self.cols;
 }
 
@@ -144,7 +144,7 @@ fn dispatch2in1out(
     buf_b: GpuBuffer,
     buf_out: GpuBuffer,
     bytes: u64,
-    n: u32,
+    n: usize,
 ) !void {
     const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
     defer c.wgpuBindGroupLayoutRelease(bgl);
@@ -162,7 +162,7 @@ fn submitPass(
     gloc: *GpuAllocator,
     pipeline: c.WGPUComputePipeline,
     entries: []const c.WGPUBindGroupEntry,
-    n: u32,
+    n: usize,
 ) !void {
     const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
     defer c.wgpuBindGroupLayoutRelease(bgl);
@@ -179,7 +179,7 @@ fn submitPass(
     const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
     c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
     c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
-    c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1);
+    c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(ceilDiv(n, 256)), 1, 1);
     c.wgpuComputePassEncoderEnd(pass);
     c.wgpuComputePassEncoderRelease(pass);
 
@@ -189,6 +189,6 @@ fn submitPass(
     c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
 }
 
-fn ceilDiv(n: u32, d: u32) u32 {
+fn ceilDiv(n: usize, d: usize) usize {
     return (n + d - 1) / d;
 }
diff --git a/src/main.zig b/src/main.zig
index 0fdd20e..8c38c89 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -6,39 +6,61 @@ pub fn main(init: std.process.Init) !void {
     var gloc = try GpuAllocator.init(init.gpa);
     defer gloc.deinit();
 
-    // Input data: a[i] = i, b[i] = 15 - i → add should give all 15s
-    var data_a: [16]f32 = undefined;
-    var data_b: [16]f32 = undefined;
-    for (0..16) |i| {
-        data_a[i] = @floatFromInt(i);
-        data_b[i] = @floatFromInt(15 - i);
+    // Element counts to benchmark; each run loads a `size` x 1 matrix
+    const sizes = [_]usize{ 1, 1024, 4096, 16384, 65536, 262144, 1024 * 1024, 4 * 1024 * 1024 };
+
+    // Print table header
+    std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
+    std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
+
+    const allocator = init.gpa;
+
+    for (sizes) |size| {
+        // Host-side inputs for this size; the slices are never rebound, so they are const
+        const data_a = try allocator.alloc(f32, size);
+        defer allocator.free(data_a);
+        const data_b = try allocator.alloc(f32, size);
+        defer allocator.free(data_b);
+
+        // a[i] = i, b[i] = size - 1 - i
+        for (0..size) |i| {
+            data_a[i] = @floatFromInt(i);
+            data_b[i] = @floatFromInt(size - 1 - i);
+        }
+
+        // Timed region spans load, add, scale, and both read-backs (not GPU compute alone)
+        const start = std.Io.Clock.awake.now(init.io);
+
+        const a = try Mat.load(&gloc, data_a, size, 1);
+        defer a.deinit();
+        const b = try Mat.load(&gloc, data_b, size, 1);
+        defer b.deinit();
+
+        // a + b
+        const sum = try a.add(&gloc, b);
+        defer sum.deinit();
+
+        // sum * 2
+        const scaled = try sum.scale(&gloc, 2.0);
+        defer scaled.deinit();
+
+        // Read back (allocating dynamically for read-back buffers too)
+        const out_sum = try allocator.alloc(f32, size);
+        defer allocator.free(out_sum);
+        const out_scaled = try allocator.alloc(f32, size);
+        defer allocator.free(out_scaled);
+
+        try sum.read(&gloc, out_sum);
+        try scaled.read(&gloc, out_scaled);
+
+        const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
+        const ns = duration.toNanoseconds();
+        const ms = @as(f64, @floatFromInt(ns)) / 1_000_000.0;
+        const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
+
+        // Print table row
+        std.debug.print("| {d:12} | {d:8.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
     }
-
-    const a = try Mat.load(&gloc, &data_a, 4, 4);
-    defer a.deinit();
-    const b = try Mat.load(&gloc, &data_b, 4, 4);
-    defer b.deinit();
-
-    // a + b
-    const sum = try a.add(&gloc, b);
-    defer sum.deinit();
-
-    // sum * 2
-    const scaled = try sum.scale(&gloc, 2.0);
-    defer scaled.deinit();
-
-    // Read back
-    var out_sum: [16]f32 = undefined;
-    var out_scaled: [16]f32 = undefined;
-    try sum.read(&gloc, &out_sum);
-    try scaled.read(&gloc, &out_scaled);
-
-    // Print
-    std.debug.print("\na + b (expect all 15):\n", .{});
-    printMat(&out_sum, 4, 4);
-
-    std.debug.print("\n(a + b) * 2 (expect all 30):\n", .{});
-    printMat(&out_scaled, 4, 4);
 }
 
 fn printMat(data: []const f32, rows: u32, cols: u32) void {
diff --git a/src/reference.zig b/src/reference.zig
index 6c80dfa..4392f24 100644
--- a/src/reference.zig
+++ b/src/reference.zig
@@ -65,7 +65,6 @@ fn onDevice(
     }
     const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
     ctx.device = device;
-    std.debug.print("{?}", .{device});
 }
 
 fn onMapped(
diff --git a/src/shaders.zig b/src/shaders.zig
index 730e26f..da489fd 100644
--- a/src/shaders.zig
+++ b/src/shaders.zig
@@ -3,7 +3,7 @@ pub const SHADER_ADD =
     \\@group(0) @binding(1) var b : array;
     \\@group(0) @binding(2) var out : array;
     \\
-    \\@compute @workgroup_size(64)
+    \\@compute @workgroup_size(256)
     \\fn main(@builtin(global_invocation_id) gid : vec3) {
     \\    let i = gid.x;
     \\    if (i < arrayLength(&out)) {
@@ -18,7 +18,7 @@ pub const SHADER_SCALE =
     \\@group(0) @binding(1) var out : array;
     \\@group(0) @binding(2) var u : Uniforms;
     \\
-    \\@compute @workgroup_size(64)
+    \\@compute @workgroup_size(256)
     \\fn main(@builtin(global_invocation_id) gid : vec3) {
     \\    let i = gid.x;
     \\    if (i < arrayLength(&out)) {