From 62b5224e6ef5eb68f7d91c969e9c5d9b8ca0dde4 Mon Sep 17 00:00:00 2001 From: AdrienBouvais Date: Tue, 19 May 2026 07:54:16 +0200 Subject: [PATCH] Changed GpuProcess to use a definition Now GpuProcess isnt limited to 2 in, 1 out but to anything. --- examples/add.zig | 14 ++++- src/GpuProcess.zig | 153 ++++++++++++++++++++++++++++++--------------- 2 files changed, 116 insertions(+), 51 deletions(-) diff --git a/examples/add.zig b/examples/add.zig index ec73cef..c611ac3 100644 --- a/examples/add.zig +++ b/examples/add.zig @@ -18,7 +18,17 @@ pub fn main(init: std.process.Init) !void { const gloc = grena.gpuAllocator(); // 3. Load the WGSL compute pipeline - const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl")); + const add_process = try GpuProcess.init( + device, + @embedFile("shaders/add.wgsl"), + .{ + .bindings = &.{ + .{ .element_size = @sizeOf(f16) }, + .{ .element_size = @sizeOf(f16) }, + .{ .element_size = @sizeOf(f16) }, + }, + }, + ); defer add_process.deinit(); // 4. Setup CPU data @@ -49,7 +59,7 @@ pub fn main(init: std.process.Init) !void { // 7. Dispatch the Compute Process // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly - try add_process.run(gloc, f16, buf_a, buf_b, buf_out); + try add_process.run(gloc, .{ buf_a, buf_b, buf_out }); // 8. Map and copy the resulting buffer back to the CPU const out = try buf_out.read(allocator, f16); diff --git a/src/GpuProcess.zig b/src/GpuProcess.zig index e03e7be..0bad385 100644 --- a/src/GpuProcess.zig +++ b/src/GpuProcess.zig @@ -1,6 +1,3 @@ -/// GpuProcess is just a pipeline with 2 inpout and 1 output -/// for now, to see if I make it a bit more generic -/// const std = @import("std"); const c = @import("utils.zig").c; const sv = @import("utils.zig").sv; @@ -8,9 +5,25 @@ const GpuAllocator = @import("GpuAllocator.zig"); const GpuBuffer = @import("GpuBuffer.zig"); const GpuDevice = @import("GpuDevice.zig"); -pip: c.WGPUComputePipeline, +pub const Binding = struct { + /// Element size in bytes for this binding. E.g. @sizeOf(f32). + /// If 0, no element-based size validation is performed for this buffer. + element_size: u32 = 0, +}; -pub fn init(device: GpuDevice, wgsl: []const u8) !@This() { +pub const ProcessDef = struct { + bindings: []const Binding, + workgroup_size: u32 = 256, + max_workgroups: u32 = 65535, + /// If true, automatically adds a Uniform Buffer containing `elements_count` as a `u32` + /// to the next available binding slot. + append_info_buffer: bool = true, +}; + +pip: c.WGPUComputePipeline, +def: ProcessDef, + +pub fn init(device: GpuDevice, wgsl: []const u8, def: ProcessDef) !@This() { var wgsl_src = c.WGPUShaderSourceWGSL{ .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, .code = sv(wgsl), @@ -20,71 +33,117 @@ pub fn init(device: GpuDevice, wgsl: []const u8) !@This() { }) orelse return error.Shader; defer c.wgpuShaderModuleRelease(shader); - return .{ .pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{ + const pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{ .compute = .{ .module = shader, .entryPoint = sv("main") }, - }) orelse return error.Pipeline }; + }) orelse return error.Pipeline; + + return .{ + .pip = pip, + .def = def, + }; } pub fn deinit(self: @This()) void { c.wgpuComputePipelineRelease(self.pip); } -fn onMapped( - status: c.WGPUMapAsyncStatus, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - const flag: *bool = @ptrCast(@alignCast(userdata1.?)); - flag.* = (status == c.WGPUMapAsyncStatus_Success); -} - -// Changed: gloc is passed by value instead of *GpuAllocator +/// Execute the compute pass with arbitrary buffer bindings via a tuple. +/// `override_elements_count` can be `null` to infer the dispatch count from the first checked buffer. +/// Example: `try proc.run(gloc, null, .{ buf_a, buf_b, buf_out });` pub fn run( self: @This(), gloc: GpuAllocator, - T: type, - buf_a: GpuBuffer, - buf_b: GpuBuffer, - buf_out: GpuBuffer, + args: anytype, ) !void { - const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GB + const type_info = @typeInfo(@TypeOf(args)); + if (type_info != .@"struct" or !type_info.@"struct".is_tuple) + @compileError("Expected a tuple of GpuBuffers for args. E.g. .{ buf_a, buf_b }"); - const bytes = buf_a.size; - var offset: u64 = 0; - while (offset < bytes) { - const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset); - const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(T)); + const fields = type_info.@"struct".fields; + if (fields.len != self.def.bindings.len) { + std.log.err("Process expected {d} arguments, got {d}", .{ self.def.bindings.len, fields.len }); + return error.InvalidArgumentCount; + } - const info_buf = try GpuBuffer.init( + var elements_count: u32 = 0; + + // Infer elements_count from the first arg with a defined element_size + inline for (fields, 0..) |field, i| { + if (elements_count == 0) { + const buf = @field(args, field.name); + const el_size = self.def.bindings[i].element_size; + if (el_size > 0) { + elements_count = @intCast(buf.size / el_size); + } + } + } + + // Validate runtime buffer sizes before dispatching + inline for (fields, 0..) |field, i| { + const buf = @field(args, field.name); + const el_size = self.def.bindings[i].element_size; + if (el_size > 0) { + const expected_min_bytes = @as(u64, elements_count) * el_size; + if (buf.size < expected_min_bytes) { + std.log.err("Argument {d} size mismatch: expected at least {d} bytes, got {d}", .{ i, expected_min_bytes, buf.size }); + return error.BufferTooSmall; + } + } + } + + var entries_buf: [32]c.WGPUBindGroupEntry = undefined; + var entry_count: usize = 0; + + // Unpack tuple into WebGPU BindGroupEntries + inline for (fields, 0..) |field, i| { + const buf = @field(args, field.name); + if (@TypeOf(buf) != GpuBuffer) { + @compileError("All arguments in the tuple must be of type GpuBuffer"); + } + entries_buf[entry_count] = .{ + .binding = @intCast(i), + .buffer = buf.raw, + .offset = 0, + .size = buf.size, // Size exposes the fully allocated length + }; + entry_count += 1; + } + + // Optional uniform dispatch buffer appended at the end + var info_buf: ?GpuBuffer = null; + defer if (info_buf) |b| b.deinit(); + + if (self.def.append_info_buffer) { + info_buf = try GpuBuffer.init( gloc, @sizeOf(u32), .initMany(&.{ .Uniform, .CopyDst }), ); - defer info_buf.deinit(); + c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.?.raw, 0, &elements_count, @sizeOf(u32)); - c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, ¤t_chunk_elements, @sizeOf(u32)); - - const entries = [_]c.WGPUBindGroupEntry{ - .{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes }, - .{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes }, - .{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes }, - .{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) }, + entries_buf[entry_count] = .{ + .binding = @intCast(entry_count), + .buffer = info_buf.?.raw, + .offset = 0, + .size = @sizeOf(u32), }; - - try submitPass(gloc, self.pip, &entries, current_chunk_elements); - - offset += current_chunk_bytes; + entry_count += 1; } + + const entries = entries_buf[0..entry_count]; + try submitPass(gloc, self.pip, entries, elements_count, self.def.workgroup_size, self.def.max_workgroups); } -// Changed: gloc is passed by value instead of *GpuAllocator fn submitPass( gloc: GpuAllocator, pipeline: c.WGPUComputePipeline, entries: []const c.WGPUBindGroupEntry, n: usize, + workgroup_size: u32, + max_workgroups: u32, ) !void { + if (n == 0) return; + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); defer c.wgpuBindGroupLayoutRelease(bgl); @@ -95,17 +154,13 @@ fn submitPass( }) orelse return error.BindGroup; defer c.wgpuBindGroupRelease(bg); - const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse - return error.Encoder; + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder; const pass = c.wgpuCommandEncoderBeginComputePass(enc, null); c.wgpuComputePassEncoderSetPipeline(pass, pipeline); c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null); - const WORKGROUP_SIZE = 256; - const MAX_WORKGROUPS = 65535; - - const desired_workgroups = ceilDiv(n, WORKGROUP_SIZE); - const dispatch_count = @min(desired_workgroups, MAX_WORKGROUPS); + const desired_workgroups = ceilDiv(n, workgroup_size); + const dispatch_count = @min(desired_workgroups, max_workgroups); c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1); c.wgpuComputePassEncoderEnd(pass);