Changed GpuProcess to use a definition
Now GpuProcess isnt limited to 2 in, 1 out but to anything.
This commit is contained in:
parent
44d26feba5
commit
62b5224e6e
@ -18,7 +18,17 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
const gloc = grena.gpuAllocator();
|
const gloc = grena.gpuAllocator();
|
||||||
|
|
||||||
// 3. Load the WGSL compute pipeline
|
// 3. Load the WGSL compute pipeline
|
||||||
const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
|
const add_process = try GpuProcess.init(
|
||||||
|
device,
|
||||||
|
@embedFile("shaders/add.wgsl"),
|
||||||
|
.{
|
||||||
|
.bindings = &.{
|
||||||
|
.{ .element_size = @sizeOf(f16) },
|
||||||
|
.{ .element_size = @sizeOf(f16) },
|
||||||
|
.{ .element_size = @sizeOf(f16) },
|
||||||
|
},
|
||||||
|
},
|
||||||
|
);
|
||||||
defer add_process.deinit();
|
defer add_process.deinit();
|
||||||
|
|
||||||
// 4. Setup CPU data
|
// 4. Setup CPU data
|
||||||
@ -49,7 +59,7 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
|
|
||||||
// 7. Dispatch the Compute Process
|
// 7. Dispatch the Compute Process
|
||||||
// We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
|
// We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
|
||||||
try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
|
try add_process.run(gloc, .{ buf_a, buf_b, buf_out });
|
||||||
|
|
||||||
// 8. Map and copy the resulting buffer back to the CPU
|
// 8. Map and copy the resulting buffer back to the CPU
|
||||||
const out = try buf_out.read(allocator, f16);
|
const out = try buf_out.read(allocator, f16);
|
||||||
|
|||||||
@ -1,6 +1,3 @@
|
|||||||
/// GpuProcess is just a pipeline with 2 inpout and 1 output
|
|
||||||
/// for now, to see if I make it a bit more generic
|
|
||||||
///
|
|
||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const c = @import("utils.zig").c;
|
const c = @import("utils.zig").c;
|
||||||
const sv = @import("utils.zig").sv;
|
const sv = @import("utils.zig").sv;
|
||||||
@ -8,9 +5,25 @@ const GpuAllocator = @import("GpuAllocator.zig");
|
|||||||
const GpuBuffer = @import("GpuBuffer.zig");
|
const GpuBuffer = @import("GpuBuffer.zig");
|
||||||
const GpuDevice = @import("GpuDevice.zig");
|
const GpuDevice = @import("GpuDevice.zig");
|
||||||
|
|
||||||
pip: c.WGPUComputePipeline,
|
pub const Binding = struct {
|
||||||
|
/// Element size in bytes for this binding. E.g. @sizeOf(f32).
|
||||||
|
/// If 0, no element-based size validation is performed for this buffer.
|
||||||
|
element_size: u32 = 0,
|
||||||
|
};
|
||||||
|
|
||||||
pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
|
pub const ProcessDef = struct {
|
||||||
|
bindings: []const Binding,
|
||||||
|
workgroup_size: u32 = 256,
|
||||||
|
max_workgroups: u32 = 65535,
|
||||||
|
/// If true, automatically adds a Uniform Buffer containing `elements_count` as a `u32`
|
||||||
|
/// to the next available binding slot.
|
||||||
|
append_info_buffer: bool = true,
|
||||||
|
};
|
||||||
|
|
||||||
|
pip: c.WGPUComputePipeline,
|
||||||
|
def: ProcessDef,
|
||||||
|
|
||||||
|
pub fn init(device: GpuDevice, wgsl: []const u8, def: ProcessDef) !@This() {
|
||||||
var wgsl_src = c.WGPUShaderSourceWGSL{
|
var wgsl_src = c.WGPUShaderSourceWGSL{
|
||||||
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
|
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
|
||||||
.code = sv(wgsl),
|
.code = sv(wgsl),
|
||||||
@ -20,71 +33,117 @@ pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
|
|||||||
}) orelse return error.Shader;
|
}) orelse return error.Shader;
|
||||||
defer c.wgpuShaderModuleRelease(shader);
|
defer c.wgpuShaderModuleRelease(shader);
|
||||||
|
|
||||||
return .{ .pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{
|
const pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{
|
||||||
.compute = .{ .module = shader, .entryPoint = sv("main") },
|
.compute = .{ .module = shader, .entryPoint = sv("main") },
|
||||||
}) orelse return error.Pipeline };
|
}) orelse return error.Pipeline;
|
||||||
|
|
||||||
|
return .{
|
||||||
|
.pip = pip,
|
||||||
|
.def = def,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn deinit(self: @This()) void {
|
pub fn deinit(self: @This()) void {
|
||||||
c.wgpuComputePipelineRelease(self.pip);
|
c.wgpuComputePipelineRelease(self.pip);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onMapped(
|
/// Execute the compute pass with arbitrary buffer bindings via a tuple.
|
||||||
status: c.WGPUMapAsyncStatus,
|
/// `override_elements_count` can be `null` to infer the dispatch count from the first checked buffer.
|
||||||
_: c.WGPUStringView,
|
/// Example: `try proc.run(gloc, null, .{ buf_a, buf_b, buf_out });`
|
||||||
userdata1: ?*anyopaque,
|
|
||||||
_: ?*anyopaque,
|
|
||||||
) callconv(.c) void {
|
|
||||||
const flag: *bool = @ptrCast(@alignCast(userdata1.?));
|
|
||||||
flag.* = (status == c.WGPUMapAsyncStatus_Success);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Changed: gloc is passed by value instead of *GpuAllocator
|
|
||||||
pub fn run(
|
pub fn run(
|
||||||
self: @This(),
|
self: @This(),
|
||||||
gloc: GpuAllocator,
|
gloc: GpuAllocator,
|
||||||
T: type,
|
args: anytype,
|
||||||
buf_a: GpuBuffer,
|
|
||||||
buf_b: GpuBuffer,
|
|
||||||
buf_out: GpuBuffer,
|
|
||||||
) !void {
|
) !void {
|
||||||
const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GB
|
const type_info = @typeInfo(@TypeOf(args));
|
||||||
|
if (type_info != .@"struct" or !type_info.@"struct".is_tuple)
|
||||||
|
@compileError("Expected a tuple of GpuBuffers for args. E.g. .{ buf_a, buf_b }");
|
||||||
|
|
||||||
const bytes = buf_a.size;
|
const fields = type_info.@"struct".fields;
|
||||||
var offset: u64 = 0;
|
if (fields.len != self.def.bindings.len) {
|
||||||
while (offset < bytes) {
|
std.log.err("Process expected {d} arguments, got {d}", .{ self.def.bindings.len, fields.len });
|
||||||
const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
|
return error.InvalidArgumentCount;
|
||||||
const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(T));
|
}
|
||||||
|
|
||||||
const info_buf = try GpuBuffer.init(
|
var elements_count: u32 = 0;
|
||||||
|
|
||||||
|
// Infer elements_count from the first arg with a defined element_size
|
||||||
|
inline for (fields, 0..) |field, i| {
|
||||||
|
if (elements_count == 0) {
|
||||||
|
const buf = @field(args, field.name);
|
||||||
|
const el_size = self.def.bindings[i].element_size;
|
||||||
|
if (el_size > 0) {
|
||||||
|
elements_count = @intCast(buf.size / el_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate runtime buffer sizes before dispatching
|
||||||
|
inline for (fields, 0..) |field, i| {
|
||||||
|
const buf = @field(args, field.name);
|
||||||
|
const el_size = self.def.bindings[i].element_size;
|
||||||
|
if (el_size > 0) {
|
||||||
|
const expected_min_bytes = @as(u64, elements_count) * el_size;
|
||||||
|
if (buf.size < expected_min_bytes) {
|
||||||
|
std.log.err("Argument {d} size mismatch: expected at least {d} bytes, got {d}", .{ i, expected_min_bytes, buf.size });
|
||||||
|
return error.BufferTooSmall;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var entries_buf: [32]c.WGPUBindGroupEntry = undefined;
|
||||||
|
var entry_count: usize = 0;
|
||||||
|
|
||||||
|
// Unpack tuple into WebGPU BindGroupEntries
|
||||||
|
inline for (fields, 0..) |field, i| {
|
||||||
|
const buf = @field(args, field.name);
|
||||||
|
if (@TypeOf(buf) != GpuBuffer) {
|
||||||
|
@compileError("All arguments in the tuple must be of type GpuBuffer");
|
||||||
|
}
|
||||||
|
entries_buf[entry_count] = .{
|
||||||
|
.binding = @intCast(i),
|
||||||
|
.buffer = buf.raw,
|
||||||
|
.offset = 0,
|
||||||
|
.size = buf.size, // Size exposes the fully allocated length
|
||||||
|
};
|
||||||
|
entry_count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optional uniform dispatch buffer appended at the end
|
||||||
|
var info_buf: ?GpuBuffer = null;
|
||||||
|
defer if (info_buf) |b| b.deinit();
|
||||||
|
|
||||||
|
if (self.def.append_info_buffer) {
|
||||||
|
info_buf = try GpuBuffer.init(
|
||||||
gloc,
|
gloc,
|
||||||
@sizeOf(u32),
|
@sizeOf(u32),
|
||||||
.initMany(&.{ .Uniform, .CopyDst }),
|
.initMany(&.{ .Uniform, .CopyDst }),
|
||||||
);
|
);
|
||||||
defer info_buf.deinit();
|
c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.?.raw, 0, &elements_count, @sizeOf(u32));
|
||||||
|
|
||||||
c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, ¤t_chunk_elements, @sizeOf(u32));
|
entries_buf[entry_count] = .{
|
||||||
|
.binding = @intCast(entry_count),
|
||||||
const entries = [_]c.WGPUBindGroupEntry{
|
.buffer = info_buf.?.raw,
|
||||||
.{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes },
|
.offset = 0,
|
||||||
.{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes },
|
.size = @sizeOf(u32),
|
||||||
.{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes },
|
|
||||||
.{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) },
|
|
||||||
};
|
};
|
||||||
|
entry_count += 1;
|
||||||
try submitPass(gloc, self.pip, &entries, current_chunk_elements);
|
|
||||||
|
|
||||||
offset += current_chunk_bytes;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const entries = entries_buf[0..entry_count];
|
||||||
|
try submitPass(gloc, self.pip, entries, elements_count, self.def.workgroup_size, self.def.max_workgroups);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Changed: gloc is passed by value instead of *GpuAllocator
|
|
||||||
fn submitPass(
|
fn submitPass(
|
||||||
gloc: GpuAllocator,
|
gloc: GpuAllocator,
|
||||||
pipeline: c.WGPUComputePipeline,
|
pipeline: c.WGPUComputePipeline,
|
||||||
entries: []const c.WGPUBindGroupEntry,
|
entries: []const c.WGPUBindGroupEntry,
|
||||||
n: usize,
|
n: usize,
|
||||||
|
workgroup_size: u32,
|
||||||
|
max_workgroups: u32,
|
||||||
) !void {
|
) !void {
|
||||||
|
if (n == 0) return;
|
||||||
|
|
||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
|
|
||||||
@ -95,17 +154,13 @@ fn submitPass(
|
|||||||
}) orelse return error.BindGroup;
|
}) orelse return error.BindGroup;
|
||||||
defer c.wgpuBindGroupRelease(bg);
|
defer c.wgpuBindGroupRelease(bg);
|
||||||
|
|
||||||
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
|
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
|
||||||
return error.Encoder;
|
|
||||||
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
||||||
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
||||||
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
|
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
|
||||||
|
|
||||||
const WORKGROUP_SIZE = 256;
|
const desired_workgroups = ceilDiv(n, workgroup_size);
|
||||||
const MAX_WORKGROUPS = 65535;
|
const dispatch_count = @min(desired_workgroups, max_workgroups);
|
||||||
|
|
||||||
const desired_workgroups = ceilDiv(n, WORKGROUP_SIZE);
|
|
||||||
const dispatch_count = @min(desired_workgroups, MAX_WORKGROUPS);
|
|
||||||
|
|
||||||
c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1);
|
c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1);
|
||||||
c.wgpuComputePassEncoderEnd(pass);
|
c.wgpuComputePassEncoderEnd(pass);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user