Compare commits

..

No commits in common. "09e62cf667f04961455282b96bf8d9528450b5b6" and "bfac17093618d7d604e7fa96858db34ba36efa5a" have entirely different histories.

3 changed files with 57 additions and 123 deletions

View File

@ -17,6 +17,7 @@ The library exports five primary components:
Below is a complete, self-contained example demonstrating how to initialize the GPU, load data, run a compute shader, and read the results back to the CPU: Below is a complete, self-contained example demonstrating how to initialize the GPU, load data, run a compute shader, and read the results back to the CPU:
```zig ```zig
const std = @import("std"); const std = @import("std");
const gpu = @import("gpu"); const gpu = @import("gpu");
const GpuDevice = gpu.GpuDevice; const GpuDevice = gpu.GpuDevice;
@ -37,15 +38,7 @@ pub fn main(init: std.process.Init) !void {
const gloc = grena.gpuAllocator(); const gloc = grena.gpuAllocator();
// 3. Load the WGSL compute pipeline // 3. Load the WGSL compute pipeline
const add_process = try GpuProcess.init( const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
device,
@embedFile("shaders/add.wgsl"),
.{ .bindings = &.{
.{ .element_size = @sizeOf(f16) },
.{ .element_size = @sizeOf(f16) },
.{ .element_size = @sizeOf(f16) },
} },
);
defer add_process.deinit(); defer add_process.deinit();
// 4. Setup CPU data // 4. Setup CPU data
@ -61,6 +54,7 @@ pub fn main(init: std.process.Init) !void {
} }
// 5. Initialize raw GPU Buffers // 5. Initialize raw GPU Buffers
// We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported
const byte_size = len * @sizeOf(f16); const byte_size = len * @sizeOf(f16);
const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
@ -74,7 +68,8 @@ pub fn main(init: std.process.Init) !void {
try buf_b.load(f16, data_b); try buf_b.load(f16, data_b);
// 7. Dispatch the Compute Process // 7. Dispatch the Compute Process
try add_process.run(gloc, .{ buf_a, buf_b, buf_out }); // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
// 8. Map and copy the resulting buffer back to the CPU // 8. Map and copy the resulting buffer back to the CPU
const out = try buf_out.read(allocator, f16); const out = try buf_out.read(allocator, f16);
@ -82,7 +77,6 @@ pub fn main(init: std.process.Init) !void {
std.debug.print("Result: {any}\n", .{out}); std.debug.print("Result: {any}\n", .{out});
} }
``` ```
## Dependencies ## Dependencies

View File

@ -18,15 +18,7 @@ pub fn main(init: std.process.Init) !void {
const gloc = grena.gpuAllocator(); const gloc = grena.gpuAllocator();
// 3. Load the WGSL compute pipeline // 3. Load the WGSL compute pipeline
const add_process = try GpuProcess.init( const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
device,
@embedFile("shaders/add.wgsl"),
.{ .bindings = &.{
.{ .element_size = @sizeOf(f16) },
.{ .element_size = @sizeOf(f16) },
.{ .element_size = @sizeOf(f16) },
} },
);
defer add_process.deinit(); defer add_process.deinit();
// 4. Setup CPU data // 4. Setup CPU data
@ -42,6 +34,7 @@ pub fn main(init: std.process.Init) !void {
} }
// 5. Initialize raw GPU Buffers // 5. Initialize raw GPU Buffers
// We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported
const byte_size = len * @sizeOf(f16); const byte_size = len * @sizeOf(f16);
const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
@ -55,7 +48,8 @@ pub fn main(init: std.process.Init) !void {
try buf_b.load(f16, data_b); try buf_b.load(f16, data_b);
// 7. Dispatch the Compute Process // 7. Dispatch the Compute Process
try add_process.run(gloc, .{ buf_a, buf_b, buf_out }); // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
// 8. Map and copy the resulting buffer back to the CPU // 8. Map and copy the resulting buffer back to the CPU
const out = try buf_out.read(allocator, f16); const out = try buf_out.read(allocator, f16);

View File

@ -1,3 +1,6 @@
/// GpuProcess is just a pipeline with 2 inpout and 1 output
/// for now, to see if I make it a bit more generic
///
const std = @import("std"); const std = @import("std");
const c = @import("utils.zig").c; const c = @import("utils.zig").c;
const sv = @import("utils.zig").sv; const sv = @import("utils.zig").sv;
@ -5,25 +8,9 @@ const GpuAllocator = @import("GpuAllocator.zig");
const GpuBuffer = @import("GpuBuffer.zig"); const GpuBuffer = @import("GpuBuffer.zig");
const GpuDevice = @import("GpuDevice.zig"); const GpuDevice = @import("GpuDevice.zig");
pub const Binding = struct {
/// Element size in bytes for this binding. E.g. @sizeOf(f32).
/// If 0, no element-based size validation is performed for this buffer.
element_size: u32 = 0,
};
pub const ProcessDef = struct {
bindings: []const Binding,
workgroup_size: u32 = 256,
max_workgroups: u32 = 65535,
/// If true, automatically adds a Uniform Buffer containing `elements_count` as a `u32`
/// to the next available binding slot.
append_info_buffer: bool = true,
};
pip: c.WGPUComputePipeline, pip: c.WGPUComputePipeline,
def: ProcessDef,
pub fn init(device: GpuDevice, wgsl: []const u8, def: ProcessDef) !@This() { pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
var wgsl_src = c.WGPUShaderSourceWGSL{ var wgsl_src = c.WGPUShaderSourceWGSL{
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
.code = sv(wgsl), .code = sv(wgsl),
@ -33,116 +20,71 @@ pub fn init(device: GpuDevice, wgsl: []const u8, def: ProcessDef) !@This() {
}) orelse return error.Shader; }) orelse return error.Shader;
defer c.wgpuShaderModuleRelease(shader); defer c.wgpuShaderModuleRelease(shader);
const pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{ return .{ .pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{
.compute = .{ .module = shader, .entryPoint = sv("main") }, .compute = .{ .module = shader, .entryPoint = sv("main") },
}) orelse return error.Pipeline; }) orelse return error.Pipeline };
return .{
.pip = pip,
.def = def,
};
} }
pub fn deinit(self: @This()) void { pub fn deinit(self: @This()) void {
c.wgpuComputePipelineRelease(self.pip); c.wgpuComputePipelineRelease(self.pip);
} }
/// Execute the compute pass with arbitrary buffer bindings via a tuple. fn onMapped(
/// Example: `try proc.run(gloc, .{ buf_a, buf_b, buf_out });` status: c.WGPUMapAsyncStatus,
_: c.WGPUStringView,
userdata1: ?*anyopaque,
_: ?*anyopaque,
) callconv(.c) void {
const flag: *bool = @ptrCast(@alignCast(userdata1.?));
flag.* = (status == c.WGPUMapAsyncStatus_Success);
}
// Changed: gloc is passed by value instead of *GpuAllocator
pub fn run( pub fn run(
self: @This(), self: @This(),
gloc: GpuAllocator, gloc: GpuAllocator,
args: anytype, T: type,
buf_a: GpuBuffer,
buf_b: GpuBuffer,
buf_out: GpuBuffer,
) !void { ) !void {
const type_info = @typeInfo(@TypeOf(args)); const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GB
if (type_info != .@"struct" or !type_info.@"struct".is_tuple)
@compileError("Expected a tuple of GpuBuffers for args. E.g. .{ buf_a, buf_b }");
const fields = type_info.@"struct".fields; const bytes = buf_a.size;
if (fields.len != self.def.bindings.len) { var offset: u64 = 0;
std.log.err("Process expected {d} arguments, got {d}", .{ self.def.bindings.len, fields.len }); while (offset < bytes) {
return error.InvalidArgumentCount; const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
} const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(T));
var elements_count: u32 = 0; const info_buf = try GpuBuffer.init(
// Infer elements_count from the first arg with a defined element_size
inline for (fields, 0..) |field, i| {
if (elements_count == 0) {
const buf = @field(args, field.name);
const el_size = self.def.bindings[i].element_size;
if (el_size > 0) {
elements_count = @intCast(buf.size / el_size);
}
}
}
// Validate runtime buffer sizes before dispatching
inline for (fields, 0..) |field, i| {
const buf = @field(args, field.name);
const el_size = self.def.bindings[i].element_size;
if (el_size > 0) {
const expected_min_bytes = @as(u64, elements_count) * el_size;
if (buf.size < expected_min_bytes) {
std.log.err("Argument {d} size mismatch: expected at least {d} bytes, got {d}", .{ i, expected_min_bytes, buf.size });
return error.BufferTooSmall;
}
}
}
var entries_buf: [32]c.WGPUBindGroupEntry = undefined;
var entry_count: usize = 0;
// Unpack tuple into WebGPU BindGroupEntries
inline for (fields, 0..) |field, i| {
const buf = @field(args, field.name);
if (@TypeOf(buf) != GpuBuffer) {
@compileError("All arguments in the tuple must be of type GpuBuffer");
}
entries_buf[entry_count] = .{
.binding = @intCast(i),
.buffer = buf.raw,
.offset = 0,
.size = buf.size, // Size exposes the fully allocated length
};
entry_count += 1;
}
// Optional uniform dispatch buffer appended at the end
var info_buf: ?GpuBuffer = null;
defer if (info_buf) |b| b.deinit();
if (self.def.append_info_buffer) {
info_buf = try GpuBuffer.init(
gloc, gloc,
@sizeOf(u32), @sizeOf(u32),
.initMany(&.{ .Uniform, .CopyDst }), .initMany(&.{ .Uniform, .CopyDst }),
); );
c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.?.raw, 0, &elements_count, @sizeOf(u32)); defer info_buf.deinit();
entries_buf[entry_count] = .{ c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
.binding = @intCast(entry_count),
.buffer = info_buf.?.raw, const entries = [_]c.WGPUBindGroupEntry{
.offset = 0, .{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes },
.size = @sizeOf(u32), .{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes },
.{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes },
.{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) },
}; };
entry_count += 1;
}
const entries = entries_buf[0..entry_count]; try submitPass(gloc, self.pip, &entries, current_chunk_elements);
try submitPass(gloc, self.pip, entries, elements_count, self.def.workgroup_size, self.def.max_workgroups);
offset += current_chunk_bytes;
}
} }
// Changed: gloc is passed by value instead of *GpuAllocator
fn submitPass( fn submitPass(
gloc: GpuAllocator, gloc: GpuAllocator,
pipeline: c.WGPUComputePipeline, pipeline: c.WGPUComputePipeline,
entries: []const c.WGPUBindGroupEntry, entries: []const c.WGPUBindGroupEntry,
n: usize, n: usize,
workgroup_size: u32,
max_workgroups: u32,
) !void { ) !void {
if (n == 0) return;
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
defer c.wgpuBindGroupLayoutRelease(bgl); defer c.wgpuBindGroupLayoutRelease(bgl);
@ -153,13 +95,17 @@ fn submitPass(
}) orelse return error.BindGroup; }) orelse return error.BindGroup;
defer c.wgpuBindGroupRelease(bg); defer c.wgpuBindGroupRelease(bg);
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder; const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
return error.Encoder;
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null); const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
c.wgpuComputePassEncoderSetPipeline(pass, pipeline); c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null); c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
const desired_workgroups = ceilDiv(n, workgroup_size); const WORKGROUP_SIZE = 256;
const dispatch_count = @min(desired_workgroups, max_workgroups); const MAX_WORKGROUPS = 65535;
const desired_workgroups = ceilDiv(n, WORKGROUP_SIZE);
const dispatch_count = @min(desired_workgroups, MAX_WORKGROUPS);
c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1); c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1);
c.wgpuComputePassEncoderEnd(pass); c.wgpuComputePassEncoderEnd(pass);