diff --git a/src/GpuPipeline.zig b/src/GpuPipeline.zig
deleted file mode 100644
index 83d986d..0000000
--- a/src/GpuPipeline.zig
+++ /dev/null
@@ -1,25 +0,0 @@
-const std = @import("std");
-const GpuDevice = @import("GpuDevice.zig");
-const c = @import("utils.zig").c;
-const sv = @import("utils.zig").sv;
-
-raw: c.WGPUComputePipeline,
-
-pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
-    var wgsl_src = c.WGPUShaderSourceWGSL{
-        .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
-        .code = sv(wgsl),
-    };
-    const shader = c.wgpuDeviceCreateShaderModule(device.device, &.{
-        .nextInChain = @ptrCast(&wgsl_src),
-    }) orelse return error.Shader;
-    defer c.wgpuShaderModuleRelease(shader);
-
-    return .{ .raw = c.wgpuDeviceCreateComputePipeline(device.device, &.{
-        .compute = .{ .module = shader, .entryPoint = sv("main") },
-    }) orelse return error.Pipeline };
-}
-
-pub fn deinit(self: @This()) void {
-    c.wgpuComputePipelineRelease(self.raw);
-}
diff --git a/src/GpuProcess.zig b/src/GpuProcess.zig
new file mode 100644
index 0000000..1b5a5c9
--- /dev/null
+++ b/src/GpuProcess.zig
@@ -0,0 +1,134 @@
+//! Compute pipeline built from a WGSL shader, plus the chunked dispatch that
+//! runs it over two input buffers and one output buffer.
+
+const std = @import("std");
+const c = @import("utils.zig").c;
+const sv = @import("utils.zig").sv;
+const GpuAllocator = @import("GpuAllocator.zig");
+const GpuBuffer = @import("GpuBuffer.zig");
+const GpuDevice = @import("GpuDevice.zig");
+
+/// Compute pipeline handle created by `init` and released by `deinit`.
+pip: c.WGPUComputePipeline,
+
+/// Compiles `wgsl` into a shader module and creates a compute pipeline that
+/// uses its `main` entry point. Returns `error.Shader` or `error.Pipeline`
+/// when the corresponding wgpu call yields null.
+pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
+    var wgsl_src = c.WGPUShaderSourceWGSL{
+        .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
+        .code = sv(wgsl),
+    };
+    const shader = c.wgpuDeviceCreateShaderModule(device.device, &.{
+        .nextInChain = @ptrCast(&wgsl_src),
+    }) orelse return error.Shader;
+    // Drop our reference to the shader module when `init` returns.
+    defer c.wgpuShaderModuleRelease(shader);
+
+    return .{ .pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{
+        .compute = .{ .module = shader, .entryPoint = sv("main") },
+    }) orelse return error.Pipeline };
+}
+
+/// Releases the compute pipeline.
+pub fn deinit(self: @This()) void {
+    c.wgpuComputePipelineRelease(self.pip);
+}
+
+/// Runs the pipeline over `buf_a` and `buf_b`, writing into `buf_out`.
+///
+/// `buf_a.size` bytes are processed in chunks of at most 1 GiB. Each chunk binds
+/// the same byte range of all three buffers (bindings 0-2) and a fresh uniform
+/// buffer (binding 3) holding the chunk's `f16` element count as a `u32`.
+/// Returns `error.BufferTooSmall` if `buf_b` or `buf_out` is shorter than `buf_a`.
+pub fn run(
+    self: @This(),
+    gloc: GpuAllocator,
+    buf_a: GpuBuffer,
+    buf_b: GpuBuffer,
+    buf_out: GpuBuffer,
+) !void {
+    const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GiB
+
+    const bytes = buf_a.size;
+    // Every chunk binds [offset, offset + chunk) of all three buffers.
+    if (buf_b.size < bytes or buf_out.size < bytes) return error.BufferTooSmall;
+
+    var offset: u64 = 0;
+    while (offset < bytes) {
+        const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
+        const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f16));
+
+        const info_buf = try GpuBuffer.init(
+            gloc,
+            @sizeOf(u32),
+            .initMany(&.{ .Uniform, .CopyDst }),
+        );
+        defer info_buf.deinit();
+
+        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
+
+        const entries = [_]c.WGPUBindGroupEntry{
+            .{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes },
+            .{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes },
+            .{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes },
+            .{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) },
+        };
+
+        try submitPass(gloc, self.pip, &entries, current_chunk_elements);
+
+        offset += current_chunk_bytes;
+    }
+}
+
+/// Records and submits one compute pass binding `entries` at group 0.
+///
+/// Dispatches `ceil(n / 256)` workgroups along x, capped at 65535.
+/// Returns `error.BindGroup` or `error.Encoder` when wgpu yields null.
+fn submitPass(
+    gloc: GpuAllocator,
+    pipeline: c.WGPUComputePipeline,
+    entries: []const c.WGPUBindGroupEntry,
+    n: usize,
+) !void {
+    const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
+    defer c.wgpuBindGroupLayoutRelease(bgl);
+
+    const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
+        .layout = bgl,
+        .entries = entries.ptr,
+        .entryCount = entries.len,
+    }) orelse return error.BindGroup;
+    defer c.wgpuBindGroupRelease(bg);
+
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
+        return error.Encoder;
+    // Pair the release with the acquisition; it still runs after the submit below.
+    defer c.wgpuCommandEncoderRelease(enc);
+
+    const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
+    c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
+    c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
+
+    const workgroup_size = 256;
+    const max_workgroups = 65535;
+
+    // NOTE(review): the cap limits one dispatch to 65535 * 256 invocations,
+    // while a 1 GiB chunk holds up to 2^29 f16 elements. Confirm the shader
+    // loops over the remainder; otherwise the tail of a large chunk is skipped.
+    const desired_workgroups = ceilDiv(n, workgroup_size);
+    const dispatch_count = @min(desired_workgroups, max_workgroups);
+
+    c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1);
+    c.wgpuComputePassEncoderEnd(pass);
+    c.wgpuComputePassEncoderRelease(pass);
+
+    const cmd = c.wgpuCommandEncoderFinish(enc, null);
+    defer c.wgpuCommandBufferRelease(cmd);
+    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
+}
+
+/// Integer division of `n` by `d`, rounded up.
+fn ceilDiv(n: usize, d: usize) usize {
+    return (n + d - 1) / d;
+}
diff --git a/src/Vec.zig b/src/Vec.zig
index f75dfef..b03f986 100644
--- a/src/Vec.zig
+++ b/src/Vec.zig
@@ -3,7 +3,7 @@ const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuDevice = @import("GpuDevice.zig");
-const GpuPipeline = @import("GpuPipeline.zig");
+const GpuProcess = @import("GpuProcess.zig");
 
 const Vec = @This();
 
@@ -34,10 +34,7 @@ pub fn deinit(self: Vec) void {
 }
 
 /// CPU to GPU.
-pub fn load(
-    self: Vec,
-    data: []const f16,
-) !void {
+pub fn load(self: Vec, data: []const f16) !void {
     try self.buf.load(data);
 }
 
@@ -46,14 +43,13 @@ pub fn byteSize(self: Vec) u64 {
 }
 
 // Changed: gloc is passed by value instead of *GpuAllocator
-pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
+pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, process: GpuProcess) !Vec {
     std.debug.assert(self.len == other.len);
 
     const result = try Vec.initZero(gloc, self.len);
     errdefer result.deinit();
 
-    try dispatch2in1out(gloc, pip.raw, self.buf, other.buf, result.buf, self.byteSize());
-
+    try process.run(gloc, self.buf, other.buf, result.buf);
     return result;
 }
 
@@ -71,84 +67,3 @@ fn onMapped(
     const flag: *bool = @ptrCast(@alignCast(userdata1.?));
     flag.* = (status == c.WGPUMapAsyncStatus_Success);
 }
-
-// Changed: gloc is passed by value instead of *GpuAllocator
-fn dispatch2in1out(
-    gloc: GpuAllocator,
-    pipeline: c.WGPUComputePipeline,
-    buf_a: GpuBuffer,
-    buf_b: GpuBuffer,
-    buf_out: GpuBuffer,
-    bytes: u64,
-) !void {
-    const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GB
-
-    var offset: u64 = 0;
-    while (offset < bytes) {
-        const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
-        const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f16));
-
-        const info_buf = try GpuBuffer.init(
-            gloc,
-            @sizeOf(u32),
-            .initMany(&.{ .Uniform, .CopyDst }),
-        );
-        defer info_buf.deinit();
-
-        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
-
-        const entries = [_]c.WGPUBindGroupEntry{
-            .{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes },
-            .{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes },
-            .{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes },
-            .{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) },
-        };
-
-        try submitPass(gloc, pipeline, &entries, current_chunk_elements);
-
-        offset += current_chunk_bytes;
-    }
-}
-
-// Changed: gloc is passed by value instead of *GpuAllocator
-fn submitPass(
-    gloc: GpuAllocator,
-    pipeline: c.WGPUComputePipeline,
-    entries: []const c.WGPUBindGroupEntry,
-    n: usize,
-) !void {
-    const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
-    defer c.wgpuBindGroupLayoutRelease(bgl);
-
-    const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
-        .layout = bgl,
-        .entries = entries.ptr,
-        .entryCount = entries.len,
-    }) orelse return error.BindGroup;
-    defer c.wgpuBindGroupRelease(bg);
-
-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
-        return error.Encoder;
-    const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
-    c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
-    c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
-
-    const WORKGROUP_SIZE = 256;
-    const MAX_WORKGROUPS = 65535;
-
-    const desired_workgroups = ceilDiv(n, WORKGROUP_SIZE);
-    const dispatch_count = @min(desired_workgroups, MAX_WORKGROUPS);
-
-    c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1);
-    c.wgpuComputePassEncoderEnd(pass);
-    c.wgpuComputePassEncoderRelease(pass);
-
-    const cmd = c.wgpuCommandEncoderFinish(enc, null);
-    defer c.wgpuCommandEncoderRelease(enc);
-    defer c.wgpuCommandBufferRelease(cmd);
-    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
-}
-
-fn ceilDiv(n: usize, d: usize) usize {
-    return (n + d - 1) / d;
-}
diff --git a/src/example.zig b/src/example.zig
index fbbb4e7..3e1d2f3 100644
--- a/src/example.zig
+++ b/src/example.zig
@@ -2,7 +2,7 @@ const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuArena = @import("GpuArena.zig");
-const GpuPipeline = @import("GpuPipeline.zig");
+const GpuProcess = @import("GpuProcess.zig");
 const Vec = @import("Vec.zig");
 const c = @import("utils.zig").c;
 
@@ -18,8 +18,8 @@ pub fn main(init: std.process.Init) !void {
 
     const gloc = grena.gpuAllocator();
 
-    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
-    defer add_pip.deinit();
+    const add = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
+    defer add.deinit();
 
     const data_a = try allocator.alloc(f16, 16);
     defer allocator.free(data_a);
@@ -36,7 +36,7 @@ pub fn main(init: std.process.Init) !void {
     const b = try Vec.initLoad(gloc, data_b);
     defer b.deinit();
 
-    const sum = try a.run(gloc, b, add_pip);
+    const sum = try a.run(gloc, b, add);
     // Don't need `sum.deinit()` because grena will deallocate everything when deinit
 
     std.debug.print("Bytes used: {d} (3 * {d})\n", .{ grena.allocated_vram_bytes, a.byteSize() });