diff --git a/src/example.zig b/src/example.zig
index 519c337..ff79b9d 100644
--- a/src/example.zig
+++ b/src/example.zig
@@ -1,92 +1,58 @@
-/// This is a fully self contained example.
-/// It set a simple f16 Vector and do a add operation on it
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuArena = @import("GpuArena.zig");
-const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuProcess = @import("GpuProcess.zig");
 
 pub fn main(init: std.process.Init) !void {
     const allocator = init.gpa;
 
-    // Open GPU Device
+    // 1. Open GPU Device
     const device = try GpuDevice.init(.{});
     defer device.deinit();
 
-    // Create a GPU Arena to hold GPU memory
+    // 2. Create a GPU Arena to manage VRAM
     var grena = GpuArena.init(allocator, device);
     defer grena.deinit();
     const gloc = grena.gpuAllocator();
 
-    // Create a GPU process that load the pipeline/shader
-    const add = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
-    defer add.deinit();
+    // 3. Load the WGSL compute pipeline
+    const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
+    defer add_process.deinit();
 
-    // Allocate CPU memory
-    const data_a = try allocator.alloc(f16, 16);
+    // 4. Setup CPU data
+    const len: usize = 16;
+    const data_a = try allocator.alloc(f16, len);
     defer allocator.free(data_a);
-    const data_b = try allocator.alloc(f16, 16);
+    const data_b = try allocator.alloc(f16, len);
     defer allocator.free(data_b);
 
-    for (0..16) |i| {
+    for (0..len) |i| {
         data_a[i] = @floatFromInt(i);
-        data_b[i] = @floatFromInt(16 - 1 - i);
+        data_b[i] = @floatFromInt(len - 1 - i);
     }
 
-    // Allocate GPU memory (Vec.deinit isn't necessary because grena will do it when deinit)
-    const a = try Vec.initZero(gloc, 16);
-    const b = try Vec.initZero(gloc, 16);
+    // 5. Initialize raw GPU Buffers
+    // We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported
+    const byte_size = len * @sizeOf(f16);
+    const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
+    const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
+    const buf_out = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
 
-    // Load CPU -> GPU
-    try a.load(data_a);
-    try b.load(data_b);
+    // Note: The buffers are safely tied to the GpuArena which will automatically
+    // release them at the end. You can also manually call buf_x.deinit() if desired.
 
-    // Run GPU Pipeline
-    const sum = try a.run(gloc, b, add);
+    // 6. Transfer data from CPU slices to GPU Buffers
+    try buf_a.load(f16, data_a);
+    try buf_b.load(f16, data_b);
 
-    // Read GPU -> CPU
-    const out = try sum.read(allocator);
+    // 7. Dispatch the Compute Process
+    // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
+    try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
+
+    // 8. Map and copy the resulting buffer back to the CPU
+    const out = try buf_out.read(allocator, f16);
     defer allocator.free(out);
 
-    std.debug.print("{any}\n", .{out});
+    std.debug.print("Result: {any}\n", .{out});
 }
-
-/// Minimal implementation of a f16 Vector
-const Vec = struct {
-    buf: GpuBuffer,
-    len: usize,
-
-    pub fn initZero(gloc: GpuAllocator, len: usize) !Vec {
-        return .{
-            .buf = try GpuBuffer.init(
-                gloc,
-                len * @sizeOf(f16),
-                .initMany(&.{ .Storage, .CopyDst, .CopySrc }),
-            ),
-            .len = len,
-        };
-    }
-
-    pub fn deinit(self: Vec) void {
-        self.buf.deinit();
-    }
-
-    pub fn load(self: Vec, data: []const f16) !void {
-        try self.buf.load(f16, data);
-    }
-
-    pub fn read(self: Vec, alloc: std.mem.Allocator) ![]f16 {
-        return self.buf.read(alloc, f16);
-    }
-
-    pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, process: GpuProcess) !Vec {
-        std.debug.assert(self.len == other.len);
-
-        const result = try Vec.initZero(gloc, self.len);
-        errdefer result.deinit();
-
-        try process.run(gloc, f16, self.buf, other.buf, result.buf);
-        return result;
-    }
-};