diff --git a/build.zig b/build.zig index 140a513..4c0cf8d 100644 --- a/build.zig +++ b/build.zig @@ -56,39 +56,4 @@ pub fn build(b: *std.Build) !void { const run_cmd = b.addRunArtifact(exe); run_step.dependOn(&run_cmd.step); } - - const exe = b.addExecutable(.{ - .name = "circle", - .root_module = b.createModule(.{ - .root_source_file = b.path("src/circle.zig"), - .target = target, - .optimize = optimize, - .imports = &.{}, - }), - }); - - exe.root_module.addIncludePath(b.path("libs/wgpu-native/include")); - exe.root_module.addLibraryPath(b.path("libs/wgpu-native/lib")); - exe.root_module.addObjectFile(b.path("libs/wgpu-native/lib/libwgpu_native.a")); - - // Platform-specific system frameworks needed by wgpu-native - if (t.os.tag == .macos) { - exe.root_module.linkFramework("Metal", .{}); - exe.root_module.linkFramework("QuartzCore", .{}); - exe.root_module.linkFramework("Foundation", .{}); - exe.root_module.linkFramework("CoreGraphics", .{}); - } else if (t.os.tag == .windows) { - exe.root_module.linkSystemLibrary("d3d12", .{}); - exe.root_module.linkSystemLibrary("dxgi", .{}); - exe.root_module.linkSystemLibrary("user32", .{}); - } else { - exe.root_module.linkSystemLibrary("vulkan", .{}); - exe.root_module.linkSystemLibrary("gcc_s", .{}); - } - - b.installArtifact(exe); - - const run_step = b.step("circle", "Run circle"); - const run_cmd = b.addRunArtifact(exe); - run_step.dependOn(&run_cmd.step); } diff --git a/examples/bench.zig b/examples/bench_cp.zig similarity index 100% rename from examples/bench.zig rename to examples/bench_cp.zig diff --git a/examples/circle.zig b/examples/circle.zig new file mode 100644 index 0000000..a508626 --- /dev/null +++ b/examples/circle.zig @@ -0,0 +1,76 @@ +const std = @import("std"); +const gpu = @import("gpu"); +const GpuDevice = gpu.GpuDevice; +const GpuArena = gpu.GpuArena; +const GpuBuffer = gpu.GpuBuffer; +const GpuRender = gpu.GpuRender; +const GpuTexture = gpu.GpuTexture; +const GpuTextureView = gpu.GpuTextureView; + +const width: u32 = 512; +const height: u32 = 512; + +pub fn main(init: std.process.Init) !void { + const allocator = init.gpa; + + // 1. Open GPU Device + const device = try GpuDevice.init(.{}); + defer device.deinit(); + + // 2. Init VRAM Arena + var grena = GpuArena.init(allocator, device); + defer grena.deinit(); + const gloc = grena.gpuAllocator(); + + // 3. Load Render Pipeline + const circle_rp = try GpuRender.init( + device, // Change to gloc + track them + @embedFile("shaders/circle.wgsl"), + .{ .bindings = &.{}, .texture_format = .RGBA8Unorm, .topology = .TriangleStrip }, + ); + defer circle_rp.deinit(); + + // 4. Create VRAM texture to render into + const texture = try GpuTexture.init(gloc, .{ + .format = .RGBA8Unorm, + .size = .{ .width = width, .height = height, .depthOrArrayLayers = 1 }, + .usage = .initMany(&.{ .RenderAttachment, .CopySrc }), + }); + defer texture.deinit(); + + // 5. Create a view from texture + const view = try GpuTextureView.init(gloc, texture, .{}); + defer view.deinit(); + + // 6. Run the rendering pipeline + try circle_rp.draw(gloc, view, 4, .{}); + + // 7. Load Texture into GpuBuffer + const cpu_staging_cpu = try texture.buffCopy(gloc); + defer cpu_staging_cpu.deinit(); + + // 8. Read GpuBuffer to CPU + const pixels = try cpu_staging_cpu.read(allocator, u8); + defer allocator.free(pixels); + + // 9. Write a simple ppm image + try savePpm(init.io, "circle.ppm", width, height, pixels); + std.debug.print("Successfully rendered circle to circle.ppm!\n", .{}); +} + +fn savePpm(io: std.Io, filename: []const u8, w: u32, h: u32, rgba_pixels: []const u8) !void { + const file = try std.Io.Dir.cwd().createFile(io, filename, .{}); + defer file.close(io); + + var buf: [255]u8 = undefined; + var writer = file.writer(io, &buf); + + // PPM Header: P6 format means raw RGB bytes + try writer.interface.print("P6\n{d} {d}\n255\n", .{ w, h }); + + // Strip Alpha channel when writing out to standard RGB PPM format + var i: usize = 0; + while (i < rgba_pixels.len) : (i += 4) { + try writer.interface.writeAll(rgba_pixels[i .. i + 3]); + } +} diff --git a/examples/add.zig b/examples/compute.zig similarity index 100% rename from examples/add.zig rename to examples/compute.zig diff --git a/examples/digit.zig b/examples/digit.zig deleted file mode 100644 index 20f35a6..0000000 --- a/examples/digit.zig +++ /dev/null @@ -1,77 +0,0 @@ -// I am using this mnist reduced dataset https://www.kaggle.com/datasets/mohamedgamal07/reduced-mnist - -const std = @import("std"); -const gpu = @import("gpu"); -const GpuDevice = gpu.GpuDevice; -const GpuArena = gpu.GpuArena; -const GpuBuffer = gpu.GpuBuffer; -const GpuProcess = gpu.GpuProcess; - -const BATCHSIZE = 10; -const EPOCH = 10; - -pub fn main(init: std.process.Init) !void { - const allocator = init.gpa; - const io = init.io; - - // 1. Open GPU Device - const device = try GpuDevice.init(.{}); - defer device.deinit(); - - // 2. Create a GPU Arena to manage VRAM - var grena = GpuArena.init(allocator, device); - defer grena.deinit(); - const gloc = grena.gpuAllocator(); - - // 3. Load the WGSL compute pipeline - const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl")); - defer add_process.deinit(); - - var train_dir = try std.Io.Dir.cwd().openDir(io, "mnist/train", .{}); - - var images: [BATCHSIZE * 28 * 28]f16 = undefined; - for (EPOCH) |epoch| { - // Load random images from train dir - train_dir.openDir(io, "0", .{}); - for (BATCHSIZE) |i| { - const file = try train_dir.openFile(io, "0.jpg", .{}); - images[28 * 28 * i .. 28 * 28 * (i + 1)] = file.read - } - } - - // 4. Setup CPU data - const len: usize = 16; - const data_a = try allocator.alloc(f16, len); - defer allocator.free(data_a); - const data_b = try allocator.alloc(f16, len); - defer allocator.free(data_b); - - for (0..len) |i| { - data_a[i] = @floatFromInt(i); - data_b[i] = @floatFromInt(len - 1 - i); - } - - // 5. Initialize raw GPU Buffers - // We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported - const byte_size = len * @sizeOf(f16); - const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); - const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); - const buf_out = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc })); - - // Note: The buffers are safely tied to the GpuArena which will automatically - // release them at the end. You can also manually call buf_x.deinit() if desired. - - // 6. Transfer data from CPU slices to GPU Buffers - try buf_a.load(f16, data_a); - try buf_b.load(f16, data_b); - - // 7. Dispatch the Compute Process - // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly - try add_process.run(gloc, f16, buf_a, buf_b, buf_out); - - // 8. Map and copy the resulting buffer back to the CPU - const out = try buf_out.read(allocator, f16); - defer allocator.free(out); - - std.debug.print("Result: {any}\n", .{out}); -} diff --git a/src/shaders/circle.wgsl b/examples/shaders/circle.wgsl similarity index 100% rename from src/shaders/circle.wgsl rename to examples/shaders/circle.wgsl diff --git a/src/GpuTexture.zig b/src/GpuTexture.zig index 7d75cb0..b819ff2 100644 --- a/src/GpuTexture.zig +++ b/src/GpuTexture.zig @@ -1,6 +1,7 @@ const std = @import("std"); const c = @import("utils.zig").c; const GpuAllocator = @import("GpuAllocator.zig"); +const GpuBuffer = @import("GpuBuffer.zig"); const GpuTextureFormat = @import("lib.zig").GpuTextureFormat; const GpuTextureUsage = @import("lib.zig").GpuTextureUsage; @@ -40,6 +41,45 @@ pub fn getConstMappedRange(self: @This(), offset: u64, size: u64) ?*const anyopa return c.wgpuBufferGetConstMappedRange(self.raw, offset, size); } +pub fn bytesSize(self: @This()) u32 { + return self.bytesSizeRow() * self.def.size.height; +} + +pub fn bytesSizeRow(self: @This()) u32 { + return self.def.size.width * self.def.format.bytesPerPixel(); +} + +/// Return a GpuBuffer containing a copy of the texture. +pub fn buffCopy(self: @This(), gloc: GpuAllocator) !GpuBuffer { + const buf = try GpuBuffer.init(gloc, self.bytesSize(), .initMany(&.{ .CopyDst, .CopySrc })); + + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder; + defer c.wgpuCommandEncoderRelease(enc); + + const src_copy = c.WGPUTexelCopyTextureInfo{ + .texture = self.raw, + .mipLevel = 0, + .origin = .{ .x = 0, .y = 0, .z = 0 }, + .aspect = c.WGPUTextureAspect_All, + }; + const dst_copy = c.WGPUTexelCopyBufferInfo{ + .buffer = buf.raw, + .layout = .{ + .offset = 0, + .bytesPerRow = self.bytesSizeRow(), + .rowsPerImage = self.def.size.height, + }, + }; + + c.wgpuCommandEncoderCopyTextureToBuffer(enc, &src_copy, &dst_copy, &self.def.size); + + const cmd = c.wgpuCommandEncoderFinish(enc, null); + defer c.wgpuCommandBufferRelease(cmd); + c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd); + + return buf; +} + pub fn mapAsync( self: @This(), mode: c.WGPUMapMode, diff --git a/src/circle.zig b/src/circle.zig deleted file mode 100644 index ef9ce7c..0000000 --- a/src/circle.zig +++ /dev/null @@ -1,112 +0,0 @@ -const std = @import("std"); -const gpu = @import("lib.zig"); -const c = @import("utils.zig").c; -const sv = @import("utils.zig").sv; -const GpuDevice = gpu.GpuDevice; -const GpuArena = gpu.GpuArena; -const GpuBuffer = gpu.GpuBuffer; -const GpuRender = gpu.GpuRender; -const GpuTexture = gpu.GpuTexture; -const GpuTextureView = gpu.GpuTextureView; - -pub fn main(init: std.process.Init) !void { - const allocator = init.gpa; - - // 1. Open the raw headless GPU Device you shared - const device = try GpuDevice.init(.{}); - defer device.deinit(); - - var grena = GpuArena.init(allocator, device); - defer grena.deinit(); - const gloc = grena.gpuAllocator(); - - const width: u32 = 512; - const height: u32 = 512; - - // 2. Load our Render Pipeline (Procedural Triangle Strip) - const circle_rp = try GpuRender.init( - device, - @embedFile("shaders/circle.wgsl"), - .{ - .bindings = &.{}, - .texture_format = .RGBA8Unorm, - .topology = .TriangleStrip, - }, - ); - defer circle_rp.deinit(); - - // 3. Create the offscreen VRAM texture to render into - const texture = try GpuTexture.init(gloc, .{ - .format = .RGBA8Unorm, - .size = .{ .width = width, .height = height, .depthOrArrayLayers = 1 }, - .usage = .initMany(&.{ .RenderAttachment, .CopySrc }), - }); - defer texture.deinit(); - - const view = try GpuTextureView.init(gloc, texture, .{}); - defer view.deinit(); - - // 4. Create a staging buffer to pull pixels from VRAM to CPU - // 4 bytes per pixel (RGBA8) - const row_bytes = width * 4; - const buffer_bytes = row_bytes * height; - - // Create a regular GpuBuffer set up to receive texture copy transfers - const cpu_staging_buf = try GpuBuffer.init(gloc, buffer_bytes, .initMany(&.{ .CopyDst, .CopySrc })); - - // 5. Draw the Circle Frame into the texture view! - try circle_rp.draw(gloc, view, 4, .{}); - - // 6. Copy the texture data into our CPU staging buffer - const enc = c.wgpuDeviceCreateCommandEncoder(device.device, null) orelse return error.Encoder; - defer c.wgpuCommandEncoderRelease(enc); - - const src_copy = c.WGPUTexelCopyTextureInfo{ - .texture = texture.raw, - .mipLevel = 0, - .origin = .{ .x = 0, .y = 0, .z = 0 }, - .aspect = c.WGPUTextureAspect_All, - }; - const dst_copy = c.WGPUTexelCopyBufferInfo{ - .buffer = cpu_staging_buf.raw, - .layout = .{ - .offset = 0, - .bytesPerRow = row_bytes, - .rowsPerImage = height, - }, - }; - const copy_size = c.WGPUExtent3D{ .width = width, .height = height, .depthOrArrayLayers = 1 }; - - c.wgpuCommandEncoderCopyTextureToBuffer(enc, &src_copy, &dst_copy, ©_size); - - const cmd = c.wgpuCommandEncoderFinish(enc, null); - defer c.wgpuCommandBufferRelease(cmd); - c.wgpuQueueSubmit(device.queue, 1, &cmd); - - // 7. Map and read the raw image bytes back to CPU - // (This uses whatever slice-reading helpers your `GpuBuffer` wrapper provides) - const pixels = try cpu_staging_buf.read(allocator, u8); - defer allocator.free(pixels); - - // Now you have the raw binary image data! Let's output a simple Netpbm PPM image file - // so you can actually open and look at your rendered circle. - try savePpm(init.io, "circle.ppm", width, height, pixels); - std.debug.print("Successfully rendered circle to circle.ppm!\n", .{}); -} - -fn savePpm(io: std.Io, filename: []const u8, w: u32, h: u32, rgba_pixels: []const u8) !void { - const file = try std.Io.Dir.cwd().createFile(io, filename, .{}); - defer file.close(io); - - var buf: [255]u8 = undefined; - var writer = file.writer(io, &buf); - - // PPM Header: P6 format means raw RGB bytes - try writer.interface.print("P6\n{d} {d}\n255\n", .{ w, h }); - - // Strip Alpha channel when writing out to standard RGB PPM format - var i: usize = 0; - while (i < rgba_pixels.len) : (i += 4) { - try writer.interface.writeAll(rgba_pixels[i .. i + 3]); - } -} diff --git a/src/shaders/add.wgsl b/src/shaders/add.wgsl deleted file mode 100644 index 4288742..0000000 --- a/src/shaders/add.wgsl +++ /dev/null @@ -1,24 +0,0 @@ -enable f16; - -@group(0) @binding(0) var A: array; -@group(0) @binding(1) var B: array; -@group(0) @binding(2) var C: array; -@group(0) @binding(3) var size: u32; - -@compute @workgroup_size(256) -fn main( - @builtin(global_invocation_id) global_id : vec3, - @builtin(num_workgroups) num_workgroups: vec3 -) { - // 1. Calculate the total number of threads across the entire grid - let total_threads = num_workgroups.x * 256u; - - // 2. Start at this thread's unique global ID - var index = global_id.x; - - // 3. Stride through the tensor elements - while (index < size) { - C[index] = A[index] + B[index]; - index += total_threads; // Jump forward by the total thread count - } -}