diff --git a/src/Mat.zig b/src/Mat.zig index e81d4ed..c3cde6f 100644 --- a/src/Mat.zig +++ b/src/Mat.zig @@ -1,3 +1,4 @@ +/// Dummy const std = @import("std"); const c = @import("c.zig").c; const GpuAllocator = @import("GpuAllocator.zig"); @@ -62,8 +63,8 @@ pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat { return result; } -pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void { - std.debug.assert(out.len >= self.len()); +pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 { + const out = try alloc.alloc(f32, self.len()); const bytes = self.byteSize(); const staging = try GpuBuffer.init( @@ -94,6 +95,8 @@ pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void { )); @memcpy(out[0..self.len()], ptr[0..self.len()]); staging.unmap(); + + return out; } fn onMapped( diff --git a/src/main.zig b/src/main.zig index d836133..9ed4e20 100644 --- a/src/main.zig +++ b/src/main.zig @@ -53,13 +53,8 @@ pub fn main(init: std.process.Init) !void { const sum = try a.add(&gloc, b); defer sum.deinit(); - // Read back (allocating dynamically for read-back buffers too) - const out_sum = try allocator.alloc(f32, size); - defer allocator.free(out_sum); - const out_scaled = try allocator.alloc(f32, size); - defer allocator.free(out_scaled); - - try sum.read(&gloc, out_sum); + const out = try sum.read(&gloc, allocator); + defer allocator.free(out); const duration = start.durationTo(std.Io.Clock.awake.now(init.io)); const ns = duration.toNanoseconds(); diff --git a/src/reference.zig b/src/reference.zig deleted file mode 100644 index 4392f24..0000000 --- a/src/reference.zig +++ /dev/null @@ -1,238 +0,0 @@ -// Minimal WebGPU compute in Zig: element-wise matrix addition -// Uses wgpu-native C bindings. -// Build: see ../build.zig -// -// Data flow: -// CPU (mat_a, mat_b) → GPU storage buffers → compute shader → GPU buf_c -// → staging buffer (mapped) → CPU read → print - -const std = @import("std"); -const c = @cImport(@cInclude("wgpu.h")); - -// ── Config ──────────────────────────────────────────────────────────────────── -const ROWS: u32 = 4; -const COLS: u32 = 4; -const N = ROWS * COLS; // 16 elements -const BUF_BYTES = N * @sizeOf(f32); - -// ── WGSL Compute Shader ─────────────────────────────────────────────────────── -// workgroup_size(4,4) matches one full 4×4 matrix → dispatch(1,1,1) -const SHADER = - \\@group(0) @binding(0) var mat_a : array; - \\@group(0) @binding(1) var mat_b : array; - \\@group(0) @binding(2) var mat_c : array; - \\ - \\@compute @workgroup_size(4, 4) - \\fn main(@builtin(global_invocation_id) gid : vec3) { - \\ let idx = gid.y * 4u + gid.x; - \\ if (idx < arrayLength(&mat_c)) { - \\ mat_c[idx] = mat_a[idx] + mat_b[idx]; - \\ } - \\} -; - -// ── Callback state ──────────────────────────────────────────────────────────── -const Ctx = struct { - adapter: c.WGPUAdapter = null, - device: c.WGPUDevice = null, -}; - -fn onAdapter( - status: c.WGPURequestAdapterStatus, - adapter: c.WGPUAdapter, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestAdapterStatus_Success) { - std.log.err("Adapter request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.adapter = adapter; -} - -fn onDevice( - status: c.WGPURequestDeviceStatus, - device: c.WGPUDevice, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestDeviceStatus_Success) { - std.log.err("Device request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.device = device; -} - -fn onMapped( - status: c.WGPUMapAsyncStatus, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - const flag: *bool = @ptrCast(@alignCast(userdata1.?)); - flag.* = (status == c.WGPUMapAsyncStatus_Success); -} - -fn sv(s: []const u8) c.WGPUStringView { - return .{ .data = s.ptr, .length = s.len }; -} - -// ── Main ────────────────────────────────────────────────────────────────────── -pub fn main() !void { - - // 1. Instance ────────────────────────────────────────────────────────────── - const instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse - return error.NoInstance; - defer c.wgpuInstanceRelease(instance); - - // 2. Adapter (async → poll) ──────────────────────────────────────────────── - var ctx = Ctx{}; - _ = c.wgpuInstanceRequestAdapter( - instance, - &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, - .{ .callback = onAdapter, .userdata1 = &ctx }, - ); - c.wgpuInstanceProcessEvents(instance); // drive callbacks - const adapter = ctx.adapter orelse return error.NoAdapter; - defer c.wgpuAdapterRelease(adapter); - - // 3. Device ──────────────────────────────────────────────────────────────── - _ = c.wgpuAdapterRequestDevice(adapter, null, .{ .callback = onDevice, .userdata1 = &ctx }); - c.wgpuInstanceProcessEvents(instance); - const device = ctx.device orelse return error.NoDevice; - defer c.wgpuDeviceRelease(device); - - const queue = c.wgpuDeviceGetQueue(device); - defer c.wgpuQueueRelease(queue); - - // 4. Input data ──────────────────────────────────────────────────────────── - // mat_a[i] = i (0 … 15) - // mat_b[i] = 15 − i → every element of mat_c should equal 15 - var mat_a: [N]f32 = undefined; - var mat_b: [N]f32 = undefined; - for (0..N) |i| { - mat_a[i] = @floatFromInt(i); - mat_b[i] = @floatFromInt(N - 1 - i); - } - - // 5. GPU Buffers ─────────────────────────────────────────────────────────── - const buf_a = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferA; - - const buf_b = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferB; - - // buf_c: GPU-only result; staging: CPU-readable copy - const buf_c = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopySrc, - .size = BUF_BYTES, - }) orelse return error.BufferC; - - const buf_staging = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferStaging; - - defer c.wgpuBufferRelease(buf_a); - defer c.wgpuBufferRelease(buf_b); - defer c.wgpuBufferRelease(buf_c); - defer c.wgpuBufferRelease(buf_staging); - - // Upload inputs - c.wgpuQueueWriteBuffer(queue, buf_a, 0, &mat_a, BUF_BYTES); - c.wgpuQueueWriteBuffer(queue, buf_b, 0, &mat_b, BUF_BYTES); - - // 6. Shader module ───────────────────────────────────────────────────────── - // ✅ New API (0.20+) - var wgsl_src = c.WGPUShaderSourceWGSL{ - .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, - .code = sv(SHADER), - }; - const shader = c.wgpuDeviceCreateShaderModule(device, &.{ - .nextInChain = @ptrCast(&wgsl_src), - }) orelse return error.Shader; - - // 7. Compute pipeline (layout auto-inferred from shader) ─────────────────── - // ✅ - const pipeline = c.wgpuDeviceCreateComputePipeline(device, &.{ - .compute = .{ - .module = shader, - .entryPoint = sv("main"), - }, - }) orelse return error.Pipeline; - defer c.wgpuComputePipelineRelease(pipeline); - - // 8. Bind group ──────────────────────────────────────────────────────────── - const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); - defer c.wgpuBindGroupLayoutRelease(bgl); - - const entries = [_]c.WGPUBindGroupEntry{ - .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = BUF_BYTES }, - .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = BUF_BYTES }, - .{ .binding = 2, .buffer = buf_c, .offset = 0, .size = BUF_BYTES }, - }; - const bind_group = c.wgpuDeviceCreateBindGroup(device, &.{ - .layout = bgl, - .entries = &entries, - .entryCount = entries.len, - }) orelse return error.BindGroup; - defer c.wgpuBindGroupRelease(bind_group); - - // 9. Encode compute pass + buffer copy ──────────────────────────────────── - const encoder = c.wgpuDeviceCreateCommandEncoder(device, null) orelse - return error.Encoder; - - const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null); - c.wgpuComputePassEncoderSetPipeline(pass, pipeline); - c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null); - // dispatch(1,1,1): one workgroup of size (4,4) covers the whole 4×4 matrix - c.wgpuComputePassEncoderDispatchWorkgroups(pass, 1, 1, 1); - c.wgpuComputePassEncoderEnd(pass); - c.wgpuComputePassEncoderRelease(pass); - - // Copy result buffer → CPU-readable staging buffer - c.wgpuCommandEncoderCopyBufferToBuffer(encoder, buf_c, 0, buf_staging, 0, BUF_BYTES); - - const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null); - defer c.wgpuCommandEncoderRelease(encoder); - defer c.wgpuCommandBufferRelease(cmdbuf); - - // 10. Submit ─────────────────────────────────────────────────────────────── - c.wgpuQueueSubmit(queue, 1, &cmdbuf); - - // 11. Map staging buffer back to CPU ────────────────────────────────────── - var mapped = false; - _ = c.wgpuBufferMapAsync( - buf_staging, - c.WGPUMapMode_Read, - 0, - BUF_BYTES, - .{ .callback = onMapped, .userdata1 = &mapped }, - ); - // Poll the device until the async map completes - while (!mapped) _ = c.wgpuDevicePoll(device, 1, null); - - const ptr: [*]const f32 = @ptrCast(@alignCast( - c.wgpuBufferGetConstMappedRange(buf_staging, 0, BUF_BYTES), - )); - const result = ptr[0..N]; - - // 12. Print ──────────────────────────────────────────────────────────────── - std.debug.print("\nmat_a + mat_b ({d}×{d}):\n", .{ ROWS, COLS }); - for (0..ROWS) |r| { - for (0..COLS) |col| - std.debug.print("{d:6.0}", .{result[r * COLS + col]}); - std.debug.print("\n", .{}); - } - // Expected output: every cell = 15.0 - - c.wgpuBufferUnmap(buf_staging); -} diff --git a/src/shaders.zig b/src/shaders.zig index 4e21cac..26f2330 100644 --- a/src/shaders.zig +++ b/src/shaders.zig @@ -1,18 +1,3 @@ const std = @import("std"); pub const SHADER_ADD = @embedFile("shaders/add.wgsl"); - -pub const SHADER_SCALE = - \\struct Uniforms { scalar : f32 } - \\@group(0) @binding(0) var a : array; - \\@group(0) @binding(1) var out : array; - \\@group(0) @binding(2) var u : Uniforms; - \\ - \\@compute @workgroup_size(256) - \\fn main(@builtin(global_invocation_id) gid : vec3) { - \\ let i = gid.x; - \\ if (i < arrayLength(&out)) { - \\ out[i] = a[i] * u.scalar; - \\ } - \\} -;