From 8ff91d35ecbbb9f8b25cda4b49afb5211190f4f6 Mon Sep 17 00:00:00 2001 From: adrien Date: Fri, 15 May 2026 09:44:53 +0200 Subject: [PATCH] Basic GpuAlloc + Mat --- src/gpu.zig | 399 ++++++++++++++++++++++++++++++++++++++++++++++ src/main.zig | 258 ++++-------------------------- src/reference.zig | 239 +++++++++++++++++++++++++++ src/wgpu.zig | 137 ---------------- 4 files changed, 672 insertions(+), 361 deletions(-) create mode 100644 src/gpu.zig create mode 100644 src/reference.zig delete mode 100644 src/wgpu.zig diff --git a/src/gpu.zig b/src/gpu.zig new file mode 100644 index 0000000..614d140 --- /dev/null +++ b/src/gpu.zig @@ -0,0 +1,399 @@ +/// gpu.zig — WebGPU compute module +/// +/// Usage: +/// var gloc = try GpuAllocator.init(); +/// defer gloc.deinit(); +/// +/// const a = try Mat.load(&gloc, &data_a, 4, 4); +/// defer a.deinit(); +/// const b = try Mat.load(&gloc, &data_b, 4, 4); +/// defer b.deinit(); +/// +/// const c = try a.add(&gloc, b); +/// defer c.deinit(); +/// +/// var out: [16]f32 = undefined; +/// try c.read(&gloc, &out); +const std = @import("std"); +const c = @cImport(@cInclude("wgpu.h")); + +// ── Shaders ─────────────────────────────────────────────────────────────────── + +const SHADER_ADD = + \\@group(0) @binding(0) var<storage, read> a : array<f32>; + \\@group(0) @binding(1) var<storage, read> b : array<f32>; + \\@group(0) @binding(2) var<storage, read_write> out : array<f32>; + \\ + \\@compute @workgroup_size(64) + \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) { + \\ let i = gid.x; + \\ if (i < arrayLength(&out)) { + \\ out[i] = a[i] + b[i]; + \\ } + \\} +; + +const SHADER_SCALE = + \\struct Uniforms { scalar : f32 } + \\@group(0) @binding(0) var<storage, read> a : array<f32>; + \\@group(0) @binding(1) var<storage, read_write> out : array<f32>; + \\@group(0) @binding(2) var<uniform> u : Uniforms; + \\ + \\@compute @workgroup_size(64) + \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) { + \\ let i = gid.x; + \\ if (i < arrayLength(&out)) { + \\ out[i] = a[i] * u.scalar; + \\ } + \\} +; + +// ── Helpers 
─────────────────────────────────────────────────────────────────── + +fn sv(s: []const u8) c.WGPUStringView { + return .{ .data = s.ptr, .length = s.len }; +} + +fn ceilDiv(n: u32, d: u32) u32 { + return (n + d - 1) / d; +} + +// ── Callbacks ───────────────────────────────────────────────────────────────── + +const Ctx = struct { + adapter: c.WGPUAdapter = null, + device: c.WGPUDevice = null, +}; + +fn onAdapter( + status: c.WGPURequestAdapterStatus, + adapter: c.WGPUAdapter, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestAdapterStatus_Success) { + std.log.err("Adapter request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.adapter = adapter; +} + +fn onDevice( + status: c.WGPURequestDeviceStatus, + device: c.WGPUDevice, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestDeviceStatus_Success) { + std.log.err("Device request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.device = device; + std.debug.print("{?}", .{device}); +} + +fn onMapped( + status: c.WGPUMapAsyncStatus, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + const flag: *bool = @ptrCast(@alignCast(userdata1.?)); + flag.* = (status == c.WGPUMapAsyncStatus_Success); +} + +// ── Pipeline factory ────────────────────────────────────────────────────────── + +fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline { + var wgsl_src = c.WGPUShaderSourceWGSL{ + .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, + .code = sv(wgsl), + }; + const shader = c.wgpuDeviceCreateShaderModule(device, &.{ + .nextInChain = @ptrCast(&wgsl_src), + }) orelse return error.Shader; + defer c.wgpuShaderModuleRelease(shader); + + return c.wgpuDeviceCreateComputePipeline(device, &.{ + .compute = .{ 
.module = shader, .entryPoint = sv("main") }, + }) orelse error.Pipeline; +} + +// ── GpuAllocator ────────────────────────────────────────────────────────────── + +/// Owns the WebGPU context. One per application. +/// All Mat instances must not outlive their GpuAllocator. +pub const GpuAllocator = struct { + instance: c.WGPUInstance, + adapter: c.WGPUAdapter, + device: c.WGPUDevice, + queue: c.WGPUQueue, + + // Lazily created, cached for lifetime of allocator + _pip_add: c.WGPUComputePipeline = null, + _pip_scale: c.WGPUComputePipeline = null, + + pub fn init() !GpuAllocator { + const instance = c.wgpuCreateInstance( + &std.mem.zeroes(c.WGPUInstanceDescriptor), + ) orelse return error.NoInstance; + errdefer c.wgpuInstanceRelease(instance); + + var ctx = Ctx{}; + _ = c.wgpuInstanceRequestAdapter( + instance, + &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, + .{ .callback = onAdapter, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); + const adapter = ctx.adapter orelse return error.NoAdapter; + errdefer c.wgpuAdapterRelease(adapter); + + _ = c.wgpuAdapterRequestDevice( + adapter, + null, + .{ .callback = onDevice, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); + const device = ctx.device orelse return error.NoDevice; + + return .{ + .instance = instance, + .adapter = adapter, + .device = device, + .queue = c.wgpuDeviceGetQueue(device), + }; + } + + pub fn deinit(self: *GpuAllocator) void { + if (self._pip_add) |p| c.wgpuComputePipelineRelease(p); + if (self._pip_scale) |p| c.wgpuComputePipelineRelease(p); + c.wgpuQueueRelease(self.queue); + c.wgpuDeviceRelease(self.device); + c.wgpuAdapterRelease(self.adapter); + c.wgpuInstanceRelease(self.instance); + } + + // ── Internal ───────────────────────────────────────────────────────────── + + pub fn makeBuffer( + self: *GpuAllocator, + bytes: u64, + usage: c.WGPUBufferUsage, + ) !c.WGPUBuffer { + return c.wgpuDeviceCreateBuffer(self.device, &.{ + .usage = usage, + 
.size = bytes, + }) orelse error.BufferAlloc; + } + + fn pipAdd(self: *GpuAllocator) !c.WGPUComputePipeline { + if (self._pip_add == null) + self._pip_add = try buildPipeline(self.device, SHADER_ADD); + return self._pip_add.?; + } + + fn pipScale(self: *GpuAllocator) !c.WGPUComputePipeline { + if (self._pip_scale == null) + self._pip_scale = try buildPipeline(self.device, SHADER_SCALE); + return self._pip_scale.?; + } + + /// Poll until GPU work completes. Use after submit if you need CPU sync. + pub fn poll(self: *GpuAllocator) void { + _ = c.wgpuDevicePoll(self.device, 1, null); + } +}; + +/// GPU-resident f32 matrix. +/// Buffer usage: Storage | CopyDst | CopySrc (usable as input and output). +pub const Mat = struct { + buf: c.WGPUBuffer, + rows: u32, + cols: u32, + + // ── Lifecycle ───────────────────────────────────────────────────────────── + + /// Allocate GPU buffer and upload `data`. `data.len` must equal rows*cols. + pub fn load( + gloc: *GpuAllocator, + data: []const f32, + rows: u32, + cols: u32, + ) !Mat { + std.debug.assert(data.len == @as(usize, rows) * cols); + const bytes = data.len * @sizeOf(f32); + const buf = try gloc.makeBuffer( + bytes, + c.WGPUBufferUsage_Storage | + c.WGPUBufferUsage_CopyDst | + c.WGPUBufferUsage_CopySrc, + ); + c.wgpuQueueWriteBuffer(gloc.queue, buf, 0, data.ptr, bytes); + return .{ .buf = buf, .rows = rows, .cols = cols }; + } + + /// Allocate zeroed GPU buffer (no upload). 
+ pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat { + const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32); + const buf = try gloc.makeBuffer( + bytes, + c.WGPUBufferUsage_Storage | + c.WGPUBufferUsage_CopyDst | + c.WGPUBufferUsage_CopySrc, + ); + return .{ .buf = buf, .rows = rows, .cols = cols }; + } + + pub fn deinit(self: Mat) void { + c.wgpuBufferRelease(self.buf); + } + + pub fn len(self: Mat) u32 { + return self.rows * self.cols; + } + + pub fn byteSize(self: Mat) u64 { + return @as(u64, self.len()) * @sizeOf(f32); + } + + /// Element-wise add. Shapes must match. Returns new Mat (caller owns). + pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat { + std.debug.assert(self.rows == other.rows and self.cols == other.cols); + + const result = try Mat.zeros(gloc, self.rows, self.cols); + errdefer result.deinit(); + + const pipeline = try gloc.pipAdd(); + try dispatch2in1out(gloc, pipeline, self.buf, other.buf, result.buf, self.byteSize(), self.len()); + + return result; + } + + /// Element-wise multiply by scalar. Returns new Mat (caller owns). 
+ pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat { + const result = try Mat.zeros(gloc, self.rows, self.cols); + errdefer result.deinit(); + + const bytes = self.byteSize(); + const n = self.len(); + + // Upload scalar as uniform buffer + const uni_buf = try gloc.makeBuffer( + @sizeOf(f32), + c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst, + ); + defer c.wgpuBufferRelease(uni_buf); + c.wgpuQueueWriteBuffer(gloc.queue, uni_buf, 0, &scalar, @sizeOf(f32)); + + const pipeline = try gloc.pipScale(); + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const entries = [_]c.WGPUBindGroupEntry{ + .{ .binding = 0, .buffer = self.buf, .offset = 0, .size = bytes }, + .{ .binding = 1, .buffer = result.buf, .offset = 0, .size = bytes }, + .{ .binding = 2, .buffer = uni_buf, .offset = 0, .size = @sizeOf(f32) }, + }; + try submitPass(gloc, pipeline, &entries, n); + + return result; + } + + /// Read GPU buffer back to CPU. `out.len` must be >= rows*cols. 
+ pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void { + std.debug.assert(out.len >= self.len()); + const bytes = self.byteSize(); + + const staging = try gloc.makeBuffer( + bytes, + c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, + ); + defer c.wgpuBufferRelease(staging); + + // Copy result → staging + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse + return error.Encoder; + c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf, 0, staging, 0, bytes); + const cmd = c.wgpuCommandEncoderFinish(enc, null); + defer c.wgpuCommandEncoderRelease(enc); + defer c.wgpuCommandBufferRelease(cmd); + c.wgpuQueueSubmit(gloc.queue, 1, &cmd); + + // Map and copy to slice + var mapped = false; + _ = c.wgpuBufferMapAsync( + staging, + c.WGPUMapMode_Read, + 0, + bytes, + .{ .callback = onMapped, .userdata1 = &mapped }, + ); + while (!mapped) gloc.poll(); + + const ptr: [*]const f32 = @ptrCast(@alignCast( + c.wgpuBufferGetConstMappedRange(staging, 0, bytes), + )); + @memcpy(out[0..self.len()], ptr[0..self.len()]); + c.wgpuBufferUnmap(staging); + } +}; + +// ── Dispatch helpers ────────────────────────────────────────────────────────── + +/// Encode + submit a 2-input, 1-output compute pass (used by add). +fn dispatch2in1out( + gloc: *GpuAllocator, + pipeline: c.WGPUComputePipeline, + buf_a: c.WGPUBuffer, + buf_b: c.WGPUBuffer, + buf_out: c.WGPUBuffer, + bytes: u64, + n: u32, +) !void { + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const entries = [_]c.WGPUBindGroupEntry{ + .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = bytes }, + .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = bytes }, + .{ .binding = 2, .buffer = buf_out, .offset = 0, .size = bytes }, + }; + try submitPass(gloc, pipeline, &entries, n); +} + +/// Create bind group, encode pass, submit. workgroup_size=64. 
+fn submitPass( + gloc: *GpuAllocator, + pipeline: c.WGPUComputePipeline, + entries: []const c.WGPUBindGroupEntry, + n: u32, +) !void { + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{ + .layout = bgl, + .entries = entries.ptr, + .entryCount = entries.len, + }) orelse return error.BindGroup; + defer c.wgpuBindGroupRelease(bg); + + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse + return error.Encoder; + const pass = c.wgpuCommandEncoderBeginComputePass(enc, null); + c.wgpuComputePassEncoderSetPipeline(pass, pipeline); + c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null); + c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1); + c.wgpuComputePassEncoderEnd(pass); + c.wgpuComputePassEncoderRelease(pass); + + const cmd = c.wgpuCommandEncoderFinish(enc, null); + defer c.wgpuCommandEncoderRelease(enc); + defer c.wgpuCommandBufferRelease(cmd); + c.wgpuQueueSubmit(gloc.queue, 1, &cmd); +} diff --git a/src/main.zig b/src/main.zig index 6c80dfa..92d9de8 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,239 +1,49 @@ -// Minimal WebGPU compute in Zig: element-wise matrix addition -// Uses wgpu-native C bindings. 
-// Build: see ../build.zig -// -// Data flow: -// CPU (mat_a, mat_b) → GPU storage buffers → compute shader → GPU buf_c -// → staging buffer (mapped) → CPU read → print - const std = @import("std"); -const c = @cImport(@cInclude("wgpu.h")); +const gpu = @import("gpu.zig"); -// ── Config ──────────────────────────────────────────────────────────────────── -const ROWS: u32 = 4; -const COLS: u32 = 4; -const N = ROWS * COLS; // 16 elements -const BUF_BYTES = N * @sizeOf(f32); - -// ── WGSL Compute Shader ─────────────────────────────────────────────────────── -// workgroup_size(4,4) matches one full 4×4 matrix → dispatch(1,1,1) -const SHADER = - \\@group(0) @binding(0) var mat_a : array; - \\@group(0) @binding(1) var mat_b : array; - \\@group(0) @binding(2) var mat_c : array; - \\ - \\@compute @workgroup_size(4, 4) - \\fn main(@builtin(global_invocation_id) gid : vec3) { - \\ let idx = gid.y * 4u + gid.x; - \\ if (idx < arrayLength(&mat_c)) { - \\ mat_c[idx] = mat_a[idx] + mat_b[idx]; - \\ } - \\} -; - -// ── Callback state ──────────────────────────────────────────────────────────── -const Ctx = struct { - adapter: c.WGPUAdapter = null, - device: c.WGPUDevice = null, -}; - -fn onAdapter( - status: c.WGPURequestAdapterStatus, - adapter: c.WGPUAdapter, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestAdapterStatus_Success) { - std.log.err("Adapter request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.adapter = adapter; -} - -fn onDevice( - status: c.WGPURequestDeviceStatus, - device: c.WGPUDevice, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestDeviceStatus_Success) { - std.log.err("Device request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.device = device; - std.debug.print("{?}", .{device}); -} - -fn 
onMapped( - status: c.WGPUMapAsyncStatus, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - const flag: *bool = @ptrCast(@alignCast(userdata1.?)); - flag.* = (status == c.WGPUMapAsyncStatus_Success); -} - -fn sv(s: []const u8) c.WGPUStringView { - return .{ .data = s.ptr, .length = s.len }; -} - -// ── Main ────────────────────────────────────────────────────────────────────── pub fn main() !void { + var gloc = try gpu.GpuAllocator.init(); + defer gloc.deinit(); - // 1. Instance ────────────────────────────────────────────────────────────── - const instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse - return error.NoInstance; - defer c.wgpuInstanceRelease(instance); - - // 2. Adapter (async → poll) ──────────────────────────────────────────────── - var ctx = Ctx{}; - _ = c.wgpuInstanceRequestAdapter( - instance, - &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, - .{ .callback = onAdapter, .userdata1 = &ctx }, - ); - c.wgpuInstanceProcessEvents(instance); // drive callbacks - const adapter = ctx.adapter orelse return error.NoAdapter; - defer c.wgpuAdapterRelease(adapter); - - // 3. Device ──────────────────────────────────────────────────────────────── - _ = c.wgpuAdapterRequestDevice(adapter, null, .{ .callback = onDevice, .userdata1 = &ctx }); - c.wgpuInstanceProcessEvents(instance); - const device = ctx.device orelse return error.NoDevice; - defer c.wgpuDeviceRelease(device); - - const queue = c.wgpuDeviceGetQueue(device); - defer c.wgpuQueueRelease(queue); - - // 4. 
Input data ──────────────────────────────────────────────────────────── - // mat_a[i] = i (0 … 15) - // mat_b[i] = 15 − i → every element of mat_c should equal 15 - var mat_a: [N]f32 = undefined; - var mat_b: [N]f32 = undefined; - for (0..N) |i| { - mat_a[i] = @floatFromInt(i); - mat_b[i] = @floatFromInt(N - 1 - i); + // Input data: a[i] = i, b[i] = 15 - i → add should give all 15s + var data_a: [16]f32 = undefined; + var data_b: [16]f32 = undefined; + for (0..16) |i| { + data_a[i] = @floatFromInt(i); + data_b[i] = @floatFromInt(15 - i); } - // 5. GPU Buffers ─────────────────────────────────────────────────────────── - const buf_a = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferA; + const a = try gpu.Mat.load(&gloc, &data_a, 4, 4); + defer a.deinit(); + const b = try gpu.Mat.load(&gloc, &data_b, 4, 4); + defer b.deinit(); - const buf_b = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferB; + // a + b + const sum = try a.add(&gloc, b); + defer sum.deinit(); - // buf_c: GPU-only result; staging: CPU-readable copy - const buf_c = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopySrc, - .size = BUF_BYTES, - }) orelse return error.BufferC; + // sum * 2 + const scaled = try sum.scale(&gloc, 2.0); + defer scaled.deinit(); - const buf_staging = c.wgpuDeviceCreateBuffer(device, &.{ - .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, - .size = BUF_BYTES, - }) orelse return error.BufferStaging; + // Read back + var out_sum: [16]f32 = undefined; + var out_scaled: [16]f32 = undefined; + try sum.read(&gloc, &out_sum); + try scaled.read(&gloc, &out_scaled); - defer c.wgpuBufferRelease(buf_a); - defer c.wgpuBufferRelease(buf_b); - defer c.wgpuBufferRelease(buf_c); - defer c.wgpuBufferRelease(buf_staging); + 
// Print + std.debug.print("\na + b (expect all 15):\n", .{}); + printMat(&out_sum, 4, 4); - // Upload inputs - c.wgpuQueueWriteBuffer(queue, buf_a, 0, &mat_a, BUF_BYTES); - c.wgpuQueueWriteBuffer(queue, buf_b, 0, &mat_b, BUF_BYTES); + std.debug.print("\n(a + b) * 2 (expect all 30):\n", .{}); + printMat(&out_scaled, 4, 4); +} - // 6. Shader module ───────────────────────────────────────────────────────── - // ✅ New API (0.20+) - var wgsl_src = c.WGPUShaderSourceWGSL{ - .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, - .code = sv(SHADER), - }; - const shader = c.wgpuDeviceCreateShaderModule(device, &.{ - .nextInChain = @ptrCast(&wgsl_src), - }) orelse return error.Shader; - - // 7. Compute pipeline (layout auto-inferred from shader) ─────────────────── - // ✅ - const pipeline = c.wgpuDeviceCreateComputePipeline(device, &.{ - .compute = .{ - .module = shader, - .entryPoint = sv("main"), - }, - }) orelse return error.Pipeline; - defer c.wgpuComputePipelineRelease(pipeline); - - // 8. Bind group ──────────────────────────────────────────────────────────── - const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); - defer c.wgpuBindGroupLayoutRelease(bgl); - - const entries = [_]c.WGPUBindGroupEntry{ - .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = BUF_BYTES }, - .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = BUF_BYTES }, - .{ .binding = 2, .buffer = buf_c, .offset = 0, .size = BUF_BYTES }, - }; - const bind_group = c.wgpuDeviceCreateBindGroup(device, &.{ - .layout = bgl, - .entries = &entries, - .entryCount = entries.len, - }) orelse return error.BindGroup; - defer c.wgpuBindGroupRelease(bind_group); - - // 9. 
Encode compute pass + buffer copy ──────────────────────────────────── - const encoder = c.wgpuDeviceCreateCommandEncoder(device, null) orelse - return error.Encoder; - - const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null); - c.wgpuComputePassEncoderSetPipeline(pass, pipeline); - c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null); - // dispatch(1,1,1): one workgroup of size (4,4) covers the whole 4×4 matrix - c.wgpuComputePassEncoderDispatchWorkgroups(pass, 1, 1, 1); - c.wgpuComputePassEncoderEnd(pass); - c.wgpuComputePassEncoderRelease(pass); - - // Copy result buffer → CPU-readable staging buffer - c.wgpuCommandEncoderCopyBufferToBuffer(encoder, buf_c, 0, buf_staging, 0, BUF_BYTES); - - const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null); - defer c.wgpuCommandEncoderRelease(encoder); - defer c.wgpuCommandBufferRelease(cmdbuf); - - // 10. Submit ─────────────────────────────────────────────────────────────── - c.wgpuQueueSubmit(queue, 1, &cmdbuf); - - // 11. Map staging buffer back to CPU ────────────────────────────────────── - var mapped = false; - _ = c.wgpuBufferMapAsync( - buf_staging, - c.WGPUMapMode_Read, - 0, - BUF_BYTES, - .{ .callback = onMapped, .userdata1 = &mapped }, - ); - // Poll the device until the async map completes - while (!mapped) _ = c.wgpuDevicePoll(device, 1, null); - - const ptr: [*]const f32 = @ptrCast(@alignCast( - c.wgpuBufferGetConstMappedRange(buf_staging, 0, BUF_BYTES), - )); - const result = ptr[0..N]; - - // 12. 
Print ──────────────────────────────────────────────── - std.debug.print("\nmat_a + mat_b ({d}×{d}):\n", .{ ROWS, COLS }); - for (0..ROWS) |r| { - for (0..COLS) |col| - std.debug.print("{d:6.0}", .{result[r * COLS + col]}); +fn printMat(data: []const f32, rows: u32, cols: u32) void { + for (0..rows) |r| { + for (0..cols) |col| + std.debug.print("{d:6.0}", .{data[r * cols + col]}); std.debug.print("\n", .{}); } - // Expected output: every cell = 15.0 - - c.wgpuBufferUnmap(buf_staging); } diff --git a/src/reference.zig b/src/reference.zig new file mode 100644 index 0000000..6c80dfa --- /dev/null +++ b/src/reference.zig @@ -0,0 +1,239 @@ +// Minimal WebGPU compute in Zig: element-wise matrix addition +// Uses wgpu-native C bindings. +// Build: see ../build.zig +// +// Data flow: +// CPU (mat_a, mat_b) → GPU storage buffers → compute shader → GPU buf_c +// → staging buffer (mapped) → CPU read → print + +const std = @import("std"); +const c = @cImport(@cInclude("wgpu.h")); + +// ── Config ──────────────────────────────────────────────────────────────────── +const ROWS: u32 = 4; +const COLS: u32 = 4; +const N = ROWS * COLS; // 16 elements +const BUF_BYTES = N * @sizeOf(f32); + +// ── WGSL Compute Shader ─────────────────────────────────────────────────────── +// workgroup_size(4,4) matches one full 4×4 matrix → dispatch(1,1,1) +const SHADER = + \\@group(0) @binding(0) var<storage, read> mat_a : array<f32>; + \\@group(0) @binding(1) var<storage, read> mat_b : array<f32>; + \\@group(0) @binding(2) var<storage, read_write> mat_c : array<f32>; + \\ + \\@compute @workgroup_size(4, 4) + \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) { + \\ let idx = gid.y * 4u + gid.x; + \\ if (idx < arrayLength(&mat_c)) { + \\ mat_c[idx] = mat_a[idx] + mat_b[idx]; + \\ } + \\} +; + +// ── Callback state ──────────────────────────────────────────────────────────── +const Ctx = struct { + adapter: c.WGPUAdapter = null, + device: c.WGPUDevice = null, +}; + +fn onAdapter( + status: c.WGPURequestAdapterStatus, + adapter: c.WGPUAdapter, + _: 
c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestAdapterStatus_Success) { + std.log.err("Adapter request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.adapter = adapter; +} + +fn onDevice( + status: c.WGPURequestDeviceStatus, + device: c.WGPUDevice, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestDeviceStatus_Success) { + std.log.err("Device request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.device = device; + std.debug.print("{?}", .{device}); +} + +fn onMapped( + status: c.WGPUMapAsyncStatus, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + const flag: *bool = @ptrCast(@alignCast(userdata1.?)); + flag.* = (status == c.WGPUMapAsyncStatus_Success); +} + +fn sv(s: []const u8) c.WGPUStringView { + return .{ .data = s.ptr, .length = s.len }; +} + +// ── Main ────────────────────────────────────────────────────────────────────── +pub fn main() !void { + + // 1. Instance ────────────────────────────────────────────────────────────── + const instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse + return error.NoInstance; + defer c.wgpuInstanceRelease(instance); + + // 2. Adapter (async → poll) ──────────────────────────────────────────────── + var ctx = Ctx{}; + _ = c.wgpuInstanceRequestAdapter( + instance, + &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, + .{ .callback = onAdapter, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); // drive callbacks + const adapter = ctx.adapter orelse return error.NoAdapter; + defer c.wgpuAdapterRelease(adapter); + + // 3. 
Device ──────────────────────────────────────────────────────────────── + _ = c.wgpuAdapterRequestDevice(adapter, null, .{ .callback = onDevice, .userdata1 = &ctx }); + c.wgpuInstanceProcessEvents(instance); + const device = ctx.device orelse return error.NoDevice; + defer c.wgpuDeviceRelease(device); + + const queue = c.wgpuDeviceGetQueue(device); + defer c.wgpuQueueRelease(queue); + + // 4. Input data ──────────────────────────────────────────────────────────── + // mat_a[i] = i (0 … 15) + // mat_b[i] = 15 − i → every element of mat_c should equal 15 + var mat_a: [N]f32 = undefined; + var mat_b: [N]f32 = undefined; + for (0..N) |i| { + mat_a[i] = @floatFromInt(i); + mat_b[i] = @floatFromInt(N - 1 - i); + } + + // 5. GPU Buffers ─────────────────────────────────────────────────────────── + const buf_a = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, + .size = BUF_BYTES, + }) orelse return error.BufferA; + + const buf_b = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, + .size = BUF_BYTES, + }) orelse return error.BufferB; + + // buf_c: GPU-only result; staging: CPU-readable copy + const buf_c = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopySrc, + .size = BUF_BYTES, + }) orelse return error.BufferC; + + const buf_staging = c.wgpuDeviceCreateBuffer(device, &.{ + .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, + .size = BUF_BYTES, + }) orelse return error.BufferStaging; + + defer c.wgpuBufferRelease(buf_a); + defer c.wgpuBufferRelease(buf_b); + defer c.wgpuBufferRelease(buf_c); + defer c.wgpuBufferRelease(buf_staging); + + // Upload inputs + c.wgpuQueueWriteBuffer(queue, buf_a, 0, &mat_a, BUF_BYTES); + c.wgpuQueueWriteBuffer(queue, buf_b, 0, &mat_b, BUF_BYTES); + + // 6. 
Shader module ───────────────────────────────────────────────────────── + // ✅ New API (0.20+) + var wgsl_src = c.WGPUShaderSourceWGSL{ + .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, + .code = sv(SHADER), + }; + const shader = c.wgpuDeviceCreateShaderModule(device, &.{ + .nextInChain = @ptrCast(&wgsl_src), + }) orelse return error.Shader; + + // 7. Compute pipeline (layout auto-inferred from shader) ─────────────────── + // ✅ + const pipeline = c.wgpuDeviceCreateComputePipeline(device, &.{ + .compute = .{ + .module = shader, + .entryPoint = sv("main"), + }, + }) orelse return error.Pipeline; + defer c.wgpuComputePipelineRelease(pipeline); + + // 8. Bind group ──────────────────────────────────────────────────────────── + const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); + defer c.wgpuBindGroupLayoutRelease(bgl); + + const entries = [_]c.WGPUBindGroupEntry{ + .{ .binding = 0, .buffer = buf_a, .offset = 0, .size = BUF_BYTES }, + .{ .binding = 1, .buffer = buf_b, .offset = 0, .size = BUF_BYTES }, + .{ .binding = 2, .buffer = buf_c, .offset = 0, .size = BUF_BYTES }, + }; + const bind_group = c.wgpuDeviceCreateBindGroup(device, &.{ + .layout = bgl, + .entries = &entries, + .entryCount = entries.len, + }) orelse return error.BindGroup; + defer c.wgpuBindGroupRelease(bind_group); + + // 9. 
Encode compute pass + buffer copy ──────────────────────────────────── + const encoder = c.wgpuDeviceCreateCommandEncoder(device, null) orelse + return error.Encoder; + + const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null); + c.wgpuComputePassEncoderSetPipeline(pass, pipeline); + c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null); + // dispatch(1,1,1): one workgroup of size (4,4) covers the whole 4×4 matrix + c.wgpuComputePassEncoderDispatchWorkgroups(pass, 1, 1, 1); + c.wgpuComputePassEncoderEnd(pass); + c.wgpuComputePassEncoderRelease(pass); + + // Copy result buffer → CPU-readable staging buffer + c.wgpuCommandEncoderCopyBufferToBuffer(encoder, buf_c, 0, buf_staging, 0, BUF_BYTES); + + const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null); + defer c.wgpuCommandEncoderRelease(encoder); + defer c.wgpuCommandBufferRelease(cmdbuf); + + // 10. Submit ─────────────────────────────────────────────────────────────── + c.wgpuQueueSubmit(queue, 1, &cmdbuf); + + // 11. Map staging buffer back to CPU ────────────────────────────────────── + var mapped = false; + _ = c.wgpuBufferMapAsync( + buf_staging, + c.WGPUMapMode_Read, + 0, + BUF_BYTES, + .{ .callback = onMapped, .userdata1 = &mapped }, + ); + // Poll the device until the async map completes + while (!mapped) _ = c.wgpuDevicePoll(device, 1, null); + + const ptr: [*]const f32 = @ptrCast(@alignCast( + c.wgpuBufferGetConstMappedRange(buf_staging, 0, BUF_BYTES), + )); + const result = ptr[0..N]; + + // 12. 
Print ──────────────────────────────────────────────────────────────── + std.debug.print("\nmat_a + mat_b ({d}×{d}):\n", .{ ROWS, COLS }); + for (0..ROWS) |r| { + for (0..COLS) |col| + std.debug.print("{d:6.0}", .{result[r * COLS + col]}); + std.debug.print("\n", .{}); + } + // Expected output: every cell = 15.0 + + c.wgpuBufferUnmap(buf_staging); +} diff --git a/src/wgpu.zig b/src/wgpu.zig deleted file mode 100644 index 52480ce..0000000 --- a/src/wgpu.zig +++ /dev/null @@ -1,137 +0,0 @@ -const std = @import("std"); -const c = @cImport(@cInclude("wgpu.h")); - -/// Replace enum_WGPURequestAdapterStatus -pub const RequestAdapterStatus = enum { - Success, - CallbackCancelled, - Unavailable, - Error, - Force32, -}; - -pub const BufferUsage = enum(u64) { - None = 0, - MapRead = 1, // CPU can read after GPU finishes - MapWrite = 2, - CopySrc = 4, // can copy from GPU to staging. - CopyDst = 8, // CPU can write to it - Index = 16, - Vertex = 32, - Uniform = 64, - Storage = 128, - Indirect = 256, - QueryResolve = 512, -}; - -const Ctx = struct { - adapter: c.WGPUAdapter = null, - device: c.WGPUDevice = null, -}; - -fn onAdapter( - status: c.WGPURequestAdapterStatus, - adapter: c.WGPUAdapter, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestAdapterStatus_Success) { - std.log.err("Adapter request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.adapter = adapter; -} - -fn onDevice( - status: c.WGPURequestDeviceStatus, - device: c.WGPUDevice, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestDeviceStatus_Success) { - std.log.err("Device request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.device = device; - std.debug.print("{?}", .{device}); -} - -fn onMapped( - status: c.WGPUMapAsyncStatus, - _: c.WGPUStringView, - 
userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - const flag: *bool = @ptrCast(@alignCast(userdata1.?)); - flag.* = (status == c.WGPUMapAsyncStatus_Success); -} - -fn sv(s: []const u8) c.WGPUStringView { - return .{ .data = s.ptr, .length = s.len }; -} - -const AllocatorGPU = @This(); - -allocator: std.mem.Allocator, -instance: *c.struct_WGPUInstanceImpl, -adapter: *c.struct_WGPUAdapterImpl, -device: c.struct_WGPUDeviceImpl, -queue: c.struct_WGPUQueueImpl, -ctx: Ctx, - -buffers: std.AutoHashMap(*c.struct_WGPUBufferImpl, void), - -pub fn init(allocator: std.mem.Allocator) !AllocatorGPU { - var self: AllocatorGPU = undefined; - self.allocator = allocator; - self.ctx = .{}; - self.buffers = try .init(allocator); - - // 1. Instance ────────────────────────────────────────────────────────────── - self.instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse - return error.NoInstance; - - // 2. Adapter (async → poll) ──────────────────────────────────────────────── - _ = c.wgpuInstanceRequestAdapter( - self.instance, - &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, - .{ .callback = onAdapter, .userdata1 = &self.ctx }, - ); - c.wgpuInstanceProcessEvents(self.instance); // drive callbacks - self.adapter = self.ctx.adapter orelse return error.NoAdapter; - - // 3. 
Device ──────────────────────────────────────────────────────────────── - _ = c.wgpuAdapterRequestDevice(self.adapter, null, .{ .callback = onDevice, .userdata1 = &self.ctx }); - c.wgpuInstanceProcessEvents(self.instance); - self.device = self.ctx.device orelse return error.NoDevice; - - self.queue = c.wgpuDeviceGetQueue(self.device); - - return self; -} - -pub fn deinit(self: AllocatorGPU) void { - c.wgpuInstanceRelease(self.instance); - defer c.wgpuAdapterRelease(self.adapter); - defer c.wgpuDeviceRelease(self.device); - defer c.wgpuQueueRelease(self.queue); -} - -pub fn addBuff( - self: AllocatorGPU, - comptime T: type, - comptime len: comptime_int, - comptime opt: struct {}, -) !void { - self.buffers.put( - c.wgpuDeviceCreateBuffer(self.device, &.{ - .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst, - .size = len * @bitSizeOf(T), - }) orelse return error.Buffer, - {}, - ); -}