From 8050bab9698dd27b95d6c03fda678c3cfb520722 Mon Sep 17 00:00:00 2001
From: adrien
Date: Tue, 28 Apr 2026 23:30:21 +0200
Subject: [PATCH] Started to understand how it works and implemented chained operations

---
 src/main.zig |   1 +
 src/wgpu.zig | 137 ++++++++++++++++++++++
 tmp          | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 417 insertions(+)
 create mode 100644 src/wgpu.zig
 create mode 100644 tmp

diff --git a/src/main.zig b/src/main.zig
index 4392f24..6c80dfa 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -65,6 +65,7 @@ fn onDevice(
     }
     const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
     ctx.device = device;
+    std.debug.print("{?}", .{device});
 }
 
 fn onMapped(
diff --git a/src/wgpu.zig b/src/wgpu.zig
new file mode 100644
index 0000000..52480ce
--- /dev/null
+++ b/src/wgpu.zig
@@ -0,0 +1,137 @@
+const std = @import("std");
+const c = @cImport(@cInclude("wgpu.h"));
+
+/// Replaces c.enum_WGPURequestAdapterStatus.
+pub const RequestAdapterStatus = enum {
+    Success,
+    CallbackCancelled,
+    Unavailable,
+    Error,
+    Force32,
+};
+
+/// Mirrors the WGPUBufferUsage_* bit-flag constants.
+pub const BufferUsage = enum(u64) {
+    None = 0,
+    MapRead = 1, // CPU can read after GPU finishes
+    MapWrite = 2,
+    CopySrc = 4, // can copy from GPU to staging.
+    CopyDst = 8, // CPU can write to it
+    Index = 16,
+    Vertex = 32,
+    Uniform = 64,
+    Storage = 128,
+    Indirect = 256,
+    QueryResolve = 512,
+};
+
+const Ctx = struct {
+    adapter: c.WGPUAdapter = null,
+    device: c.WGPUDevice = null,
+};
+
+fn onAdapter(
+    status: c.WGPURequestAdapterStatus,
+    adapter: c.WGPUAdapter,
+    _: c.WGPUStringView,
+    userdata1: ?*anyopaque,
+    _: ?*anyopaque,
+) callconv(.c) void {
+    if (status != c.WGPURequestAdapterStatus_Success) {
+        std.log.err("Adapter request failed (status={d})", .{status});
+        return;
+    }
+    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
+    ctx.adapter = adapter;
+}
+
+fn onDevice(
+    status: c.WGPURequestDeviceStatus,
+    device: c.WGPUDevice,
+    _: c.WGPUStringView,
+    userdata1: ?*anyopaque,
+    _: ?*anyopaque,
+) callconv(.c) void {
+    if (status != c.WGPURequestDeviceStatus_Success) {
+        std.log.err("Device request failed (status={d})", .{status});
+        return;
+    }
+    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
+    ctx.device = device;
+    std.debug.print("{?}", .{device});
+}
+
+fn onMapped(
+    status: c.WGPUMapAsyncStatus,
+    _: c.WGPUStringView,
+    userdata1: ?*anyopaque,
+    _: ?*anyopaque,
+) callconv(.c) void {
+    const flag: *bool = @ptrCast(@alignCast(userdata1.?));
+    flag.* = (status == c.WGPUMapAsyncStatus_Success);
+}
+
+fn sv(s: []const u8) c.WGPUStringView {
+    return .{ .data = s.ptr, .length = s.len };
+}
+
+const AllocatorGPU = @This();
+
+allocator: std.mem.Allocator,
+instance: *c.struct_WGPUInstanceImpl,
+adapter: *c.struct_WGPUAdapterImpl,
+device: *c.struct_WGPUDeviceImpl,
+queue: *c.struct_WGPUQueueImpl,
+ctx: Ctx,
+
+buffers: std.AutoHashMap(*c.struct_WGPUBufferImpl, void),
+
+pub fn init(allocator: std.mem.Allocator) !AllocatorGPU {
+    var self: AllocatorGPU = undefined;
+    self.allocator = allocator;
+    self.ctx = .{};
+    self.buffers = .init(allocator); // AutoHashMap.init cannot fail, so no `try`
+
+    // 1. Instance ──────────────────────────────────────────────────────────────
+    self.instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse
+        return error.NoInstance;
+
+    // 2. Adapter (async → poll) ────────────────────────────────────────────────
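+    // The request is asynchronous: onAdapter stores the result in self.ctx once
+    // wgpu invokes it; wgpuInstanceProcessEvents below pumps pending callbacks
+    // so ctx.adapter is filled in before we read it.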
+    _ = c.wgpuInstanceRequestAdapter(
+        self.instance,
+        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
+        .{ .callback = onAdapter, .userdata1 = &self.ctx },
+    );
+    c.wgpuInstanceProcessEvents(self.instance); // drive callbacks
+    self.adapter = self.ctx.adapter orelse return error.NoAdapter;
+
+    // 3. Device ────────────────────────────────────────────────────────────────
+    _ = c.wgpuAdapterRequestDevice(self.adapter, null, .{ .callback = onDevice, .userdata1 = &self.ctx });
+    c.wgpuInstanceProcessEvents(self.instance);
+    self.device = self.ctx.device orelse return error.NoDevice;
+
+    self.queue = c.wgpuDeviceGetQueue(self.device) orelse return error.NoQueue;
+
+    return self;
+}
+
+pub fn deinit(self: *AllocatorGPU) void {
+    // Release in reverse order of creation; tracked buffers go first.
+    var it = self.buffers.keyIterator();
+    while (it.next()) |buf| c.wgpuBufferRelease(buf.*);
+    self.buffers.deinit();
+    c.wgpuQueueRelease(self.queue);
+    c.wgpuDeviceRelease(self.device);
+    c.wgpuAdapterRelease(self.adapter);
+    c.wgpuInstanceRelease(self.instance);
+}
+
+pub fn addBuff(
+    self: *AllocatorGPU,
+    comptime T: type,
+    comptime len: comptime_int,
+    comptime opt: struct {},
+) !void {
+    try self.buffers.put(
+        c.wgpuDeviceCreateBuffer(self.device, &.{
+            .usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst,
+            .size = len * @sizeOf(T), // size is in bytes, not bits
+        }) orelse return error.Buffer,
+        {},
+    );
+}
diff --git a/tmp b/tmp
new file mode 100644
index 0000000..6597096
--- /dev/null
+++ b/tmp
@@ -0,0 +1,279 @@
+# Tensor GPU: Memory & Pipeline Strategy
+
+**Best approach:** Lazy graph + ping-pong buffers + single command buffer.
+
+---
+
+## Architecture
+
+**Problem with eager pipelines:**
+```
+m1.add(m2) → dispatch + sync point (slow)
+.mul(5)    → dispatch + sync point (slow)
+.sub(m3)   → dispatch + sync point (slow)
+Result: 3× GPU kernel submission overhead. Many intermediate buffers.
+```
+
+**Better: build the graph, execute once:**
+```
+m1.add(m2).mul(5).sub(m3) // build operation list
+    .compute()            // ONE command buffer, all ops
+```
+
+---
+
+## Implementation
+
+```zig
+const std = @import("std");
+const c = @cImport(@cInclude("wgpu.h"));
+
+pub const Operation = union(enum) {
+    add: struct { other: *TensorGPU },
+    mul: struct { scalar: f32 },
+    sub: struct { other: *TensorGPU },
+    div: struct { scalar: f32 },
+};
+
+pub const TensorGPU = struct {
+    gpu: *AllocatorGPU,
+    buffer: c.WGPUBuffer,
+    shape: [2]u32, // rows, cols
+    element_count: u32,
+    buf_bytes: u32,
+
+    operations: std.ArrayList(Operation),
+    is_computed: bool,
+    allocator: std.mem.Allocator,
+
+    pub fn init(gpu: *AllocatorGPU, shape: [2]u32, allocator: std.mem.Allocator) !TensorGPU {
+        const rows = shape[0];
+        const cols = shape[1];
+        const element_count = rows * cols;
+        const buf_bytes = element_count * @sizeOf(f32);
+
+        const buffer = c.wgpuDeviceCreateBuffer(gpu.device, &.{
+            .usage = c.WGPUBufferUsage_Storage
+                | c.WGPUBufferUsage_CopySrc
+                | c.WGPUBufferUsage_CopyDst,
+            .size = buf_bytes,
+        }) orelse return error.BufferCreate;
+
+        const self: TensorGPU = .{
+            .gpu = gpu,
+            .buffer = buffer,
+            .shape = shape,
+            .element_count = element_count,
+            .buf_bytes = buf_bytes,
+            .operations = try std.ArrayList(Operation).initCapacity(allocator, 8),
+            .is_computed = true,
+            .allocator = allocator,
+        };
+
+        return self;
+    }
+
+    pub fn deinit(self: *TensorGPU) void {
+        c.wgpuBufferRelease(self.buffer);
+        self.operations.deinit();
+    }
+
+    pub fn add(self: *TensorGPU, other: *TensorGPU) *TensorGPU {
+        self.operations.append(.{ .add = .{ .other = other } }) catch unreachable;
+        self.is_computed = false;
+        return self;
+    }
+
+    pub fn mul(self: *TensorGPU, scalar: f32) *TensorGPU {
+        self.operations.append(.{ .mul = .{ .scalar = scalar } }) catch unreachable;
+        self.is_computed = false;
+        return self;
+    }
+
+    pub fn sub(self: *TensorGPU, other: *TensorGPU) *TensorGPU {
+        self.operations.append(.{ .sub = .{ .other = other } }) catch unreachable;
+        self.is_computed = false;
+        return self;
+    }
+
+    pub fn compute(self: *TensorGPU) !void {
+        if (self.is_computed or self.operations.items.len == 0) return;
+
+        // Allocate ping-pong temp buffer (freed after compute)
+        const buf_temp = c.wgpuDeviceCreateBuffer(self.gpu.device, &.{
+            .usage = c.WGPUBufferUsage_Storage
+                | c.WGPUBufferUsage_CopySrc
+                | c.WGPUBufferUsage_CopyDst,
+            .size = self.buf_bytes,
+        }) orelse return error.TempBuffer;
+        defer c.wgpuBufferRelease(buf_temp);
+
+        // Build ONE command encoder for all operations
+        const encoder = c.wgpuDeviceCreateCommandEncoder(self.gpu.device, null)
+            orelse return error.Encoder;
+        defer c.wgpuCommandEncoderRelease(encoder);
+
+        var buf_read = self.buffer; // input
+        var buf_write = buf_temp;   // output (swap after each op)
+
+        for (self.operations.items) |op| {
+            try self.encodeOp(encoder, op, buf_read, buf_write);
+
+            // Swap: output becomes input for next op
+            const tmp = buf_read;
+            buf_read = buf_write;
+            buf_write = tmp;
+        }
+
+        // Final result in buf_read; copy back to self.buffer if needed
+        if (buf_read != self.buffer) {
+            c.wgpuCommandEncoderCopyBufferToBuffer(
+                encoder, buf_read, 0, self.buffer, 0, self.buf_bytes,
+            );
+        }
+
+        const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null)
+            orelse return error.CommandBuffer;
+        defer c.wgpuCommandBufferRelease(cmdbuf);
+
+        c.wgpuQueueSubmit(self.gpu.queue, 1, &cmdbuf);
+
+        self.operations.clearAndFree();
+        self.is_computed = true;
+    }
+
+    fn encodeOp(
+        self: TensorGPU,
+        encoder: c.WGPUCommandEncoder,
+        op: Operation,
+        buf_in: c.WGPUBuffer,
+        buf_out: c.WGPUBuffer,
+    ) !void {
+        const shader_code = switch (op) {
+            .add => SHADER_ADD,
+            .mul => SHADER_MUL,
+            .sub => SHADER_SUB,
+            .div => SHADER_DIV,
+        };
+
+        var wgsl_src = c.WGPUShaderSourceWGSL{
+            .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
+            .code = sv(shader_code),
+        };
+
+        const shader = c.wgpuDeviceCreateShaderModule(self.gpu.device, &.{
+            .nextInChain = @ptrCast(&wgsl_src),
+        }) orelse return error.Shader;
+        defer c.wgpuShaderModuleRelease(shader);
+
+        const pipeline = c.wgpuDeviceCreateComputePipeline(self.gpu.device, &.{
+            .compute = .{ .module = shader, .entryPoint = sv("main") },
+        }) orelse return error.Pipeline;
+        defer c.wgpuComputePipelineRelease(pipeline);
+
+        // Bind groups depend on operation (binary vs unary)
+        const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
+        defer c.wgpuBindGroupLayoutRelease(bgl);
+
+        var entries: [3]c.WGPUBindGroupEntry = undefined;
+        var entry_count: u32 = 2;
+
+        entries[0] = .{ .binding = 0, .buffer = buf_in, .size = self.buf_bytes };
+        entries[1] = .{ .binding = 1, .buffer = buf_out, .size = self.buf_bytes };
+
+        if (op == .add or op == .sub) {
+            const other = switch (op) {
+                .add => |payload| payload.other,
+                .sub => |payload| payload.other,
+                else => unreachable,
+            };
+            entries[2] = .{
+                .binding = 2,
+                .buffer = other.buffer,
+                .size = self.buf_bytes,
+            };
+            entry_count = 3;
+        }
+
+        const bind_group = c.wgpuDeviceCreateBindGroup(self.gpu.device, &.{
+            .layout = bgl,
+            .entries = &entries, // C-style pointer + count
+            .entryCount = entry_count,
+        }) orelse return error.BindGroup;
+        defer c.wgpuBindGroupRelease(bind_group);
+
+        const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null);
+        c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
+        c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null);
+
+        const workgroups_x = (self.shape[1] + 3) / 4;
+        const workgroups_y = (self.shape[0] + 3) / 4;
+        c.wgpuComputePassEncoderDispatchWorkgroups(pass, workgroups_x, workgroups_y, 1);
+
+        c.wgpuComputePassEncoderEnd(pass);
+        c.wgpuComputePassEncoderRelease(pass);
+    }
+};
+
+// ── Shaders ──────────────────────────────────────────────────────────────────
+
+const SHADER_ADD =
+    \\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
+    \\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
+    \\@group(0) @binding(2) var<storage, read> mat_b : array<f32>;
+    \\@compute @workgroup_size(4, 4)
+    \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
+    \\    let idx = gid.y * 4u + gid.x; // row stride hardcoded for the 4×4 demo
+    \\    mat_c[idx] = mat_a[idx] + mat_b[idx];
+    \\}
+;
+
+const SHADER_MUL =
+    \\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
+    \\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
+    \\@compute @workgroup_size(4, 4)
+    \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
+    \\    let idx = gid.y * 4u + gid.x;
+    \\    mat_c[idx] = mat_a[idx] * 5.0; // scalar hardcoded for demo
+    \\}
+;
+
+// ... SUB, DIV similar
+```
+
+---
+
+## Usage
+
+```zig
+var gpu_alloc = try AllocatorGPU.init(allocator);
+defer gpu_alloc.deinit();
+
+var m1 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator);
+var m2 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator);
+defer m1.deinit();
+defer m2.deinit();
+
+// Chain: lazy, no GPU work yet
+try m1.add(&m2).mul(5).sub(&m1).compute(); // ← NOW executes all at once
+
+// m1.buffer contains final result
+```
+
+---
+
+## Memory Breakdown
+
+| Buffer | Lifetime | Size | Notes |
+|--------|----------|------|-------|
+| `m1.buffer` | Persistent (user owns) | N×4 bytes | Input + final output |
+| `m2.buffer` | Persistent (user owns) | N×4 bytes | Input (read-only) |
+| `buf_temp` (ping-pong) | compute() scope | N×4 bytes | Allocated/freed per compute() |
+
+**Max GPU RAM for a 3-op chain:** 2×buffer + 1×temp = 3× the data size, not 4×.
+
+---
+
+## Key Points
+
+- **One command buffer:** all ops fused, single GPU submit
+- **Ping-pong:** swap buf_read ↔ buf_write after each op (no extra allocs)
+- **Lazy:** operations queued until `.compute()` called
+- **No intermediate tensors:** user doesn't allocate intermediate results
+- **Per-compute cleanup:** temp buffer freed immediately after execution
+
+You can now chain 100 ops with the same three-buffer peak.
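+
+---
+
+## Reading the result back (sketch)
+
+Not implemented in this patch: after `compute()` the result still lives in
+`m1.buffer` on the GPU. Below is a minimal readback sketch, assuming the
+structs above, reusing the existing `onMapped` callback and the same
+"process events until the callback fires" polling style as `AllocatorGPU.init`.
+The helper name `readBack` and its error names are made up for illustration.
+
+```zig
+fn readBack(tensor: *TensorGPU, out: []f32) !void {
+    std.debug.assert(out.len == tensor.element_count);
+
+    // Staging buffer the CPU may map (tensor buffers already carry CopySrc).
+    const staging = c.wgpuDeviceCreateBuffer(tensor.gpu.device, &.{
+        .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
+        .size = tensor.buf_bytes,
+    }) orelse return error.StagingBuffer;
+    defer c.wgpuBufferRelease(staging);
+
+    // GPU → staging copy, submitted as its own tiny command buffer.
+    const encoder = c.wgpuDeviceCreateCommandEncoder(tensor.gpu.device, null) orelse return error.Encoder;
+    defer c.wgpuCommandEncoderRelease(encoder);
+    c.wgpuCommandEncoderCopyBufferToBuffer(encoder, tensor.buffer, 0, staging, 0, tensor.buf_bytes);
+    const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null) orelse return error.CommandBuffer;
+    defer c.wgpuCommandBufferRelease(cmdbuf);
+    c.wgpuQueueSubmit(tensor.gpu.queue, 1, &cmdbuf);
+
+    // Map the staging buffer and poll until onMapped flips the flag.
+    // (A real version would also bail out when the map fails.)
+    var mapped = false;
+    _ = c.wgpuBufferMapAsync(staging, c.WGPUMapMode_Read, 0, tensor.buf_bytes, .{
+        .callback = onMapped,
+        .userdata1 = &mapped,
+    });
+    while (!mapped) c.wgpuInstanceProcessEvents(tensor.gpu.instance);
+
+    // Copy the mapped bytes out to the caller, then unmap.
+    const ptr = c.wgpuBufferGetConstMappedRange(staging, 0, tensor.buf_bytes) orelse return error.Map;
+    const src: [*]const f32 = @ptrCast(@alignCast(ptr));
+    @memcpy(out, src[0..tensor.element_count]);
+    c.wgpuBufferUnmap(staging);
+}
+```
+
+This keeps the lazy-graph design unchanged: readback stays a separate, explicit
+step, so a chain still submits exactly one command buffer.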