Started to understand how it works and implemented chained operations

adrien 2026-04-28 23:30:21 +02:00
parent 532e8c1897
commit 8050bab969
3 changed files with 417 additions and 0 deletions


@@ -65,6 +65,7 @@ fn onDevice(
}
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
ctx.device = device;
std.debug.print("{?}", .{device});
}
fn onMapped(

src/wgpu.zig Normal file

@@ -0,0 +1,137 @@
const std = @import("std");
const c = @cImport(@cInclude("wgpu.h"));
/// Replaces enum_WGPURequestAdapterStatus.
pub const RequestAdapterStatus = enum {
Success,
CallbackCancelled,
Unavailable,
Error,
Force32,
};
pub const BufferUsage = enum(u64) {
None = 0,
MapRead = 1, // CPU can read after GPU finishes
MapWrite = 2,
CopySrc = 4, // can copy from GPU to staging.
CopyDst = 8, // CPU can write to it
Index = 16,
Vertex = 32,
Uniform = 64,
Storage = 128,
Indirect = 256,
QueryResolve = 512,
};
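// Illustrative helper (not in the original commit): the usage values above are
// bit flags, so they combine with bitwise OR. A CPU-readable staging buffer,
// for example, wants MapRead | CopyDst:
pub fn stagingUsage() u64 {
    return @intFromEnum(BufferUsage.MapRead) | @intFromEnum(BufferUsage.CopyDst);
}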
const Ctx = struct {
adapter: c.WGPUAdapter = null,
device: c.WGPUDevice = null,
};
fn onAdapter(
status: c.WGPURequestAdapterStatus,
adapter: c.WGPUAdapter,
_: c.WGPUStringView,
userdata1: ?*anyopaque,
_: ?*anyopaque,
) callconv(.c) void {
if (status != c.WGPURequestAdapterStatus_Success) {
std.log.err("Adapter request failed (status={d})", .{status});
return;
}
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
ctx.adapter = adapter;
}
fn onDevice(
status: c.WGPURequestDeviceStatus,
device: c.WGPUDevice,
_: c.WGPUStringView,
userdata1: ?*anyopaque,
_: ?*anyopaque,
) callconv(.c) void {
if (status != c.WGPURequestDeviceStatus_Success) {
std.log.err("Device request failed (status={d})", .{status});
return;
}
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
ctx.device = device;
std.debug.print("{?}", .{device});
}
fn onMapped(
status: c.WGPUMapAsyncStatus,
_: c.WGPUStringView,
userdata1: ?*anyopaque,
_: ?*anyopaque,
) callconv(.c) void {
const flag: *bool = @ptrCast(@alignCast(userdata1.?));
flag.* = (status == c.WGPUMapAsyncStatus_Success);
}
fn sv(s: []const u8) c.WGPUStringView {
return .{ .data = s.ptr, .length = s.len };
}
const AllocatorGPU = @This();
allocator: std.mem.Allocator,
instance: *c.struct_WGPUInstanceImpl,
adapter: *c.struct_WGPUAdapterImpl,
device: *c.struct_WGPUDeviceImpl,
queue: *c.struct_WGPUQueueImpl,
ctx: Ctx,
buffers: std.AutoHashMap(*c.struct_WGPUBufferImpl, void),
pub fn init(allocator: std.mem.Allocator) !AllocatorGPU {
var self: AllocatorGPU = undefined;
self.allocator = allocator;
self.ctx = .{};
self.buffers = .init(allocator); // AutoHashMap.init does not allocate or fail
// 1. Instance
self.instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse
return error.NoInstance;
// 2. Adapter (async poll)
_ = c.wgpuInstanceRequestAdapter(
self.instance,
&.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
.{ .callback = onAdapter, .userdata1 = &self.ctx },
);
c.wgpuInstanceProcessEvents(self.instance); // drive callbacks
self.adapter = self.ctx.adapter orelse return error.NoAdapter;
// 3. Device
_ = c.wgpuAdapterRequestDevice(self.adapter, null, .{ .callback = onDevice, .userdata1 = &self.ctx });
c.wgpuInstanceProcessEvents(self.instance);
self.device = self.ctx.device orelse return error.NoDevice;
self.queue = c.wgpuDeviceGetQueue(self.device) orelse return error.NoQueue;
return self;
}
pub fn deinit(self: *AllocatorGPU) void {
var it = self.buffers.keyIterator();
while (it.next()) |buf| c.wgpuBufferRelease(buf.*);
self.buffers.deinit();
// Release in reverse order of creation.
c.wgpuQueueRelease(self.queue);
c.wgpuDeviceRelease(self.device);
c.wgpuAdapterRelease(self.adapter);
c.wgpuInstanceRelease(self.instance);
}
pub fn addBuff(
self: *AllocatorGPU,
comptime T: type,
comptime len: comptime_int,
comptime opt: struct {},
) !void {
_ = opt; // reserved for future options
try self.buffers.put(
c.wgpuDeviceCreateBuffer(self.device, &.{
.usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst,
.size = len * @sizeOf(T), // bytes, not bits
}) orelse return error.Buffer,
{},
);
}
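// Hypothetical smoke test (not in the original commit); requires a working
// wgpu-native device at test time.
test "init device and allocate one storage buffer" {
    var gpu = try AllocatorGPU.init(std.testing.allocator);
    defer gpu.deinit();
    try gpu.addBuff(f32, 1024, .{}); // 1024 * 4 bytes of GPU storage
}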

tmp Normal file

@@ -0,0 +1,279 @@
# Tensor GPU: Memory & Pipeline Strategy
**Best approach:** Lazy graph + ping-pong buffers + single command buffer.
---
## Architecture
**Problem with eager pipelines:**
```
m1.add(m2) → dispatch + sync point (slow)
.mul(5) → dispatch + sync point (slow)
.sub(m3) → dispatch + sync point (slow)
Result: 3× GPU kernel submission overhead. Many intermediate buffers.
```
**Better: Build graph, execute once:**
```
m1.add(m2).mul(5).sub(m3) // build operation list
.compute() // ONE command buffer, all ops
```
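The chain itself is cheap: each call just appends a tag to a list, and nothing touches the GPU until `.compute()`. A minimal sketch of that recording step (simplified from the implementation below, which carries tensor pointers in the tags):

```zig
const std = @import("std");

// Simplified Operation tags; the real union below carries tensor pointers.
const Op = union(enum) { add, mul: f32, sub };

test "chaining records ops without dispatching" {
    var ops = std.ArrayList(Op).init(std.testing.allocator);
    defer ops.deinit();
    try ops.append(.add);
    try ops.append(.{ .mul = 5.0 });
    try ops.append(.sub);
    // Three ops recorded, zero GPU submissions so far.
    try std.testing.expectEqual(@as(usize, 3), ops.items.len);
}
```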
---
## Implementation
```zig
const std = @import("std");
const c = @cImport(@cInclude("wgpu.h"));
pub const Operation = union(enum) {
add: struct { other: *TensorGPU },
mul: struct { scalar: f32 },
sub: struct { other: *TensorGPU },
div: struct { scalar: f32 },
};
pub const TensorGPU = struct {
gpu: *AllocatorGPU,
buffer: c.WGPUBuffer,
shape: [2]u32, // rows, cols
element_count: u32,
buf_bytes: u32,
operations: std.ArrayList(Operation),
is_computed: bool,
allocator: std.mem.Allocator,
pub fn init(gpu: *AllocatorGPU, shape: [2]u32, allocator: std.mem.Allocator) !TensorGPU {
const rows = shape[0];
const cols = shape[1];
const element_count = rows * cols;
const buf_bytes = element_count * @sizeOf(f32);
const buffer = c.wgpuDeviceCreateBuffer(gpu.device, &.{
.usage = c.WGPUBufferUsage_Storage
| c.WGPUBufferUsage_CopySrc
| c.WGPUBufferUsage_CopyDst,
.size = buf_bytes,
}) orelse return error.BufferCreate;
var self: TensorGPU = .{
.gpu = gpu,
.buffer = buffer,
.shape = shape,
.element_count = element_count,
.buf_bytes = buf_bytes,
.operations = try std.ArrayList(Operation).initCapacity(allocator, 8),
.is_computed = true,
.allocator = allocator,
};
return self;
}
pub fn deinit(self: *TensorGPU) void {
c.wgpuBufferRelease(self.buffer);
self.operations.deinit();
}
pub fn add(self: *TensorGPU, other: *TensorGPU) *TensorGPU {
self.operations.append(.{ .add = .{ .other = other } }) catch unreachable;
self.is_computed = false;
return self;
}
pub fn mul(self: *TensorGPU, scalar: f32) *TensorGPU {
self.operations.append(.{ .mul = .{ .scalar = scalar } }) catch unreachable;
self.is_computed = false;
return self;
}
pub fn sub(self: *TensorGPU, other: *TensorGPU) *TensorGPU {
self.operations.append(.{ .sub = .{ .other = other } }) catch unreachable;
self.is_computed = false;
return self;
}
pub fn compute(self: *TensorGPU) !void {
if (self.is_computed or self.operations.items.len == 0) return;
// Allocate ping-pong temp buffer (freed after compute)
const buf_temp = c.wgpuDeviceCreateBuffer(self.gpu.device, &.{
.usage = c.WGPUBufferUsage_Storage
| c.WGPUBufferUsage_CopySrc
| c.WGPUBufferUsage_CopyDst,
.size = self.buf_bytes,
}) orelse return error.TempBuffer;
defer c.wgpuBufferRelease(buf_temp);
// Build ONE command encoder for all operations
const encoder = c.wgpuDeviceCreateCommandEncoder(self.gpu.device, null)
orelse return error.Encoder;
defer c.wgpuCommandEncoderRelease(encoder);
var buf_read = self.buffer; // input
var buf_write = buf_temp; // output (swap after each op)
for (self.operations.items) |op| {
try self.encodeOp(encoder, op, buf_read, buf_write);
// Swap: output becomes input for next op
const tmp = buf_read;
buf_read = buf_write;
buf_write = tmp;
}
// Final result in buf_read; copy back to self.buffer if needed
if (buf_read != self.buffer) {
c.wgpuCommandEncoderCopyBufferToBuffer(
encoder, buf_read, 0, self.buffer, 0, self.buf_bytes,
);
}
const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null)
orelse return error.CommandBuffer;
defer c.wgpuCommandBufferRelease(cmdbuf);
c.wgpuQueueSubmit(self.gpu.queue, 1, &cmdbuf);
self.operations.clearAndFree();
self.is_computed = true;
}
fn encodeOp(
self: TensorGPU,
encoder: c.WGPUCommandEncoder,
op: Operation,
buf_in: c.WGPUBuffer,
buf_out: c.WGPUBuffer,
) !void {
const shader_code = switch (op) {
.add => SHADER_ADD,
.mul => SHADER_MUL,
.sub => SHADER_SUB,
.div => SHADER_DIV,
};
var wgsl_src = c.WGPUShaderSourceWGSL{
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
.code = sv(shader_code),
};
const shader = c.wgpuDeviceCreateShaderModule(self.gpu.device, &.{
.nextInChain = @ptrCast(&wgsl_src),
}) orelse return error.Shader;
defer c.wgpuShaderModuleRelease(shader);
const pipeline = c.wgpuDeviceCreateComputePipeline(self.gpu.device, &.{
.compute = .{ .module = shader, .entryPoint = sv("main") },
}) orelse return error.Pipeline;
defer c.wgpuComputePipelineRelease(pipeline);
// Bind groups depend on operation (binary vs unary)
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
defer c.wgpuBindGroupLayoutRelease(bgl);
var entries: [3]c.WGPUBindGroupEntry = undefined;
var entry_count: u32 = 2;
entries[0] = .{ .binding = 0, .buffer = buf_in, .size = self.buf_bytes };
entries[1] = .{ .binding = 1, .buffer = buf_out, .size = self.buf_bytes };
if (op == .add or op == .sub) {
const other = switch (op) {
.add => |a| a.other,
.sub => |s| s.other,
else => unreachable,
};
entries[2] = .{ .binding = 2, .buffer = other.buffer, .size = self.buf_bytes };
entry_count = 3;
}
const bind_group = c.wgpuDeviceCreateBindGroup(self.gpu.device, &.{
.layout = bgl,
.entryCount = entry_count,
.entries = &entries, // C pointer; a Zig slice does not coerce here
}) orelse return error.BindGroup;
defer c.wgpuBindGroupRelease(bind_group);
const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null);
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null);
const workgroups_x = (self.shape[1] + 3) / 4; // ceil(cols / 4)
const workgroups_y = (self.shape[0] + 3) / 4; // ceil(rows / 4)
c.wgpuComputePassEncoderDispatchWorkgroups(pass, workgroups_x, workgroups_y, 1);
c.wgpuComputePassEncoderEnd(pass);
c.wgpuComputePassEncoderRelease(pass);
}
};
// ── Shaders ──────────────────────────────────────────────────────────────────
const SHADER_ADD =
\\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
\\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
\\@group(0) @binding(2) var<storage, read> mat_b : array<f32>;
\\@compute @workgroup_size(4, 4)
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
\\ // Row stride is hardcoded to 4 columns; fine for the 4x4 demo matrices.
\\ let idx = gid.y * 4u + gid.x;
\\ mat_c[idx] = mat_a[idx] + mat_b[idx];
\\}
;
const SHADER_MUL =
\\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
\\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
\\@compute @workgroup_size(4, 4)
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
\\ let idx = gid.y * 4u + gid.x;
\\ mat_c[idx] = mat_a[idx] * 5.0; // scalar hardcoded for the demo
\\}
;
// ... SUB, DIV similar
```
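The elided `SUB` and `DIV` shaders follow the same template as `ADD` and `MUL`; a sketch, keeping the same hardcoded stride and demo scalar:

```zig
const SHADER_SUB =
    \\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
    \\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
    \\@group(0) @binding(2) var<storage, read> mat_b : array<f32>;
    \\@compute @workgroup_size(4, 4)
    \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
    \\  let idx = gid.y * 4u + gid.x;
    \\  mat_c[idx] = mat_a[idx] - mat_b[idx];
    \\}
;

const SHADER_DIV =
    \\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
    \\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
    \\@compute @workgroup_size(4, 4)
    \\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
    \\  let idx = gid.y * 4u + gid.x;
    \\  mat_c[idx] = mat_a[idx] / 5.0; // scalar hardcoded for the demo, like MUL
    \\}
;
```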
---
## Usage
```zig
var gpu_alloc = try AllocatorGPU.init(allocator);
defer gpu_alloc.deinit();
var m1 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator);
var m2 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator);
defer m1.deinit();
defer m2.deinit();
// Chain: lazy, no GPU work yet
try m1.add(m2).mul(5).sub(m2).compute(); // ← NOW executes all at once
// m1.buffer contains the final result. Note: using m1 as an operand of its own
// chain would bind the same buffer as both read and read_write, so operands
// should be other tensors.
```
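Reading the result back to the CPU is not covered above. A sketch of the usual copy-to-staging-then-map pattern, reusing `onMapped` and the polling style from `src/wgpu.zig` (`readback` is a hypothetical helper, not part of the commit):

```zig
// Sketch: copy a tensor's buffer into a MapRead staging buffer, map it, and
// copy the floats out. Naive polling; a real version would also handle map
// failure and timeouts.
fn readback(gpu_alloc: *AllocatorGPU, t: *TensorGPU, out: []f32) !void {
    const staging = c.wgpuDeviceCreateBuffer(gpu_alloc.device, &.{
        .usage = c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
        .size = t.buf_bytes,
    }) orelse return error.Staging;
    defer c.wgpuBufferRelease(staging);

    const enc = c.wgpuDeviceCreateCommandEncoder(gpu_alloc.device, null) orelse return error.Encoder;
    defer c.wgpuCommandEncoderRelease(enc);
    c.wgpuCommandEncoderCopyBufferToBuffer(enc, t.buffer, 0, staging, 0, t.buf_bytes);
    const cb = c.wgpuCommandEncoderFinish(enc, null) orelse return error.CommandBuffer;
    defer c.wgpuCommandBufferRelease(cb);
    c.wgpuQueueSubmit(gpu_alloc.queue, 1, &cb);

    var mapped = false;
    _ = c.wgpuBufferMapAsync(staging, c.WGPUMapMode_Read, 0, t.buf_bytes, .{
        .callback = onMapped,
        .userdata1 = &mapped,
    });
    while (!mapped) c.wgpuInstanceProcessEvents(gpu_alloc.instance); // drive callbacks

    const raw = c.wgpuBufferGetConstMappedRange(staging, 0, t.buf_bytes) orelse return error.Map;
    const data: [*]const f32 = @ptrCast(@alignCast(raw));
    @memcpy(out[0..t.element_count], data[0..t.element_count]);
    c.wgpuBufferUnmap(staging);
}

// Usage: var result: [16]f32 = undefined; try readback(&gpu_alloc, m1, &result);
```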
---
## Memory Breakdown
| Buffer | Lifetime | Size | Notes |
|--------|----------|------|-------|
| `m1.buffer` | Persistent (user owns) | N×4 bytes | Input + final output |
| `m2.buffer` | Persistent (user owns) | N×4 bytes | Input (read-only) |
| `buf_temp` (ping-pong) | compute() scope | N×4 bytes | Allocated/freed per compute() |
**Max GPU RAM for a 3-op chain:** 2×buffer + 1×temp = 3× the data size, not 4×. For the 4×4 f32 tensors above that is a peak of 3 × 64 B, and the peak stays flat as the chain grows.
---
## Key Points
- **One command buffer:** all ops fused, single GPU submit
- **Ping-pong:** swap buf_read ↔ buf_write after each op (no extra allocs)
- **Lazy:** operations queued until `.compute()` called
- **No intermediate tensors:** user doesn't allocate intermediate results
- **Per-compute cleanup:** temp buffer freed immediately after execution
A 100-op chain now runs with the same 3-buffer peak.