From 67f35eed345743107e208676be780263f241eb13 Mon Sep 17 00:00:00 2001 From: adrien Date: Fri, 15 May 2026 10:07:23 +0200 Subject: [PATCH] Removed tmp file and added gitattributes --- .gitattributes | 1 + tmp | 279 ------------------------------------------------- 2 files changed, 1 insertion(+), 279 deletions(-) create mode 100644 .gitattributes delete mode 100644 tmp diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..1fdf9c5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +libs/** linguist-vendored diff --git a/tmp b/tmp deleted file mode 100644 index 6597096..0000000 --- a/tmp +++ /dev/null @@ -1,279 +0,0 @@ -# Tensor GPU: Memory & Pipeline Strategy - -**Best approach:** Lazy graph + ping-pong buffers + single command buffer. - ---- - -## Architecture - -**Problem with eager pipelines:** -``` -m1.add(m2) → dispatch + sync point (slow) -.mul(5) → dispatch + sync point (slow) -.sub(m3) → dispatch + sync point (slow) -Result: 3× GPU kernel submission overhead. Many intermediate buffers. -``` - -**Better: Build graph, execute once:** -``` -m1.add(m2).mul(5).sub(m3) // build operation list - .compute() // ONE command buffer, all ops -``` - ---- - -## Implementation - -```zig -const std = @import("std"); -const c = @cImport(@cInclude("wgpu.h")); - -pub const Operation = union(enum) { - add: struct { other: *TensorGPU }, - mul: struct { scalar: f32 }, - sub: struct { other: *TensorGPU }, - div: struct { scalar: f32 }, -}; - -pub const TensorGPU = struct { - gpu: *AllocatorGPU, - buffer: c.WGPUBuffer, - shape: [2]u32, // rows, cols - element_count: u32, - buf_bytes: u32, - - operations: std.ArrayList(Operation), - is_computed: bool, - allocator: std.mem.Allocator, - - pub fn init(gpu: *AllocatorGPU, shape: [2]u32, allocator: std.mem.Allocator) !TensorGPU { - const rows = shape[0]; - const cols = shape[1]; - const element_count = rows * cols; - const buf_bytes = element_count * @sizeOf(f32); - - const buffer = c.wgpuDeviceCreateBuffer(gpu.device, &.{ - .usage = c.WGPUBufferUsage_Storage - | c.WGPUBufferUsage_CopySrc - | c.WGPUBufferUsage_CopyDst, - .size = buf_bytes, - }) orelse return error.BufferCreate; - - var self: TensorGPU = .{ - .gpu = gpu, - .buffer = buffer, - .shape = shape, - .element_count = element_count, - .buf_bytes = buf_bytes, - .operations = try std.ArrayList(Operation).initCapacity(allocator, 8), - .is_computed = true, - .allocator = allocator, - }; - - return self; - } - - pub fn deinit(self: *TensorGPU) void { - c.wgpuBufferRelease(self.buffer); - self.operations.deinit(); - } - - pub fn add(self: *TensorGPU, other: *TensorGPU) *TensorGPU { - self.operations.append(.{ .add = .{ .other = other } }) catch unreachable; - self.is_computed = false; - return self; - } - - pub fn mul(self: *TensorGPU, scalar: f32) *TensorGPU { - self.operations.append(.{ .mul = .{ .scalar = scalar } }) catch unreachable; - self.is_computed = false; - return self; - } - - pub fn sub(self: *TensorGPU, other: *TensorGPU) *TensorGPU { - self.operations.append(.{ .sub = .{ .other = other } }) catch unreachable; - self.is_computed = false; - return self; - } - - pub fn compute(self: *TensorGPU) !void { - if (self.is_computed or self.operations.items.len == 0) return; - - // Allocate ping-pong temp buffer (freed after compute) - const buf_temp = c.wgpuDeviceCreateBuffer(self.gpu.device, &.{ - .usage = c.WGPUBufferUsage_Storage - | c.WGPUBufferUsage_CopySrc - | c.WGPUBufferUsage_CopyDst, - .size = self.buf_bytes, - }) orelse return error.TempBuffer; - defer c.wgpuBufferRelease(buf_temp); - - // Build ONE command encoder for all operations - const encoder = c.wgpuDeviceCreateCommandEncoder(self.gpu.device, null) - orelse return error.Encoder; - defer c.wgpuCommandEncoderRelease(encoder); - - var buf_read = self.buffer; // input - var buf_write = buf_temp; // output (swap after each op) - - for (self.operations.items) |op| { - try self.encodeOp(encoder, op, buf_read, buf_write); - - // Swap: output becomes input for next op - const tmp = buf_read; - buf_read = buf_write; - buf_write = tmp; - } - - // Final result in buf_read; copy back to self.buffer if needed - if (buf_read != self.buffer) { - c.wgpuCommandEncoderCopyBufferToBuffer( - encoder, buf_read, 0, self.buffer, 0, self.buf_bytes, - ); - } - - const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null) - orelse return error.CommandBuffer; - defer c.wgpuCommandBufferRelease(cmdbuf); - - c.wgpuQueueSubmit(self.gpu.queue, 1, &cmdbuf); - - self.operations.clearAndFree(); - self.is_computed = true; - } - - fn encodeOp( - self: TensorGPU, - encoder: c.WGPUCommandEncoder, - op: Operation, - buf_in: c.WGPUBuffer, - buf_out: c.WGPUBuffer, - ) !void { - const shader_code = switch (op) { - .add => SHADER_ADD, - .mul => SHADER_MUL, - .sub => SHADER_SUB, - .div => SHADER_DIV, - }; - - var wgsl_src = c.WGPUShaderSourceWGSL{ - .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, - .code = sv(shader_code), - }; - - const shader = c.wgpuDeviceCreateShaderModule(self.gpu.device, &.{ - .nextInChain = @ptrCast(&wgsl_src), - }) orelse return error.Shader; - defer c.wgpuShaderModuleRelease(shader); - - const pipeline = c.wgpuDeviceCreateComputePipeline(self.gpu.device, &.{ - .compute = .{ .module = shader, .entryPoint = sv("main") }, - }) orelse return error.Pipeline; - defer c.wgpuComputePipelineRelease(pipeline); - - // Bind groups depend on operation (binary vs unary) - const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); - defer c.wgpuBindGroupLayoutRelease(bgl); - - var entries: [3]c.WGPUBindGroupEntry = undefined; - var entry_count: u32 = 2; - - entries[0] = .{ .binding = 0, .buffer = buf_in, .size = self.buf_bytes }; - entries[1] = .{ .binding = 1, .buffer = buf_out, .size = self.buf_bytes }; - - if (op == .add or op == .sub) { - entries[2] = .{ - .binding = 2, - .buffer = op.add.other.buffer, // or op.sub.other - .size = self.buf_bytes, - }; - entry_count = 3; - } - - const bind_group = c.wgpuDeviceCreateBindGroup(self.gpu.device, &.{ - .layout = bgl, - .entries = entries[0..entry_count], - .entryCount = entry_count, - }) orelse return error.BindGroup; - defer c.wgpuBindGroupRelease(bind_group); - - const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null); - c.wgpuComputePassEncoderSetPipeline(pass, pipeline); - c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null); - - const workgroups_x = (self.shape[1] + 3) / 4; - const workgroups_y = (self.shape[0] + 3) / 4; - c.wgpuComputePassEncoderDispatchWorkgroups(pass, workgroups_x, workgroups_y, 1); - - c.wgpuComputePassEncoderEnd(pass); - c.wgpuComputePassEncoderRelease(pass); - } -}; - -// ── Shaders ────────────────────────────────────────────────────────────────── - -const SHADER_ADD = - \\@group(0) @binding(0) var mat_a : array; - \\@group(0) @binding(1) var mat_c : array; - \\@group(0) @binding(2) var mat_b : array; - \\@compute @workgroup_size(4, 4) - \\fn main(@builtin(global_invocation_id) gid : vec3) { - \\ let idx = gid.y * 4u + gid.x; - \\ mat_c[idx] = mat_a[idx] + mat_b[idx]; - \\} -; - -const SHADER_MUL = - \\@group(0) @binding(0) var mat_a : array; - \\@group(0) @binding(1) var mat_c : array; - \\fn main(@builtin(global_invocation_id) gid : vec3) { - \\ let idx = gid.y * 4u + gid.x; - \\ mat_c[idx] = mat_a[idx] * 5.0; // hardcoded for demo - \\} -; - -// ... SUB, DIV similar -``` - ---- - -## Usage - -```zig -var gpu_alloc = try AllocatorGPU.init(allocator); -defer gpu_alloc.deinit(); - -var m1 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator); -var m2 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator); -defer m1.deinit(); -defer m2.deinit(); - -// Chain: lazy, no GPU work yet -m1.add(m2).mul(5).sub(m1).compute(); // ← NOW executes all at once - -// m1.buffer contains final result -``` - ---- - -## Memory Breakdown - -| Buffer | Lifetime | Size | Notes | -|--------|----------|------|-------| -| `m1.buffer` | Persistent (user owns) | N×4 bytes | Input + final output | -| `m2.buffer` | Persistent (user owns) | N×4 bytes | Input (read-only) | -| `buf_temp` (ping-pong) | compute() scope | N×4 bytes | Allocated/freed per compute() | - -**Max GPU RAM for 3-op chain:** 2×buffer + 1×temp = 3× data size. Not 4×. - ---- - -## Key Points - -- **One command buffer:** all ops fused, single GPU submit -- **Ping-pong:** swap buf_read ↔ buf_write after each op (no extra allocs) -- **Lazy:** operations queued until `.compute()` called -- **No intermediate tensors:** user doesn't allocate intermediate results -- **Per-compute cleanup:** temp buffer freed immediately after execution - -Can now chain 100 ops with same 3-buffer peak.