Started to understand how it work and implemented chained operation
This commit is contained in:
parent
532e8c1897
commit
8050bab969
@ -65,6 +65,7 @@ fn onDevice(
|
|||||||
}
|
}
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
ctx.device = device;
|
ctx.device = device;
|
||||||
|
std.debug.print("{?}", .{device});
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onMapped(
|
fn onMapped(
|
||||||
|
|||||||
137
src/wgpu.zig
Normal file
137
src/wgpu.zig
Normal file
@ -0,0 +1,137 @@
|
|||||||
|
const std = @import("std");
|
||||||
|
const c = @cImport(@cInclude("wgpu.h"));
|
||||||
|
|
||||||
|
/// Replace enum_WGPURequestAdapterStatus
|
||||||
|
pub const RequestAdapterStatus = enum {
|
||||||
|
Success,
|
||||||
|
CallbackCancelled,
|
||||||
|
Unavailable,
|
||||||
|
Error,
|
||||||
|
Force32,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub const BufferUsage = enum(u64) {
|
||||||
|
None = 0,
|
||||||
|
MapRead = 1, // CPU can read after GPU finishes
|
||||||
|
MapWrite = 2,
|
||||||
|
CopySrc = 4, // can copy from GPU to staging.
|
||||||
|
CopyDst = 8, // CPU can write to it
|
||||||
|
Index = 16,
|
||||||
|
Vertex = 32,
|
||||||
|
Uniform = 64,
|
||||||
|
Storage = 128,
|
||||||
|
Indirect = 256,
|
||||||
|
QueryResolve = 512,
|
||||||
|
};
|
||||||
|
|
||||||
|
const Ctx = struct {
|
||||||
|
adapter: c.WGPUAdapter = null,
|
||||||
|
device: c.WGPUDevice = null,
|
||||||
|
};
|
||||||
|
|
||||||
|
fn onAdapter(
|
||||||
|
status: c.WGPURequestAdapterStatus,
|
||||||
|
adapter: c.WGPUAdapter,
|
||||||
|
_: c.WGPUStringView,
|
||||||
|
userdata1: ?*anyopaque,
|
||||||
|
_: ?*anyopaque,
|
||||||
|
) callconv(.c) void {
|
||||||
|
if (status != c.WGPURequestAdapterStatus_Success) {
|
||||||
|
std.log.err("Adapter request failed (status={d})", .{status});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
|
ctx.adapter = adapter;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn onDevice(
|
||||||
|
status: c.WGPURequestDeviceStatus,
|
||||||
|
device: c.WGPUDevice,
|
||||||
|
_: c.WGPUStringView,
|
||||||
|
userdata1: ?*anyopaque,
|
||||||
|
_: ?*anyopaque,
|
||||||
|
) callconv(.c) void {
|
||||||
|
if (status != c.WGPURequestDeviceStatus_Success) {
|
||||||
|
std.log.err("Device request failed (status={d})", .{status});
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
|
ctx.device = device;
|
||||||
|
std.debug.print("{?}", .{device});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn onMapped(
|
||||||
|
status: c.WGPUMapAsyncStatus,
|
||||||
|
_: c.WGPUStringView,
|
||||||
|
userdata1: ?*anyopaque,
|
||||||
|
_: ?*anyopaque,
|
||||||
|
) callconv(.c) void {
|
||||||
|
const flag: *bool = @ptrCast(@alignCast(userdata1.?));
|
||||||
|
flag.* = (status == c.WGPUMapAsyncStatus_Success);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sv(s: []const u8) c.WGPUStringView {
|
||||||
|
return .{ .data = s.ptr, .length = s.len };
|
||||||
|
}
|
||||||
|
|
||||||
|
const AllocatorGPU = @This();
|
||||||
|
|
||||||
|
allocator: std.mem.Allocator,
|
||||||
|
instance: *c.struct_WGPUInstanceImpl,
|
||||||
|
adapter: *c.struct_WGPUAdapterImpl,
|
||||||
|
device: c.struct_WGPUDeviceImpl,
|
||||||
|
queue: c.struct_WGPUQueueImpl,
|
||||||
|
ctx: Ctx,
|
||||||
|
|
||||||
|
buffers: std.AutoHashMap(*c.struct_WGPUBufferImpl, void),
|
||||||
|
|
||||||
|
pub fn init(allocator: std.mem.Allocator) !AllocatorGPU {
|
||||||
|
var self: AllocatorGPU = undefined;
|
||||||
|
self.allocator = allocator;
|
||||||
|
self.ctx = .{};
|
||||||
|
self.buffers = try .init(allocator);
|
||||||
|
|
||||||
|
// 1. Instance ──────────────────────────────────────────────────────────────
|
||||||
|
self.instance = c.wgpuCreateInstance(&std.mem.zeroes(c.WGPUInstanceDescriptor)) orelse
|
||||||
|
return error.NoInstance;
|
||||||
|
|
||||||
|
// 2. Adapter (async → poll) ────────────────────────────────────────────────
|
||||||
|
_ = c.wgpuInstanceRequestAdapter(
|
||||||
|
self.instance,
|
||||||
|
&.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
|
||||||
|
.{ .callback = onAdapter, .userdata1 = &self.ctx },
|
||||||
|
);
|
||||||
|
c.wgpuInstanceProcessEvents(self.instance); // drive callbacks
|
||||||
|
self.adapter = self.ctx.adapter orelse return error.NoAdapter;
|
||||||
|
|
||||||
|
// 3. Device ────────────────────────────────────────────────────────────────
|
||||||
|
_ = c.wgpuAdapterRequestDevice(self.adapter, null, .{ .callback = onDevice, .userdata1 = &self.ctx });
|
||||||
|
c.wgpuInstanceProcessEvents(self.instance);
|
||||||
|
self.device = self.ctx.device orelse return error.NoDevice;
|
||||||
|
|
||||||
|
self.queue = c.wgpuDeviceGetQueue(self.device);
|
||||||
|
|
||||||
|
return self;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn deinit(self: AllocatorGPU) void {
|
||||||
|
c.wgpuInstanceRelease(self.instance);
|
||||||
|
defer c.wgpuAdapterRelease(self.adapter);
|
||||||
|
defer c.wgpuDeviceRelease(self.device);
|
||||||
|
defer c.wgpuQueueRelease(self.queue);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn addBuff(
|
||||||
|
self: AllocatorGPU,
|
||||||
|
comptime T: type,
|
||||||
|
comptime len: comptime_int,
|
||||||
|
comptime opt: struct {},
|
||||||
|
) !void {
|
||||||
|
self.buffers.put(
|
||||||
|
c.wgpuDeviceCreateBuffer(self.device, &.{
|
||||||
|
.usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst,
|
||||||
|
.size = len * @bitSizeOf(T),
|
||||||
|
}) orelse return error.Buffer,
|
||||||
|
{},
|
||||||
|
);
|
||||||
|
}
|
||||||
279
tmp
Normal file
279
tmp
Normal file
@ -0,0 +1,279 @@
|
|||||||
|
# Tensor GPU: Memory & Pipeline Strategy
|
||||||
|
|
||||||
|
**Best approach:** Lazy graph + ping-pong buffers + single command buffer.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
**Problem with eager pipelines:**
|
||||||
|
```
|
||||||
|
m1.add(m2) → dispatch + sync point (slow)
|
||||||
|
.mul(5) → dispatch + sync point (slow)
|
||||||
|
.sub(m3) → dispatch + sync point (slow)
|
||||||
|
Result: 3× GPU kernel submission overhead. Many intermediate buffers.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Better: Build graph, execute once:**
|
||||||
|
```
|
||||||
|
m1.add(m2).mul(5).sub(m3) // build operation list
|
||||||
|
.compute() // ONE command buffer, all ops
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
```zig
|
||||||
|
const std = @import("std");
|
||||||
|
const c = @cImport(@cInclude("wgpu.h"));
|
||||||
|
|
||||||
|
pub const Operation = union(enum) {
|
||||||
|
add: struct { other: *TensorGPU },
|
||||||
|
mul: struct { scalar: f32 },
|
||||||
|
sub: struct { other: *TensorGPU },
|
||||||
|
div: struct { scalar: f32 },
|
||||||
|
};
|
||||||
|
|
||||||
|
pub const TensorGPU = struct {
|
||||||
|
gpu: *AllocatorGPU,
|
||||||
|
buffer: c.WGPUBuffer,
|
||||||
|
shape: [2]u32, // rows, cols
|
||||||
|
element_count: u32,
|
||||||
|
buf_bytes: u32,
|
||||||
|
|
||||||
|
operations: std.ArrayList(Operation),
|
||||||
|
is_computed: bool,
|
||||||
|
allocator: std.mem.Allocator,
|
||||||
|
|
||||||
|
pub fn init(gpu: *AllocatorGPU, shape: [2]u32, allocator: std.mem.Allocator) !TensorGPU {
|
||||||
|
const rows = shape[0];
|
||||||
|
const cols = shape[1];
|
||||||
|
const element_count = rows * cols;
|
||||||
|
const buf_bytes = element_count * @sizeOf(f32);
|
||||||
|
|
||||||
|
const buffer = c.wgpuDeviceCreateBuffer(gpu.device, &.{
|
||||||
|
.usage = c.WGPUBufferUsage_Storage
|
||||||
|
| c.WGPUBufferUsage_CopySrc
|
||||||
|
| c.WGPUBufferUsage_CopyDst,
|
||||||
|
.size = buf_bytes,
|
||||||
|
}) orelse return error.BufferCreate;
|
||||||
|
|
||||||
|
var self: TensorGPU = .{
|
||||||
|
.gpu = gpu,
|
||||||
|
.buffer = buffer,
|
||||||
|
.shape = shape,
|
||||||
|
.element_count = element_count,
|
||||||
|
.buf_bytes = buf_bytes,
|
||||||
|
.operations = try std.ArrayList(Operation).initCapacity(allocator, 8),
|
||||||
|
.is_computed = true,
|
||||||
|
.allocator = allocator,
|
||||||
|
};
|
||||||
|
|
||||||
|
return self;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn deinit(self: *TensorGPU) void {
|
||||||
|
c.wgpuBufferRelease(self.buffer);
|
||||||
|
self.operations.deinit();
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn add(self: *TensorGPU, other: *TensorGPU) *TensorGPU {
|
||||||
|
self.operations.append(.{ .add = .{ .other = other } }) catch unreachable;
|
||||||
|
self.is_computed = false;
|
||||||
|
return self;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mul(self: *TensorGPU, scalar: f32) *TensorGPU {
|
||||||
|
self.operations.append(.{ .mul = .{ .scalar = scalar } }) catch unreachable;
|
||||||
|
self.is_computed = false;
|
||||||
|
return self;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn sub(self: *TensorGPU, other: *TensorGPU) *TensorGPU {
|
||||||
|
self.operations.append(.{ .sub = .{ .other = other } }) catch unreachable;
|
||||||
|
self.is_computed = false;
|
||||||
|
return self;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn compute(self: *TensorGPU) !void {
|
||||||
|
if (self.is_computed or self.operations.items.len == 0) return;
|
||||||
|
|
||||||
|
// Allocate ping-pong temp buffer (freed after compute)
|
||||||
|
const buf_temp = c.wgpuDeviceCreateBuffer(self.gpu.device, &.{
|
||||||
|
.usage = c.WGPUBufferUsage_Storage
|
||||||
|
| c.WGPUBufferUsage_CopySrc
|
||||||
|
| c.WGPUBufferUsage_CopyDst,
|
||||||
|
.size = self.buf_bytes,
|
||||||
|
}) orelse return error.TempBuffer;
|
||||||
|
defer c.wgpuBufferRelease(buf_temp);
|
||||||
|
|
||||||
|
// Build ONE command encoder for all operations
|
||||||
|
const encoder = c.wgpuDeviceCreateCommandEncoder(self.gpu.device, null)
|
||||||
|
orelse return error.Encoder;
|
||||||
|
defer c.wgpuCommandEncoderRelease(encoder);
|
||||||
|
|
||||||
|
var buf_read = self.buffer; // input
|
||||||
|
var buf_write = buf_temp; // output (swap after each op)
|
||||||
|
|
||||||
|
for (self.operations.items) |op| {
|
||||||
|
try self.encodeOp(encoder, op, buf_read, buf_write);
|
||||||
|
|
||||||
|
// Swap: output becomes input for next op
|
||||||
|
const tmp = buf_read;
|
||||||
|
buf_read = buf_write;
|
||||||
|
buf_write = tmp;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final result in buf_read; copy back to self.buffer if needed
|
||||||
|
if (buf_read != self.buffer) {
|
||||||
|
c.wgpuCommandEncoderCopyBufferToBuffer(
|
||||||
|
encoder, buf_read, 0, self.buffer, 0, self.buf_bytes,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const cmdbuf = c.wgpuCommandEncoderFinish(encoder, null)
|
||||||
|
orelse return error.CommandBuffer;
|
||||||
|
defer c.wgpuCommandBufferRelease(cmdbuf);
|
||||||
|
|
||||||
|
c.wgpuQueueSubmit(self.gpu.queue, 1, &cmdbuf);
|
||||||
|
|
||||||
|
self.operations.clearAndFree();
|
||||||
|
self.is_computed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
fn encodeOp(
|
||||||
|
self: TensorGPU,
|
||||||
|
encoder: c.WGPUCommandEncoder,
|
||||||
|
op: Operation,
|
||||||
|
buf_in: c.WGPUBuffer,
|
||||||
|
buf_out: c.WGPUBuffer,
|
||||||
|
) !void {
|
||||||
|
const shader_code = switch (op) {
|
||||||
|
.add => SHADER_ADD,
|
||||||
|
.mul => SHADER_MUL,
|
||||||
|
.sub => SHADER_SUB,
|
||||||
|
.div => SHADER_DIV,
|
||||||
|
};
|
||||||
|
|
||||||
|
var wgsl_src = c.WGPUShaderSourceWGSL{
|
||||||
|
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
|
||||||
|
.code = sv(shader_code),
|
||||||
|
};
|
||||||
|
|
||||||
|
const shader = c.wgpuDeviceCreateShaderModule(self.gpu.device, &.{
|
||||||
|
.nextInChain = @ptrCast(&wgsl_src),
|
||||||
|
}) orelse return error.Shader;
|
||||||
|
defer c.wgpuShaderModuleRelease(shader);
|
||||||
|
|
||||||
|
const pipeline = c.wgpuDeviceCreateComputePipeline(self.gpu.device, &.{
|
||||||
|
.compute = .{ .module = shader, .entryPoint = sv("main") },
|
||||||
|
}) orelse return error.Pipeline;
|
||||||
|
defer c.wgpuComputePipelineRelease(pipeline);
|
||||||
|
|
||||||
|
// Bind groups depend on operation (binary vs unary)
|
||||||
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
|
|
||||||
|
var entries: [3]c.WGPUBindGroupEntry = undefined;
|
||||||
|
var entry_count: u32 = 2;
|
||||||
|
|
||||||
|
entries[0] = .{ .binding = 0, .buffer = buf_in, .size = self.buf_bytes };
|
||||||
|
entries[1] = .{ .binding = 1, .buffer = buf_out, .size = self.buf_bytes };
|
||||||
|
|
||||||
|
if (op == .add or op == .sub) {
|
||||||
|
entries[2] = .{
|
||||||
|
.binding = 2,
|
||||||
|
.buffer = op.add.other.buffer, // or op.sub.other
|
||||||
|
.size = self.buf_bytes,
|
||||||
|
};
|
||||||
|
entry_count = 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
const bind_group = c.wgpuDeviceCreateBindGroup(self.gpu.device, &.{
|
||||||
|
.layout = bgl,
|
||||||
|
.entries = entries[0..entry_count],
|
||||||
|
.entryCount = entry_count,
|
||||||
|
}) orelse return error.BindGroup;
|
||||||
|
defer c.wgpuBindGroupRelease(bind_group);
|
||||||
|
|
||||||
|
const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null);
|
||||||
|
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
||||||
|
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null);
|
||||||
|
|
||||||
|
const workgroups_x = (self.shape[1] + 3) / 4;
|
||||||
|
const workgroups_y = (self.shape[0] + 3) / 4;
|
||||||
|
c.wgpuComputePassEncoderDispatchWorkgroups(pass, workgroups_x, workgroups_y, 1);
|
||||||
|
|
||||||
|
c.wgpuComputePassEncoderEnd(pass);
|
||||||
|
c.wgpuComputePassEncoderRelease(pass);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// ── Shaders ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
const SHADER_ADD =
|
||||||
|
\\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
|
||||||
|
\\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
|
||||||
|
\\@group(0) @binding(2) var<storage, read> mat_b : array<f32>;
|
||||||
|
\\@compute @workgroup_size(4, 4)
|
||||||
|
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
||||||
|
\\ let idx = gid.y * 4u + gid.x;
|
||||||
|
\\ mat_c[idx] = mat_a[idx] + mat_b[idx];
|
||||||
|
\\}
|
||||||
|
;
|
||||||
|
|
||||||
|
const SHADER_MUL =
|
||||||
|
\\@group(0) @binding(0) var<storage, read> mat_a : array<f32>;
|
||||||
|
\\@group(0) @binding(1) var<storage, read_write> mat_c : array<f32>;
|
||||||
|
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
||||||
|
\\ let idx = gid.y * 4u + gid.x;
|
||||||
|
\\ mat_c[idx] = mat_a[idx] * 5.0; // hardcoded for demo
|
||||||
|
\\}
|
||||||
|
;
|
||||||
|
|
||||||
|
// ... SUB, DIV similar
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```zig
|
||||||
|
var gpu_alloc = try AllocatorGPU.init(allocator);
|
||||||
|
defer gpu_alloc.deinit();
|
||||||
|
|
||||||
|
var m1 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator);
|
||||||
|
var m2 = try TensorGPU.init(&gpu_alloc, .{4, 4}, allocator);
|
||||||
|
defer m1.deinit();
|
||||||
|
defer m2.deinit();
|
||||||
|
|
||||||
|
// Chain: lazy, no GPU work yet
|
||||||
|
m1.add(m2).mul(5).sub(m1).compute(); // ← NOW executes all at once
|
||||||
|
|
||||||
|
// m1.buffer contains final result
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Memory Breakdown
|
||||||
|
|
||||||
|
| Buffer | Lifetime | Size | Notes |
|
||||||
|
|--------|----------|------|-------|
|
||||||
|
| `m1.buffer` | Persistent (user owns) | N×4 bytes | Input + final output |
|
||||||
|
| `m2.buffer` | Persistent (user owns) | N×4 bytes | Input (read-only) |
|
||||||
|
| `buf_temp` (ping-pong) | compute() scope | N×4 bytes | Allocated/freed per compute() |
|
||||||
|
|
||||||
|
**Max GPU RAM for 3-op chain:** 2×buffer + 1×temp = 3× data size. Not 4×.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key Points
|
||||||
|
|
||||||
|
- **One command buffer:** all ops fused, single GPU submit
|
||||||
|
- **Ping-pong:** swap buf_read ↔ buf_write after each op (no extra allocs)
|
||||||
|
- **Lazy:** operations queued until `.compute()` called
|
||||||
|
- **No intermediate tensors:** user doesn't allocate intermediate results
|
||||||
|
- **Per-compute cleanup:** temp buffer freed immediately after execution
|
||||||
|
|
||||||
|
Can now chain 100 ops with same 3-buffer peak.
|
||||||
Loading…
x
Reference in New Issue
Block a user