zig-wgpu/src/Mat.zig
2026-05-17 20:39:03 +02:00

199 lines
6.3 KiB
Zig

/// Dummy
const std = @import("std");
const c = @import("c.zig").c;
const GpuAllocator = @import("GpuAllocator.zig");
const GpuBuffer = @import("GpuBuffer.zig");
const Mat = @This();
buf: GpuBuffer,
rows: usize,
cols: usize,
/// Creates a GPU-resident matrix and uploads `data` into it.
///
/// A buffer of `data.len * @sizeOf(f32)` bytes is created with Storage,
/// CopyDst and CopySrc usage, and the host data is handed to
/// `wgpuQueueWriteBuffer` on `gloc.queue`.
///
/// `data.len` must equal `rows * cols`; this is checked with
/// `std.debug.assert`, not reported as an error.
///
/// Returns the new `Mat`, or whatever error `GpuBuffer.init` reports.
/// Release the result with `deinit`.
pub fn load(
    gloc: *GpuAllocator,
    data: []const f32,
    rows: usize,
    cols: usize,
) !Mat {
    const expected_len = rows * cols;
    std.debug.assert(data.len == expected_len);

    const byte_count = data.len * @sizeOf(f32);
    const usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc;

    const gpu_buf = try GpuBuffer.init(gloc, byte_count, usage);
    // Queue the host -> device upload starting at offset 0 of the new buffer.
    c.wgpuQueueWriteBuffer(gloc.queue, gpu_buf.raw, 0, data.ptr, byte_count);

    return Mat{
        .buf = gpu_buf,
        .rows = rows,
        .cols = cols,
    };
}
/// Creates a `rows` x `cols` matrix backed by a fresh GPU buffer.
///
/// The buffer is sized for `rows * cols` `f32` values and created with
/// Storage, CopyDst and CopySrc usage. This function writes no data itself.
/// NOTE(review): the name implies the contents are zero; that presumably
/// relies on the buffer being zero-initialised at creation — confirm in
/// `GpuBuffer.init`.
///
/// Returns the new `Mat`, or whatever error `GpuBuffer.init` reports.
/// Release the result with `deinit`.
pub fn zeros(gloc: *GpuAllocator, rows: usize, cols: usize) !Mat {
    const elem_count: u64 = @as(u64, rows) * cols;
    const byte_count: u64 = elem_count * @sizeOf(f32);
    const usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc;

    const gpu_buf = try GpuBuffer.init(gloc, byte_count, usage);
    return Mat{
        .buf = gpu_buf,
        .rows = rows,
        .cols = cols,
    };
}
/// Releases the matrix's backing storage by calling `deinit` on `buf`.
pub fn deinit(self: Mat) void {
self.buf.deinit();
}
/// Returns the total number of elements, i.e. `rows * cols`.
pub fn len(self: Mat) usize {
    const elem_count = self.rows * self.cols;
    return elem_count;
}
/// Returns the size of the matrix contents in bytes: `len()` elements of
/// `@sizeOf(f32)` bytes each, computed in `u64`.
pub fn byteSize(self: Mat) u64 {
    const elem_count: u64 = self.len();
    return elem_count * @sizeOf(f32);
}
/// Runs the `gloc.pipelines.add` compute pipeline over `self` and `other`,
/// writing into a newly created matrix of the same shape.
///
/// Both operands must have identical `rows` and `cols`; this is checked with
/// `std.debug.assert`, not reported as an error.
///
/// Returns the result matrix (release it with `deinit`). If dispatching
/// fails, the freshly created result is released before the error is
/// propagated.
pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat {
    const same_shape = self.rows == other.rows and self.cols == other.cols;
    std.debug.assert(same_shape);

    const sum = try Mat.zeros(gloc, self.rows, self.cols);
    errdefer sum.deinit();

    const byte_count = self.byteSize();
    try dispatch2in1out(
        gloc,
        gloc.pipelines.add,
        self.buf,
        other.buf,
        sum.buf,
        byte_count,
    );
    return sum;
}
/// Copies the matrix contents from the GPU into a newly allocated host slice.
///
/// Sequence:
///   1. allocate `len()` floats from `alloc`;
///   2. create a staging buffer with MapRead | CopyDst usage;
///   3. record and submit a buffer-to-buffer copy from `buf` to the staging
///      buffer;
///   4. request a read mapping and call `gloc.poll()` until the map callback
///      has fired;
///   5. copy the mapped range into the host slice and unmap.
///
/// The caller owns the returned slice and must free it with `alloc`.
///
/// Errors:
///   - allocation failure from `alloc.alloc`;
///   - any error reported by `GpuBuffer.init`;
///   - `error.Encoder` if no command encoder could be created;
///   - `error.MapFailed` if the map callback reports a status other than
///     `WGPUMapAsyncStatus_Success`.
/// On every error path the host slice is freed again.
pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
    const out = try alloc.alloc(f32, self.len());
    // Without this the slice would leak on any of the error returns below.
    errdefer alloc.free(out);

    // Nothing to copy for an empty matrix; skip the GPU round-trip.
    if (out.len == 0) return out;

    const bytes = self.byteSize();
    const staging = try GpuBuffer.init(
        gloc,
        bytes,
        c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
    );
    defer staging.deinit();

    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder;
    defer c.wgpuCommandEncoderRelease(enc);
    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandBufferRelease(cmd);
    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);

    // Tri-state instead of a bool: a failed mapping must end the wait loop
    // too, otherwise a non-success status would spin here forever.
    var map_state: MapState = .pending;
    staging.mapAsync(
        c.WGPUMapMode_Read,
        0,
        bytes,
        .{ .callback = onMapped, .userdata1 = &map_state },
    );
    while (map_state == .pending) gloc.poll();
    if (map_state != .ready) return error.MapFailed;

    const ptr: [*]const f32 = @ptrCast(@alignCast(
        staging.getConstMappedRange(0, bytes),
    ));
    @memcpy(out, ptr[0..out.len]);
    staging.unmap();
    return out;
}

/// Progress of an asynchronous buffer mapping, written by `onMapped`.
const MapState = enum { pending, ready, failed };

/// Buffer-map callback used by `read`.
///
/// `userdata1` must point at a `MapState`; it is set to `.ready` when
/// `status` is `WGPUMapAsyncStatus_Success` and to `.failed` for every other
/// status. The message and second userdata argument are ignored.
fn onMapped(
    status: c.WGPUMapAsyncStatus,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
) callconv(.c) void {
    const state: *MapState = @ptrCast(@alignCast(userdata1.?));
    state.* = if (status == c.WGPUMapAsyncStatus_Success) .ready else .failed;
}
// ── Dispatch helpers ──────────────────────────────────────────────────────────
/// Runs a two-input, one-output compute pipeline over `bytes` bytes of the
/// given buffers, split into chunks of at most 1 GiB (used by `add`).
///
/// For each chunk a 4-byte uniform buffer holding the chunk's element count
/// is created, bindings 0..2 are pointed at the matching sub-range of
/// `buf_a`, `buf_b` and `buf_out`, binding 3 at the uniform buffer, and the
/// work is handed to `submitPass`. The uniform buffer is released at the end
/// of each iteration.
///
/// The element count is `chunk_bytes / @sizeOf(f32)` (truncating).
/// NOTE(review): presumably `bytes` is always a multiple of 4 — confirm
/// against callers.
///
/// Propagates errors from `GpuBuffer.init` and `submitPass`.
fn dispatch2in1out(
    gloc: *GpuAllocator,
    pipeline: c.WGPUComputePipeline,
    buf_a: GpuBuffer,
    buf_b: GpuBuffer,
    buf_out: GpuBuffer,
    bytes: u64,
) !void {
    const chunk_cap: u64 = 1024 * 1024 * 1024; // 1 GiB per submitted pass
    const uniform_usage = c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst;

    var cursor: u64 = 0;
    while (cursor < bytes) {
        // Extent of this chunk, in bytes and in f32 elements.
        const remaining = bytes - cursor;
        const span = @min(chunk_cap, remaining);
        const count: u32 = @intCast(span / @sizeOf(f32));

        // Per-chunk uniform carrying the element count for this pass.
        const count_buf = try GpuBuffer.init(gloc, @sizeOf(u32), uniform_usage);
        defer count_buf.deinit();
        c.wgpuQueueWriteBuffer(gloc.queue, count_buf.raw, 0, &count, @sizeOf(u32));

        // Bind only the [cursor, cursor + span) window of each data buffer.
        const bindings = [_]c.WGPUBindGroupEntry{
            .{ .binding = 0, .buffer = buf_a.raw, .offset = cursor, .size = span },
            .{ .binding = 1, .buffer = buf_b.raw, .offset = cursor, .size = span },
            .{ .binding = 2, .buffer = buf_out.raw, .offset = cursor, .size = span },
            .{ .binding = 3, .buffer = count_buf.raw, .offset = 0, .size = @sizeOf(u32) },
        };

        try submitPass(gloc, pipeline, &bindings, count);
        cursor += span;
    }
}
/// Creates a bind group from `entries` for group 0 of `pipeline`, records a
/// single compute pass and submits it on `gloc.queue`.
///
/// `n` is the number of elements to cover. The pass dispatches
/// `ceil(n / 256)` workgroups along X, clamped to 65535.
/// NOTE(review): when the clamp applies, fewer than `n` invocations are
/// launched; presumably the shader loops over the remainder — confirm against
/// the shader source.
///
/// Errors: `error.BindGroup` if the bind group cannot be created,
/// `error.Encoder` if the command encoder cannot be created. The bind group
/// and its layout are released on every exit path.
fn submitPass(
    gloc: *GpuAllocator,
    pipeline: c.WGPUComputePipeline,
    entries: []const c.WGPUBindGroupEntry,
    n: usize,
) !void {
    const layout = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
    defer c.wgpuBindGroupLayoutRelease(layout);

    const bind_group = c.wgpuDeviceCreateBindGroup(gloc.device, &.{
        .layout = layout,
        .entries = entries.ptr,
        .entryCount = entries.len,
    }) orelse return error.BindGroup;
    defer c.wgpuBindGroupRelease(bind_group);

    const encoder = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse
        return error.Encoder;
    defer c.wgpuCommandEncoderRelease(encoder);

    // One invocation per element, 256 invocations per workgroup, capped at
    // 65535 workgroups.
    const workgroup_size = 256;
    const max_workgroups = 65535;
    const wanted_groups = ceilDiv(n, workgroup_size);
    const group_count: u32 = @intCast(@min(wanted_groups, max_workgroups));

    const pass = c.wgpuCommandEncoderBeginComputePass(encoder, null);
    c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
    c.wgpuComputePassEncoderSetBindGroup(pass, 0, bind_group, 0, null);
    c.wgpuComputePassEncoderDispatchWorkgroups(pass, group_count, 1, 1);
    c.wgpuComputePassEncoderEnd(pass);
    c.wgpuComputePassEncoderRelease(pass);

    const commands = c.wgpuCommandEncoderFinish(encoder, null);
    defer c.wgpuCommandBufferRelease(commands);
    c.wgpuQueueSubmit(gloc.queue, 1, &commands);
}
/// Returns `n / d` rounded up to the next integer.
///
/// Computed as the truncated quotient plus one when there is a remainder, so
/// it cannot overflow for any `n` (unlike `(n + d - 1) / d`, which overflows
/// when `n` is close to the maximum `usize`).
///
/// `d` must be non-zero; a zero divisor is a division by zero.
fn ceilDiv(n: usize, d: usize) usize {
    return n / d + @intFromBool(n % d != 0);
}