193 lines
5.9 KiB
Zig
193 lines
5.9 KiB
Zig
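//! GPU-backed vector of `f32` values: CPU-to-GPU upload, a two-input compute dispatch, and GPU-to-CPU readback.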
|
|
const std = @import("std");
|
|
const c = @import("c.zig").c;
|
|
const GpuAllocator = @import("GpuAllocator.zig");
|
|
const GpuBuffer = @import("GpuBuffer.zig");
|
|
const GpuDevice = @import("GpuDevice.zig");
|
|
const GpuPipeline = @import("GpuPipeline.zig");
|
|
|
|
const Vec = @This();
|
|
|
|
/// GPU buffer holding the vector's elements; `initZero` sizes it as `len * @sizeOf(f32)` bytes.
buf: GpuBuffer,
|
|
/// Number of `f32` elements in the vector (not a byte count; see `byteSize`).
len: usize,
|
|
|
|
/// Allocates a GPU buffer sized for `len` `f32` elements and wraps it in a `Vec`.
///
/// The buffer is created with Storage, CopyDst and CopySrc usage, which is what
/// the rest of this file needs: storage bindings for compute passes, queue
/// writes from the CPU, and buffer-to-buffer copies for readback.
/// No data is uploaded here. NOTE(review): the "zero" in the name presumably
/// relies on the backend zero-initialising new buffers — confirm.
///
/// Returns any error produced by `GpuBuffer.init`.
pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
    const byte_len = len * @sizeOf(f32);
    const usage = c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc;
    const storage = try GpuBuffer.init(gloc, byte_len, usage);
    return .{ .buf = storage, .len = len };
}
|
|
|
|
/// Creates a `Vec` with `data.len` elements and uploads `data` into it.
///
/// The buffer is allocated through `initZero` and filled through `load`, using
/// the device stored on `gloc`. If the upload step returns an error, the
/// freshly created buffer is released before the error propagates, so the
/// error path does not leak GPU memory.
///
/// The caller owns the returned `Vec` and releases it with `deinit`.
pub fn initLoad(gloc: *GpuAllocator, data: []const f32) !Vec {
    // `load` takes the vector by value, so the binding never needs to be mutable.
    const self = try initZero(gloc, data.len);
    errdefer self.deinit();

    try self.load(gloc.device, data);
    return self;
}
|
|
|
|
/// Releases the vector's GPU buffer by delegating to `GpuBuffer.deinit`.
/// The `Vec` must not be used afterwards.
pub fn deinit(self: Vec) void {
    const buffer = self.buf;
    buffer.deinit();
}
|
|
|
|
/// CPU to GPU.
///
/// Queues a write of `data` to the start of this vector's buffer on
/// `device.queue`. The write covers `data.len * @sizeOf(f32)` bytes.
///
/// Returns `error.LengthMismatch` when `data.len` differs from `self.len`.
/// The length comes from the caller, so it is validated with a real error
/// rather than an assert (which would be unchecked in ReleaseFast builds).
pub fn load(
    self: Vec,
    device: GpuDevice,
    data: []const f32,
) !void {
    if (data.len != self.len) return error.LengthMismatch;

    const byte_count = data.len * @sizeOf(f32);
    c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, byte_count);
}
|
|
|
|
/// Size of the vector's contents in bytes: `len` elements of `f32`.
/// The multiplication is carried out in `u64`.
pub fn byteSize(self: Vec) u64 {
    const element_count: u64 = self.len;
    return element_count * @sizeOf(f32);
}
|
|
|
|
/// Runs the 2-input, 1-output compute pipeline `pip` over `self` and `other`
/// and returns the output as a new `Vec`.
///
/// A result vector of `self.len` elements is allocated from `gloc`, then
/// `dispatch2in1out` encodes and submits the work over `self.byteSize()`
/// bytes. The work is only submitted; nothing here waits for completion.
///
/// The caller owns the returned `Vec` and releases it with `deinit`. If the
/// dispatch fails, the result buffer is released before the error propagates.
///
/// Returns `error.LengthMismatch` when the operands differ in length. The
/// lengths come from the caller, so they are validated with a real error
/// rather than an assert (which would be unchecked in ReleaseFast builds).
pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
    if (self.len != other.len) return error.LengthMismatch;

    const result = try Vec.initZero(gloc, self.len);
    errdefer result.deinit();

    try dispatch2in1out(gloc, pip.raw, self.buf, other.buf, result.buf, self.byteSize());

    return result;
}
|
|
|
|
/// GPU to CPU.
///
/// Copies the vector's contents into a newly allocated slice of `self.len`
/// values. The caller owns the returned slice and frees it with `alloc`.
///
/// The data is first copied on the GPU into a temporary staging buffer
/// (MapRead | CopyDst), which is then mapped and copied into host memory.
/// This call blocks: it polls the device until the map callback has fired.
///
/// Errors: allocation failure from `alloc`, any `GpuBuffer.init` error,
/// `error.Encoder` when no command encoder could be created, and
/// `error.MapFailed` when the staging buffer could not be mapped. The output
/// slice is freed on every error path.
pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
    const out = try alloc.alloc(f32, self.len);
    errdefer alloc.free(out);

    // Nothing to transfer for an empty vector; skip the GPU round trip.
    if (self.len == 0) return out;

    const bytes = self.byteSize();

    const staging = try GpuBuffer.init(
        gloc,
        bytes,
        c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
    );
    defer staging.deinit();

    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
    defer c.wgpuCommandEncoderRelease(enc);

    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandBufferRelease(cmd);
    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);

    // Completion and success are tracked separately so that a failed map
    // ends the polling loop instead of spinning forever.
    var map_state: MapState = .{};
    staging.mapAsync(
        c.WGPUMapMode_Read,
        0,
        bytes,
        .{ .callback = onMapped, .userdata1 = &map_state },
    );
    while (!map_state.done) gloc.device.poll();
    if (!map_state.ok) return error.MapFailed;

    const ptr: [*]const f32 = @ptrCast(@alignCast(
        staging.getConstMappedRange(0, bytes),
    ));
    @memcpy(out[0..self.len], ptr[0..self.len]);
    staging.unmap();

    return out;
}

/// Completion record shared between `read` and the `onMapped` callback.
const MapState = struct {
    /// Set once the callback has run, whatever the outcome.
    done: bool = false,
    /// True only when the callback reported `WGPUMapAsyncStatus_Success`.
    ok: bool = false,
};

/// Buffer-map callback used by `read`. `userdata1` must point at a `MapState`.
/// Records whether the map succeeded and then marks the request as finished.
fn onMapped(
    status: c.WGPUMapAsyncStatus,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
) callconv(.c) void {
    const state: *MapState = @ptrCast(@alignCast(userdata1.?));
    state.ok = (status == c.WGPUMapAsyncStatus_Success);
    // Written last so the poller never observes `done` before `ok` is final.
    state.done = true;
}
|
|
|
|
/// Encode + submit a 2-input, 1-output compute pass (used by `run`).
///
/// The `bytes`-long range of the three buffers is processed in chunks of at
/// most 1 GiB. For each chunk:
///   * a small uniform buffer holding the chunk's element count (`u32`) is
///     created and filled via a queue write,
///   * `buf_a`, `buf_b` and `buf_out` are bound at bindings 0, 1 and 2 with
///     the chunk's byte offset and size, and the uniform at binding 3,
///   * the pass is handed to `submitPass`.
///
/// `bytes` is expected to be a multiple of `@sizeOf(f32)`; the element count
/// is obtained by integer division. A `bytes` of 0 submits nothing.
///
/// NOTE(review): binding up to 1 GiB per storage binding presumably relies on
/// the device having been created with a sufficiently large storage-binding
/// limit — confirm against the device setup.
///
/// Returns any error from `GpuBuffer.init` or `submitPass`.
fn dispatch2in1out(
    gloc: *GpuAllocator,
    pipeline: c.WGPUComputePipeline,
    buf_a: GpuBuffer,
    buf_b: GpuBuffer,
    buf_out: GpuBuffer,
    bytes: u64,
) !void {
    const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GiB

    var offset: u64 = 0;
    while (offset < bytes) {
        // Bounds of the current chunk; the last chunk may be shorter.
        const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
        const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f32));

        // Uniform buffer carrying this chunk's element count. It is released
        // at the end of the iteration, after the pass has been submitted.
        const info_buf = try GpuBuffer.init(
            gloc,
            @sizeOf(u32),
            c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst,
        );
        defer info_buf.deinit();

        // Write the number of elements *in this chunk* to the uniform buffer.
        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));

        // Bind only the sub-range for this chunk using `.offset` and `.size`.
        const entries = [_]c.WGPUBindGroupEntry{
            .{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes },
            .{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes },
            .{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes },
            .{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) },
        };

        // Submit the pass for this specific chunk.
        try submitPass(gloc, pipeline, &entries, current_chunk_elements);

        offset += current_chunk_bytes;
    }
}
|
|
|
|
/// Create bind group, encode pass, submit.
///
/// Builds a bind group from `entries` against the pipeline's layout for
/// group 0, records one compute pass that dispatches enough 256-wide
/// workgroups to cover `n` items (capped at 65535 groups along X), and submits
/// the resulting command buffer to the device queue. Every object created
/// here is released before returning.
///
/// NOTE(review): when `n` needs more than 65535 workgroups the dispatch is
/// clamped rather than split; presumably the shader loops over the remaining
/// items itself — confirm against the shader source.
///
/// Returns `error.BindGroup` or `error.Encoder` when the corresponding object
/// could not be created.
fn submitPass(
    gloc: *GpuAllocator,
    pipeline: c.WGPUComputePipeline,
    entries: []const c.WGPUBindGroupEntry,
    n: usize,
) !void {
    const threads_per_group = 256;
    const group_limit = 65535;

    // Pure arithmetic, so it can be settled before any GPU object exists.
    const group_count = @min(ceilDiv(n, threads_per_group), group_limit);

    const layout = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
    defer c.wgpuBindGroupLayoutRelease(layout);

    const bind_group = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
        .layout = layout,
        .entries = entries.ptr,
        .entryCount = entries.len,
    }) orelse return error.BindGroup;
    defer c.wgpuBindGroupRelease(bind_group);

    const encoder = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
        return error.Encoder;
    defer c.wgpuCommandEncoderRelease(encoder);

    {
        const compute_pass = c.wgpuCommandEncoderBeginComputePass(encoder, null);
        c.wgpuComputePassEncoderSetPipeline(compute_pass, pipeline);
        c.wgpuComputePassEncoderSetBindGroup(compute_pass, 0, bind_group, 0, null);
        c.wgpuComputePassEncoderDispatchWorkgroups(compute_pass, @intCast(group_count), 1, 1);
        c.wgpuComputePassEncoderEnd(compute_pass);
        c.wgpuComputePassEncoderRelease(compute_pass);
    }

    const commands = c.wgpuCommandEncoderFinish(encoder, null);
    defer c.wgpuCommandBufferRelease(commands);
    c.wgpuQueueSubmit(gloc.device.queue, 1, &commands);
}
|
|
|
|
/// Integer division of `n` by `d`, rounded up to the next whole number.
///
/// Computed as quotient plus one-if-remainder, so it never forms `n + d - 1`
/// and cannot overflow, even for `n` close to the maximum `usize`.
/// `d` must be non-zero.
fn ceilDiv(n: usize, d: usize) usize {
    return n / d + @intFromBool(n % d != 0);
}
|