Compare commits
2 Commits
4725723d42
...
90a7cf946f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
90a7cf946f | ||
|
|
0da02f60c4 |
@ -31,45 +31,7 @@
|
|||||||
// `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
|
// `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
|
||||||
// Once all dependencies are fetched, `zig build` no longer requires
|
// Once all dependencies are fetched, `zig build` no longer requires
|
||||||
// internet connectivity.
|
// internet connectivity.
|
||||||
.dependencies = .{
|
.dependencies = .{},
|
||||||
// See `zig fetch --save <url>` for a command-line interface for adding dependencies.
|
|
||||||
//.example = .{
|
|
||||||
// // When updating this field to a new URL, be sure to delete the corresponding
|
|
||||||
// // `hash`, otherwise you are communicating that you expect to find the old hash at
|
|
||||||
// // the new URL. If the contents of a URL change this will result in a hash mismatch
|
|
||||||
// // which will prevent zig from using it.
|
|
||||||
// .url = "https://example.com/foo.tar.gz",
|
|
||||||
//
|
|
||||||
// // This is computed from the file contents of the directory of files that is
|
|
||||||
// // obtained after fetching `url` and applying the inclusion rules given by
|
|
||||||
// // `paths`.
|
|
||||||
// //
|
|
||||||
// // This field is the source of truth; packages do not come from a `url`; they
|
|
||||||
// // come from a `hash`. `url` is just one of many possible mirrors for how to
|
|
||||||
// // obtain a package matching this `hash`.
|
|
||||||
// //
|
|
||||||
// // Uses the [multihash](https://multiformats.io/multihash/) format.
|
|
||||||
// .hash = "...",
|
|
||||||
//
|
|
||||||
// // When this is provided, the package is found in a directory relative to the
|
|
||||||
// // build root. In this case the package's hash is irrelevant and therefore not
|
|
||||||
// // computed. This field and `url` are mutually exclusive.
|
|
||||||
// .path = "foo",
|
|
||||||
//
|
|
||||||
// // When this is set to `true`, a package is declared to be lazily
|
|
||||||
// // fetched. This makes the dependency only get fetched if it is
|
|
||||||
// // actually used.
|
|
||||||
// .lazy = false,
|
|
||||||
//},
|
|
||||||
},
|
|
||||||
// Specifies the set of files and directories that are included in this package.
|
|
||||||
// Only files and directories listed here are included in the `hash` that
|
|
||||||
// is computed for this package. Only files listed here will remain on disk
|
|
||||||
// when using the zig package manager. As a rule of thumb, one should list
|
|
||||||
// files required for compilation plus any license(s).
|
|
||||||
// Paths are relative to the build root. Use the empty string (`""`) to refer to
|
|
||||||
// the build root itself.
|
|
||||||
// A directory listed here means that all files within, recursively, are included.
|
|
||||||
.paths = .{
|
.paths = .{
|
||||||
"build.zig",
|
"build.zig",
|
||||||
"build.zig.zon",
|
"build.zig.zon",
|
||||||
|
|||||||
@ -4,16 +4,19 @@ const c = @import("c.zig").c;
|
|||||||
|
|
||||||
const GpuAllocator = @This();
|
const GpuAllocator = @This();
|
||||||
|
|
||||||
|
cpu_allocator: std.mem.Allocator,
|
||||||
instance: c.WGPUInstance,
|
instance: c.WGPUInstance,
|
||||||
adapter: c.WGPUAdapter,
|
adapter: c.WGPUAdapter,
|
||||||
device: c.WGPUDevice,
|
device: c.WGPUDevice,
|
||||||
queue: c.WGPUQueue,
|
queue: c.WGPUQueue,
|
||||||
|
|
||||||
|
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
|
||||||
|
|
||||||
// Lazily created, cached for lifetime of allocator
|
// Lazily created, cached for lifetime of allocator
|
||||||
_pip_add: c.WGPUComputePipeline = null,
|
_pip_add: c.WGPUComputePipeline = null,
|
||||||
_pip_scale: c.WGPUComputePipeline = null,
|
_pip_scale: c.WGPUComputePipeline = null,
|
||||||
|
|
||||||
pub fn init() !GpuAllocator {
|
pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator {
|
||||||
const instance = c.wgpuCreateInstance(
|
const instance = c.wgpuCreateInstance(
|
||||||
&std.mem.zeroes(c.WGPUInstanceDescriptor),
|
&std.mem.zeroes(c.WGPUInstanceDescriptor),
|
||||||
) orelse return error.NoInstance;
|
) orelse return error.NoInstance;
|
||||||
@ -38,22 +41,54 @@ pub fn init() !GpuAllocator {
|
|||||||
const device = ctx.device orelse return error.NoDevice;
|
const device = ctx.device orelse return error.NoDevice;
|
||||||
|
|
||||||
return .{
|
return .{
|
||||||
|
.cpu_allocator = cpu_allocator,
|
||||||
.instance = instance,
|
.instance = instance,
|
||||||
.adapter = adapter,
|
.adapter = adapter,
|
||||||
.device = device,
|
.device = device,
|
||||||
.queue = c.wgpuDeviceGetQueue(device),
|
.queue = c.wgpuDeviceGetQueue(device),
|
||||||
|
.tracked_buffers = .init(cpu_allocator),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn deinit(self: *GpuAllocator) void {
|
pub fn deinit(self: *GpuAllocator) void {
|
||||||
if (self._pip_add) |p| c.wgpuComputePipelineRelease(p);
|
if (self._pip_add) |p| c.wgpuComputePipelineRelease(p);
|
||||||
if (self._pip_scale) |p| c.wgpuComputePipelineRelease(p);
|
if (self._pip_scale) |p| c.wgpuComputePipelineRelease(p);
|
||||||
|
|
||||||
|
var it = self.tracked_buffers.keyIterator();
|
||||||
|
while (it.next()) |buf_ptr| {
|
||||||
|
const buf = buf_ptr.*;
|
||||||
|
c.wgpuBufferDestroy(buf);
|
||||||
|
c.wgpuBufferRelease(buf);
|
||||||
|
}
|
||||||
|
self.tracked_buffers.deinit();
|
||||||
|
|
||||||
c.wgpuQueueRelease(self.queue);
|
c.wgpuQueueRelease(self.queue);
|
||||||
c.wgpuDeviceRelease(self.device);
|
c.wgpuDeviceRelease(self.device);
|
||||||
c.wgpuAdapterRelease(self.adapter);
|
c.wgpuAdapterRelease(self.adapter);
|
||||||
c.wgpuInstanceRelease(self.instance);
|
c.wgpuInstanceRelease(self.instance);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn registerBuffer(
|
||||||
|
self: *GpuAllocator,
|
||||||
|
bytes: u64,
|
||||||
|
usage: c.WGPUBufferUsage,
|
||||||
|
) !c.WGPUBuffer {
|
||||||
|
const buf = c.wgpuDeviceCreateBuffer(self.device, &.{
|
||||||
|
.usage = usage,
|
||||||
|
.size = bytes,
|
||||||
|
}) orelse return error.BufferAlloc;
|
||||||
|
|
||||||
|
try self.tracked_buffers.put(buf, {});
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
|
||||||
|
if (self.tracked_buffers.remove(buf)) {
|
||||||
|
c.wgpuBufferDestroy(buf);
|
||||||
|
c.wgpuBufferRelease(buf);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Internal ─────────────────────────────────────────────────────────────
|
// ── Internal ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn makeBuffer(
|
pub fn makeBuffer(
|
||||||
@ -117,7 +152,6 @@ fn onDevice(
|
|||||||
}
|
}
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
ctx.device = device;
|
ctx.device = device;
|
||||||
std.debug.print("{?}", .{device});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
|
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
|
||||||
|
|||||||
47
src/GpuBuffer.zig
Normal file
47
src/GpuBuffer.zig
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
const std = @import("std");
|
||||||
|
const c = @import("c.zig").c;
|
||||||
|
const GpuAllocator = @import("GpuAllocator.zig");
|
||||||
|
|
||||||
|
const GpuBuffer = @This();
|
||||||
|
|
||||||
|
raw: c.WGPUBuffer,
|
||||||
|
size: u64,
|
||||||
|
usage: c.WGPUBufferUsage,
|
||||||
|
gloc: *GpuAllocator,
|
||||||
|
|
||||||
|
/// Allocates the underlying WebGPU handle and registers it to the parent GpuAllocator
|
||||||
|
pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffer {
|
||||||
|
const raw_handle = try gloc.registerBuffer(bytes, usage);
|
||||||
|
return .{
|
||||||
|
.raw = raw_handle,
|
||||||
|
.size = bytes,
|
||||||
|
.usage = usage,
|
||||||
|
.gloc = gloc,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources
|
||||||
|
pub fn deinit(self: GpuBuffer) void {
|
||||||
|
self.gloc.unregisterAndDestroyBuffer(self.raw);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Native mapAsync wrapper
|
||||||
|
pub fn mapAsync(
|
||||||
|
self: GpuBuffer,
|
||||||
|
mode: c.WGPUMapMode,
|
||||||
|
offset: u64,
|
||||||
|
size: u64,
|
||||||
|
callback_info: c.WGPUBufferMapCallbackInfo,
|
||||||
|
) void {
|
||||||
|
_ = c.wgpuBufferMapAsync(self.raw, mode, offset, size, callback_info);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Native getConstMappedRange wrapper
|
||||||
|
pub fn getConstMappedRange(self: GpuBuffer, offset: u64, size: u64) ?*const anyopaque {
|
||||||
|
return c.wgpuBufferGetConstMappedRange(self.raw, offset, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Native unmap wrapper
|
||||||
|
pub fn unmap(self: GpuBuffer) void {
|
||||||
|
c.wgpuBufferUnmap(self.raw);
|
||||||
|
}
|
||||||
100
src/Mat.zig
100
src/Mat.zig
@ -1,51 +1,49 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const c = @import("c.zig").c;
|
const c = @import("c.zig").c;
|
||||||
const GpuAllocator = @import("GpuAllocator.zig");
|
const GpuAllocator = @import("GpuAllocator.zig");
|
||||||
|
const GpuBuffer = @import("GpuBuffer.zig");
|
||||||
|
|
||||||
const Mat = @This();
|
const Mat = @This();
|
||||||
|
|
||||||
buf: c.WGPUBuffer,
|
buf: GpuBuffer,
|
||||||
rows: u32,
|
rows: usize,
|
||||||
cols: u32,
|
cols: usize,
|
||||||
|
|
||||||
// ── Lifecycle ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/// Allocate GPU buffer and upload `data`. `data.len` must equal rows*cols.
|
|
||||||
pub fn load(
|
pub fn load(
|
||||||
gloc: *GpuAllocator,
|
gloc: *GpuAllocator,
|
||||||
data: []const f32,
|
data: []const f32,
|
||||||
rows: u32,
|
rows: usize,
|
||||||
cols: u32,
|
cols: usize,
|
||||||
) !Mat {
|
) !Mat {
|
||||||
std.debug.assert(data.len == @as(usize, rows) * cols);
|
std.debug.assert(data.len == @as(usize, rows) * cols);
|
||||||
const bytes = data.len * @sizeOf(f32);
|
const bytes = data.len * @sizeOf(f32);
|
||||||
const buf = try gloc.makeBuffer(
|
|
||||||
|
// Uses structural constructor initialization
|
||||||
|
const buf = try GpuBuffer.init(
|
||||||
|
gloc,
|
||||||
bytes,
|
bytes,
|
||||||
c.WGPUBufferUsage_Storage |
|
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
||||||
c.WGPUBufferUsage_CopyDst |
|
|
||||||
c.WGPUBufferUsage_CopySrc,
|
|
||||||
);
|
);
|
||||||
c.wgpuQueueWriteBuffer(gloc.queue, buf, 0, data.ptr, bytes);
|
|
||||||
|
c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes);
|
||||||
return .{ .buf = buf, .rows = rows, .cols = cols };
|
return .{ .buf = buf, .rows = rows, .cols = cols };
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Allocate zeroed GPU buffer (no upload).
|
pub fn zeros(gloc: *GpuAllocator, rows: usize, cols: usize) !Mat {
|
||||||
pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat {
|
|
||||||
const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
|
const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
|
||||||
const buf = try gloc.makeBuffer(
|
const buf = try GpuBuffer.init(
|
||||||
|
gloc,
|
||||||
bytes,
|
bytes,
|
||||||
c.WGPUBufferUsage_Storage |
|
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
||||||
c.WGPUBufferUsage_CopyDst |
|
|
||||||
c.WGPUBufferUsage_CopySrc,
|
|
||||||
);
|
);
|
||||||
return .{ .buf = buf, .rows = rows, .cols = cols };
|
return .{ .buf = buf, .rows = rows, .cols = cols };
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn deinit(self: Mat) void {
|
pub fn deinit(self: Mat) void {
|
||||||
c.wgpuBufferRelease(self.buf);
|
self.buf.deinit(); // Automatically cleans tracking map & releases GPU memory
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(self: Mat) u32 {
|
pub fn len(self: Mat) usize {
|
||||||
return self.rows * self.cols;
|
return self.rows * self.cols;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -53,7 +51,6 @@ pub fn byteSize(self: Mat) u64 {
|
|||||||
return @as(u64, self.len()) * @sizeOf(f32);
|
return @as(u64, self.len()) * @sizeOf(f32);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Element-wise add. Shapes must match. Returns new Mat (caller owns).
|
|
||||||
pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat {
|
pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat {
|
||||||
std.debug.assert(self.rows == other.rows and self.cols == other.cols);
|
std.debug.assert(self.rows == other.rows and self.cols == other.cols);
|
||||||
|
|
||||||
@ -66,7 +63,6 @@ pub fn add(self: Mat, gloc: *GpuAllocator, other: Mat) !Mat {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Element-wise multiply by scalar. Returns new Mat (caller owns).
|
|
||||||
pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat {
|
pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat {
|
||||||
const result = try Mat.zeros(gloc, self.rows, self.cols);
|
const result = try Mat.zeros(gloc, self.rows, self.cols);
|
||||||
errdefer result.deinit();
|
errdefer result.deinit();
|
||||||
@ -74,52 +70,46 @@ pub fn scale(self: Mat, gloc: *GpuAllocator, scalar: f32) !Mat {
|
|||||||
const bytes = self.byteSize();
|
const bytes = self.byteSize();
|
||||||
const n = self.len();
|
const n = self.len();
|
||||||
|
|
||||||
// Upload scalar as uniform buffer
|
const uni_buf = try GpuBuffer.init(
|
||||||
const uni_buf = try gloc.makeBuffer(
|
gloc,
|
||||||
@sizeOf(f32),
|
@sizeOf(f32),
|
||||||
c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst,
|
c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst,
|
||||||
);
|
);
|
||||||
defer c.wgpuBufferRelease(uni_buf);
|
defer uni_buf.deinit(); // Gracefully deinitializes locally
|
||||||
c.wgpuQueueWriteBuffer(gloc.queue, uni_buf, 0, &scalar, @sizeOf(f32));
|
|
||||||
|
c.wgpuQueueWriteBuffer(gloc.queue, uni_buf.raw, 0, &scalar, @sizeOf(f32));
|
||||||
|
|
||||||
const pipeline = try gloc.pipScale();
|
const pipeline = try gloc.pipScale();
|
||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
|
||||||
|
|
||||||
const entries = [_]c.WGPUBindGroupEntry{
|
const entries = [_]c.WGPUBindGroupEntry{
|
||||||
.{ .binding = 0, .buffer = self.buf, .offset = 0, .size = bytes },
|
.{ .binding = 0, .buffer = self.buf.raw, .offset = 0, .size = bytes },
|
||||||
.{ .binding = 1, .buffer = result.buf, .offset = 0, .size = bytes },
|
.{ .binding = 1, .buffer = result.buf.raw, .offset = 0, .size = bytes },
|
||||||
.{ .binding = 2, .buffer = uni_buf, .offset = 0, .size = @sizeOf(f32) },
|
.{ .binding = 2, .buffer = uni_buf.raw, .offset = 0, .size = @sizeOf(f32) },
|
||||||
};
|
};
|
||||||
try submitPass(gloc, pipeline, &entries, n);
|
try submitPass(gloc, pipeline, &entries, n);
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read GPU buffer back to CPU. `out.len` must be >= rows*cols.
|
|
||||||
pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void {
|
pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void {
|
||||||
std.debug.assert(out.len >= self.len());
|
std.debug.assert(out.len >= self.len());
|
||||||
const bytes = self.byteSize();
|
const bytes = self.byteSize();
|
||||||
|
|
||||||
const staging = try gloc.makeBuffer(
|
const staging = try GpuBuffer.init(
|
||||||
|
gloc,
|
||||||
bytes,
|
bytes,
|
||||||
c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
|
c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
|
||||||
);
|
);
|
||||||
defer c.wgpuBufferRelease(staging);
|
defer staging.deinit();
|
||||||
|
|
||||||
// Copy result → staging
|
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder;
|
||||||
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse
|
c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
|
||||||
return error.Encoder;
|
|
||||||
c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf, 0, staging, 0, bytes);
|
|
||||||
const cmd = c.wgpuCommandEncoderFinish(enc, null);
|
const cmd = c.wgpuCommandEncoderFinish(enc, null);
|
||||||
defer c.wgpuCommandEncoderRelease(enc);
|
defer c.wgpuCommandEncoderRelease(enc);
|
||||||
defer c.wgpuCommandBufferRelease(cmd);
|
defer c.wgpuCommandBufferRelease(cmd);
|
||||||
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
||||||
|
|
||||||
// Map and copy to slice
|
|
||||||
var mapped = false;
|
var mapped = false;
|
||||||
_ = c.wgpuBufferMapAsync(
|
staging.mapAsync(
|
||||||
staging,
|
|
||||||
c.WGPUMapMode_Read,
|
c.WGPUMapMode_Read,
|
||||||
0,
|
0,
|
||||||
bytes,
|
bytes,
|
||||||
@ -128,10 +118,10 @@ pub fn read(self: Mat, gloc: *GpuAllocator, out: []f32) !void {
|
|||||||
while (!mapped) gloc.poll();
|
while (!mapped) gloc.poll();
|
||||||
|
|
||||||
const ptr: [*]const f32 = @ptrCast(@alignCast(
|
const ptr: [*]const f32 = @ptrCast(@alignCast(
|
||||||
c.wgpuBufferGetConstMappedRange(staging, 0, bytes),
|
staging.getConstMappedRange(0, bytes),
|
||||||
));
|
));
|
||||||
@memcpy(out[0..self.len()], ptr[0..self.len()]);
|
@memcpy(out[0..self.len()], ptr[0..self.len()]);
|
||||||
c.wgpuBufferUnmap(staging);
|
staging.unmap();
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onMapped(
|
fn onMapped(
|
||||||
@ -150,19 +140,19 @@ fn onMapped(
|
|||||||
fn dispatch2in1out(
|
fn dispatch2in1out(
|
||||||
gloc: *GpuAllocator,
|
gloc: *GpuAllocator,
|
||||||
pipeline: c.WGPUComputePipeline,
|
pipeline: c.WGPUComputePipeline,
|
||||||
buf_a: c.WGPUBuffer,
|
buf_a: GpuBuffer,
|
||||||
buf_b: c.WGPUBuffer,
|
buf_b: GpuBuffer,
|
||||||
buf_out: c.WGPUBuffer,
|
buf_out: GpuBuffer,
|
||||||
bytes: u64,
|
bytes: u64,
|
||||||
n: u32,
|
n: usize,
|
||||||
) !void {
|
) !void {
|
||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
|
|
||||||
const entries = [_]c.WGPUBindGroupEntry{
|
const entries = [_]c.WGPUBindGroupEntry{
|
||||||
.{ .binding = 0, .buffer = buf_a, .offset = 0, .size = bytes },
|
.{ .binding = 0, .buffer = buf_a.raw, .offset = 0, .size = bytes },
|
||||||
.{ .binding = 1, .buffer = buf_b, .offset = 0, .size = bytes },
|
.{ .binding = 1, .buffer = buf_b.raw, .offset = 0, .size = bytes },
|
||||||
.{ .binding = 2, .buffer = buf_out, .offset = 0, .size = bytes },
|
.{ .binding = 2, .buffer = buf_out.raw, .offset = 0, .size = bytes },
|
||||||
};
|
};
|
||||||
try submitPass(gloc, pipeline, &entries, n);
|
try submitPass(gloc, pipeline, &entries, n);
|
||||||
}
|
}
|
||||||
@ -172,7 +162,7 @@ fn submitPass(
|
|||||||
gloc: *GpuAllocator,
|
gloc: *GpuAllocator,
|
||||||
pipeline: c.WGPUComputePipeline,
|
pipeline: c.WGPUComputePipeline,
|
||||||
entries: []const c.WGPUBindGroupEntry,
|
entries: []const c.WGPUBindGroupEntry,
|
||||||
n: u32,
|
n: usize,
|
||||||
) !void {
|
) !void {
|
||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
@ -189,7 +179,7 @@ fn submitPass(
|
|||||||
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
||||||
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
||||||
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
|
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
|
||||||
c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1);
|
c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(ceilDiv(n, 256)), 1, 1);
|
||||||
c.wgpuComputePassEncoderEnd(pass);
|
c.wgpuComputePassEncoderEnd(pass);
|
||||||
c.wgpuComputePassEncoderRelease(pass);
|
c.wgpuComputePassEncoderRelease(pass);
|
||||||
|
|
||||||
@ -199,6 +189,6 @@ fn submitPass(
|
|||||||
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ceilDiv(n: u32, d: u32) u32 {
|
fn ceilDiv(n: usize, d: usize) usize {
|
||||||
return (n + d - 1) / d;
|
return (n + d - 1) / d;
|
||||||
}
|
}
|
||||||
|
|||||||
90
src/main.zig
90
src/main.zig
@ -2,43 +2,65 @@ const std = @import("std");
|
|||||||
const GpuAllocator = @import("GpuAllocator.zig");
|
const GpuAllocator = @import("GpuAllocator.zig");
|
||||||
const Mat = @import("Mat.zig");
|
const Mat = @import("Mat.zig");
|
||||||
|
|
||||||
pub fn main() !void {
|
pub fn main(init: std.process.Init) !void {
|
||||||
var gloc = try GpuAllocator.init();
|
var gloc = try GpuAllocator.init(init.gpa);
|
||||||
defer gloc.deinit();
|
defer gloc.deinit();
|
||||||
|
|
||||||
// Input data: a[i] = i, b[i] = 15 - i → add should give all 15s
|
// Define the sizes you want to benchmark
|
||||||
var data_a: [16]f32 = undefined;
|
const sizes = [_]usize{ 1, 1024, 4096, 16384, 65536, 262144, 1024 * 1024, 4 * 1024 * 1024 };
|
||||||
var data_b: [16]f32 = undefined;
|
|
||||||
for (0..16) |i| {
|
// Print table header
|
||||||
data_a[i] = @floatFromInt(i);
|
std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
|
||||||
data_b[i] = @floatFromInt(15 - i);
|
std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
|
||||||
|
|
||||||
|
const allocator = init.gpa;
|
||||||
|
|
||||||
|
for (sizes) |size| {
|
||||||
|
// Dynamically allocate buffers for the current size
|
||||||
|
var data_a = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(data_a);
|
||||||
|
var data_b = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(data_b);
|
||||||
|
|
||||||
|
// Populate data
|
||||||
|
for (0..size) |i| {
|
||||||
|
data_a[i] = @floatFromInt(i);
|
||||||
|
data_b[i] = @floatFromInt(size - 1 - i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start timing the GPU operations
|
||||||
|
const start = std.Io.Clock.awake.now(init.io);
|
||||||
|
|
||||||
|
const a = try Mat.load(&gloc, data_a, size, 1);
|
||||||
|
defer a.deinit();
|
||||||
|
const b = try Mat.load(&gloc, data_b, size, 1);
|
||||||
|
defer b.deinit();
|
||||||
|
|
||||||
|
// a + b
|
||||||
|
const sum = try a.add(&gloc, b);
|
||||||
|
defer sum.deinit();
|
||||||
|
|
||||||
|
// sum * 2
|
||||||
|
const scaled = try sum.scale(&gloc, 2.0);
|
||||||
|
defer scaled.deinit();
|
||||||
|
|
||||||
|
// Read back (allocating dynamically for read-back buffers too)
|
||||||
|
const out_sum = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(out_sum);
|
||||||
|
const out_scaled = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(out_scaled);
|
||||||
|
|
||||||
|
try sum.read(&gloc, out_sum);
|
||||||
|
try scaled.read(&gloc, out_scaled);
|
||||||
|
|
||||||
|
const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
|
||||||
|
const ns = duration.toNanoseconds();
|
||||||
|
const ms = @as(f64, @floatFromInt(ns)) / 1_000_000.0;
|
||||||
|
const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
|
||||||
|
|
||||||
|
// Print table row
|
||||||
|
std.debug.print("| {d:12} | {d:8.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
|
||||||
}
|
}
|
||||||
|
|
||||||
const a = try Mat.load(&gloc, &data_a, 4, 4);
|
|
||||||
defer a.deinit();
|
|
||||||
const b = try Mat.load(&gloc, &data_b, 4, 4);
|
|
||||||
defer b.deinit();
|
|
||||||
|
|
||||||
// a + b
|
|
||||||
const sum = try a.add(&gloc, b);
|
|
||||||
defer sum.deinit();
|
|
||||||
|
|
||||||
// sum * 2
|
|
||||||
const scaled = try sum.scale(&gloc, 2.0);
|
|
||||||
defer scaled.deinit();
|
|
||||||
|
|
||||||
// Read back
|
|
||||||
var out_sum: [16]f32 = undefined;
|
|
||||||
var out_scaled: [16]f32 = undefined;
|
|
||||||
try sum.read(&gloc, &out_sum);
|
|
||||||
try scaled.read(&gloc, &out_scaled);
|
|
||||||
|
|
||||||
// Print
|
|
||||||
std.debug.print("\na + b (expect all 15):\n", .{});
|
|
||||||
printMat(&out_sum, 4, 4);
|
|
||||||
|
|
||||||
std.debug.print("\n(a + b) * 2 (expect all 30):\n", .{});
|
|
||||||
printMat(&out_scaled, 4, 4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn printMat(data: []const f32, rows: u32, cols: u32) void {
|
fn printMat(data: []const f32, rows: u32, cols: u32) void {
|
||||||
|
|||||||
@ -65,7 +65,6 @@ fn onDevice(
|
|||||||
}
|
}
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
ctx.device = device;
|
ctx.device = device;
|
||||||
std.debug.print("{?}", .{device});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onMapped(
|
fn onMapped(
|
||||||
|
|||||||
@ -3,7 +3,7 @@ pub const SHADER_ADD =
|
|||||||
\\@group(0) @binding(1) var<storage, read> b : array<f32>;
|
\\@group(0) @binding(1) var<storage, read> b : array<f32>;
|
||||||
\\@group(0) @binding(2) var<storage, read_write> out : array<f32>;
|
\\@group(0) @binding(2) var<storage, read_write> out : array<f32>;
|
||||||
\\
|
\\
|
||||||
\\@compute @workgroup_size(64)
|
\\@compute @workgroup_size(256)
|
||||||
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
||||||
\\ let i = gid.x;
|
\\ let i = gid.x;
|
||||||
\\ if (i < arrayLength(&out)) {
|
\\ if (i < arrayLength(&out)) {
|
||||||
@ -18,7 +18,7 @@ pub const SHADER_SCALE =
|
|||||||
\\@group(0) @binding(1) var<storage, read_write> out : array<f32>;
|
\\@group(0) @binding(1) var<storage, read_write> out : array<f32>;
|
||||||
\\@group(0) @binding(2) var<uniform> u : Uniforms;
|
\\@group(0) @binding(2) var<uniform> u : Uniforms;
|
||||||
\\
|
\\
|
||||||
\\@compute @workgroup_size(64)
|
\\@compute @workgroup_size(256)
|
||||||
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
||||||
\\ let i = gid.x;
|
\\ let i = gid.x;
|
||||||
\\ if (i < arrayLength(&out)) {
|
\\ if (i < arrayLength(&out)) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user