Split GpuAllocator into GpuAllocator and GpuDevice

This commit is contained in:
adrien 2026-05-17 20:55:49 +02:00
parent d57968d6df
commit cef6155f41
4 changed files with 131 additions and 123 deletions

View File

@ -1,69 +1,24 @@
const std = @import("std"); const std = @import("std");
const sh = @import("shaders.zig"); const sh = @import("shaders.zig");
const GpuDevice = @import("GpuDevice.zig");
const c = @import("c.zig").c; const c = @import("c.zig").c;
const GpuAllocator = @This(); const GpuAllocator = @This();
device: GpuDevice,
cpu_allocator: std.mem.Allocator, cpu_allocator: std.mem.Allocator,
instance: c.WGPUInstance,
adapter: c.WGPUAdapter,
device: c.WGPUDevice,
queue: c.WGPUQueue,
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void), tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
pipelines: struct { pipelines: struct {
add: c.WGPUComputePipeline, add: c.WGPUComputePipeline,
}, },
pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator { pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
const instance = c.wgpuCreateInstance(
&std.mem.zeroes(c.WGPUInstanceDescriptor),
) orelse return error.NoInstance;
errdefer c.wgpuInstanceRelease(instance);
var ctx = Ctx{};
_ = c.wgpuInstanceRequestAdapter(
instance,
&.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
.{ .callback = onAdapter, .userdata1 = &ctx },
);
c.wgpuInstanceProcessEvents(instance);
const adapter = ctx.adapter orelse return error.NoAdapter;
errdefer c.wgpuAdapterRelease(adapter);
// --- QUERY HARDWARE LIMITS ---
var supported_limits = std.mem.zeroes(c.WGPULimits);
supported_limits.nextInChain = null;
// Fetch what your physical graphic card can actually handle
if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
const device_descriptor = c.WGPUDeviceDescriptor{
.nextInChain = null,
.label = sv("TensorCompilerDevice"),
.requiredFeatureCount = 0,
.requiredFeatures = null,
.requiredLimits = &supported_limits,
};
_ = c.wgpuAdapterRequestDevice(
adapter,
&device_descriptor,
.{ .callback = onDevice, .userdata1 = &ctx },
);
c.wgpuInstanceProcessEvents(instance);
const device = ctx.device orelse return error.NoDevice;
return .{ return .{
.cpu_allocator = cpu_allocator,
.instance = instance,
.adapter = adapter,
.device = device, .device = device,
.queue = c.wgpuDeviceGetQueue(device), .cpu_allocator = cpu_allocator,
.tracked_buffers = .init(cpu_allocator), .tracked_buffers = .init(cpu_allocator),
.pipelines = .{ .pipelines = .{
.add = try buildPipeline(device, sh.SHADER_ADD), .add = try buildPipeline(device.device, sh.SHADER_ADD),
}, },
}; };
} }
@ -79,11 +34,6 @@ pub fn deinit(self: *GpuAllocator) void {
c.wgpuBufferRelease(buf); c.wgpuBufferRelease(buf);
} }
self.tracked_buffers.deinit(); self.tracked_buffers.deinit();
c.wgpuQueueRelease(self.queue);
c.wgpuDeviceRelease(self.device);
c.wgpuAdapterRelease(self.adapter);
c.wgpuInstanceRelease(self.instance);
} }
pub fn registerBuffer( pub fn registerBuffer(
@ -91,7 +41,7 @@ pub fn registerBuffer(
bytes: u64, bytes: u64,
usage: c.WGPUBufferUsage, usage: c.WGPUBufferUsage,
) !c.WGPUBuffer { ) !c.WGPUBuffer {
const buf = c.wgpuDeviceCreateBuffer(self.device, &.{ const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
.usage = usage, .usage = usage,
.size = bytes, .size = bytes,
}) orelse return error.BufferAlloc; }) orelse return error.BufferAlloc;
@ -107,59 +57,6 @@ pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
} }
} }
// Internal
pub fn makeBuffer(
self: *GpuAllocator,
bytes: u64,
usage: c.WGPUBufferUsage,
) !c.WGPUBuffer {
return c.wgpuDeviceCreateBuffer(self.device, &.{
.usage = usage,
.size = bytes,
}) orelse error.BufferAlloc;
}
/// Poll until GPU work completes. Use after submit if you need CPU sync.
pub fn poll(self: *GpuAllocator) void {
_ = c.wgpuDevicePoll(self.device, 1, null);
}
const Ctx = struct {
adapter: c.WGPUAdapter = null,
device: c.WGPUDevice = null,
};
fn onAdapter(
status: c.WGPURequestAdapterStatus,
adapter: c.WGPUAdapter,
_: c.WGPUStringView,
userdata1: ?*anyopaque,
_: ?*anyopaque,
) callconv(.c) void {
if (status != c.WGPURequestAdapterStatus_Success) {
std.log.err("Adapter request failed (status={d})", .{status});
return;
}
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
ctx.adapter = adapter;
}
fn onDevice(
status: c.WGPURequestDeviceStatus,
device: c.WGPUDevice,
_: c.WGPUStringView,
userdata1: ?*anyopaque,
_: ?*anyopaque,
) callconv(.c) void {
if (status != c.WGPURequestDeviceStatus_Success) {
std.log.err("Device request failed (status={d})", .{status});
return;
}
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
ctx.device = device;
}
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline { fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
var wgsl_src = c.WGPUShaderSourceWGSL{ var wgsl_src = c.WGPUShaderSourceWGSL{
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },

107
src/GpuDevice.zig Normal file
View File

@ -0,0 +1,107 @@
const std = @import("std");
const sh = @import("shaders.zig");
const c = @import("c.zig").c;
/// Scratch state shared between `init` and the adapter/device request callbacks.
/// `init` passes a pointer to a `Ctx` as `userdata1`; each callback stores the
/// handle it receives on success, and `init` reads it back afterwards.
const Ctx = struct {
// Written by `onAdapter` on success; stays null otherwise.
adapter: c.WGPUAdapter = null,
// Written by `onDevice` on success; stays null otherwise.
device: c.WGPUDevice = null,
};
// Alias for this file's own struct type; used as the receiver/return type below.
// NOTE(review): the alias is called `GpuAllocator` although this file is
// GpuDevice.zig — it looks like a leftover from the split; consider renaming
// it to `GpuDevice` (together with every signature in this file that uses it).
const GpuAllocator = @This();
/// WebGPU instance handle created in `init`.
instance: c.WGPUInstance,
/// Adapter handle delivered to `init` through the `onAdapter` callback.
adapter: c.WGPUAdapter,
/// Device handle delivered to `init` through the `onDevice` callback.
device: c.WGPUDevice,
/// Queue obtained from `device` via `wgpuDeviceGetQueue` in `init`.
queue: c.WGPUQueue,
/// Creates a WebGPU instance, requests an adapter with the high-performance
/// power preference, and opens a device on it using the limits the adapter
/// itself reports.
///
/// Errors:
/// - `error.NoInstance`: instance creation returned null.
/// - `error.NoAdapter`: the adapter callback did not deliver an adapter.
/// - `error.FailedToGetAdapterLimits`: the limits query did not report success.
/// - `error.NoDevice`: the device callback did not deliver a device.
///
/// Handles acquired before a failure are released on the error path.
/// On success the caller releases everything with `deinit`.
pub fn init() !GpuAllocator {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
    errdefer c.wgpuInstanceRelease(instance);

    // The request callbacks write their results into `ctx` via `userdata1`.
    var ctx = Ctx{};
    _ = c.wgpuInstanceRequestAdapter(
        instance,
        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
        .{ .callback = onAdapter, .userdata1 = &ctx },
    );
    // Process pending instance events before reading the callback's result.
    c.wgpuInstanceProcessEvents(instance);
    const adapter = ctx.adapter orelse return error.NoAdapter;
    errdefer c.wgpuAdapterRelease(adapter);

    // --- QUERY HARDWARE LIMITS ---
    // Zero-initialised, so `nextInChain` is already null.
    var supported_limits = std.mem.zeroes(c.WGPULimits);
    // Fetch what the physical graphics card can actually handle.
    // Compare against the named status constant rather than a bare `1`,
    // matching how the request callbacks check their status values.
    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != c.WGPUStatus_Success) {
        return error.FailedToGetAdapterLimits;
    }

    const device_descriptor = c.WGPUDeviceDescriptor{
        .nextInChain = null,
        .label = sv("TensorCompilerDevice"),
        .requiredFeatureCount = 0,
        .requiredFeatures = null,
        // Ask for everything the adapter reported as supported.
        .requiredLimits = &supported_limits,
    };
    _ = c.wgpuAdapterRequestDevice(
        adapter,
        &device_descriptor,
        .{ .callback = onDevice, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const device = ctx.device orelse return error.NoDevice;

    return .{
        .instance = instance,
        .adapter = adapter,
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
    };
}
/// Releases the handles acquired by `init`, in the order queue, device,
/// adapter, instance — the reverse of the order in which `init` obtained them.
pub fn deinit(self: GpuAllocator) void {
c.wgpuQueueRelease(self.queue);
c.wgpuDeviceRelease(self.device);
c.wgpuAdapterRelease(self.adapter);
c.wgpuInstanceRelease(self.instance);
}
/// Poll until GPU work completes. Use after submit if you need CPU sync.
///
/// Calls `wgpuDevicePoll` on `self.device` with the second argument set to 1
/// and no submission index, and discards the call's return value.
///
/// The receiver is a const pointer because only the `device` handle is read;
/// this lets `poll` be called on a `const` value as well as through a mutable
/// pointer.
pub fn poll(self: *const GpuAllocator) void {
    _ = c.wgpuDevicePoll(self.device, 1, null);
}
/// C callback for `wgpuInstanceRequestAdapter`.
///
/// On success, stores `adapter` into the `Ctx` that `userdata1` points to.
/// On any other status, logs the status code and leaves the `Ctx` untouched.
/// The message string view and the second userdata pointer are ignored.
fn onAdapter(
    status: c.WGPURequestAdapterStatus,
    adapter: c.WGPUAdapter,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
) callconv(.c) void {
    if (status == c.WGPURequestAdapterStatus_Success) {
        // `userdata1` is the `*Ctx` supplied when the request was issued.
        const request_ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
        request_ctx.adapter = adapter;
    } else {
        std.log.err("Adapter request failed (status={d})", .{status});
    }
}
/// C callback for `wgpuAdapterRequestDevice`.
///
/// On success, stores `device` into the `Ctx` that `userdata1` points to.
/// On any other status, logs the status code and leaves the `Ctx` untouched.
/// The message string view and the second userdata pointer are ignored.
fn onDevice(
    status: c.WGPURequestDeviceStatus,
    device: c.WGPUDevice,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
) callconv(.c) void {
    if (status == c.WGPURequestDeviceStatus_Success) {
        // `userdata1` is the `*Ctx` supplied when the request was issued.
        const request_ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
        request_ctx.device = device;
    } else {
        std.log.err("Device request failed (status={d})", .{status});
    }
}
/// Wraps a Zig byte slice as a `WGPUStringView` (pointer plus length).
/// Nothing is copied: the view refers to the same bytes as `s`.
fn sv(s: []const u8) c.WGPUStringView {
    const view: c.WGPUStringView = .{
        .data = s.ptr,
        .length = s.len,
    };
    return view;
}

View File

@ -26,7 +26,7 @@ pub fn load(
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc, c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
); );
c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes); c.wgpuQueueWriteBuffer(gloc.device.queue, buf.raw, 0, data.ptr, bytes);
return .{ .buf = buf, .rows = rows, .cols = cols }; return .{ .buf = buf, .rows = rows, .cols = cols };
} }
@ -74,12 +74,12 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
); );
defer staging.deinit(); defer staging.deinit();
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder; const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes); c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
const cmd = c.wgpuCommandEncoderFinish(enc, null); const cmd = c.wgpuCommandEncoderFinish(enc, null);
defer c.wgpuCommandEncoderRelease(enc); defer c.wgpuCommandEncoderRelease(enc);
defer c.wgpuCommandBufferRelease(cmd); defer c.wgpuCommandBufferRelease(cmd);
c.wgpuQueueSubmit(gloc.queue, 1, &cmd); c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
var mapped = false; var mapped = false;
staging.mapAsync( staging.mapAsync(
@ -88,7 +88,7 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
bytes, bytes,
.{ .callback = onMapped, .userdata1 = &mapped }, .{ .callback = onMapped, .userdata1 = &mapped },
); );
while (!mapped) gloc.poll(); while (!mapped) gloc.device.poll();
const ptr: [*]const f32 = @ptrCast(@alignCast( const ptr: [*]const f32 = @ptrCast(@alignCast(
staging.getConstMappedRange(0, bytes), staging.getConstMappedRange(0, bytes),
@ -137,7 +137,7 @@ fn dispatch2in1out(
defer info_buf.deinit(); defer info_buf.deinit();
// Write the number of elements *in this chunk* to the uniform buffer // Write the number of elements *in this chunk* to the uniform buffer
c.wgpuQueueWriteBuffer(gloc.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32)); c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
// Bind only the sub-slice for this chunk using `.offset` and `.size` // Bind only the sub-slice for this chunk using `.offset` and `.size`
const entries = [_]c.WGPUBindGroupEntry{ const entries = [_]c.WGPUBindGroupEntry{
@ -164,14 +164,14 @@ fn submitPass(
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
defer c.wgpuBindGroupLayoutRelease(bgl); defer c.wgpuBindGroupLayoutRelease(bgl);
const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{ const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
.layout = bgl, .layout = bgl,
.entries = entries.ptr, .entries = entries.ptr,
.entryCount = entries.len, .entryCount = entries.len,
}) orelse return error.BindGroup; }) orelse return error.BindGroup;
defer c.wgpuBindGroupRelease(bg); defer c.wgpuBindGroupRelease(bg);
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
return error.Encoder; return error.Encoder;
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null); const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
c.wgpuComputePassEncoderSetPipeline(pass, pipeline); c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
@ -190,7 +190,7 @@ fn submitPass(
const cmd = c.wgpuCommandEncoderFinish(enc, null); const cmd = c.wgpuCommandEncoderFinish(enc, null);
defer c.wgpuCommandEncoderRelease(enc); defer c.wgpuCommandEncoderRelease(enc);
defer c.wgpuCommandBufferRelease(cmd); defer c.wgpuCommandBufferRelease(cmd);
c.wgpuQueueSubmit(gloc.queue, 1, &cmd); c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
} }
fn ceilDiv(n: usize, d: usize) usize { fn ceilDiv(n: usize, d: usize) usize {

View File

@ -1,9 +1,13 @@
const std = @import("std"); const std = @import("std");
const GpuDevice = @import("GpuDevice.zig");
const GpuAllocator = @import("GpuAllocator.zig"); const GpuAllocator = @import("GpuAllocator.zig");
const Mat = @import("Mat.zig"); const Mat = @import("Mat.zig");
pub fn main(init: std.process.Init) !void { pub fn main(init: std.process.Init) !void {
var gloc = try GpuAllocator.init(init.gpa); const device = try GpuDevice.init();
defer device.deinit();
var gloc = try GpuAllocator.init(init.gpa, device);
defer gloc.deinit(); defer gloc.deinit();
// Define the sizes you want to benchmark // Define the sizes you want to benchmark
@ -15,11 +19,11 @@ pub fn main(init: std.process.Init) !void {
65536, 65536,
262144, 262144,
1024 * 1024, 1024 * 1024,
4 * 1024 * 1024, // 4 * 1024 * 1024,
4 * 4 * 1024 * 1024, // 4 * 4 * 1024 * 1024,
4 * 4 * 4 * 1024 * 1024, // 4 * 4 * 4 * 1024 * 1024,
4 * 4 * 4 * 4 * 1024 * 1024, // 4 * 4 * 4 * 4 * 1024 * 1024,
4 * 4 * 4 * 4 * 2 * 1024 * 1024, // 4 * 4 * 4 * 4 * 2 * 1024 * 1024,
}; };
// Print table header // Print table header