Split GpuAllocator into GpuAllocator and GpuDevice
This commit is contained in:
parent
d57968d6df
commit
cef6155f41
@ -1,69 +1,24 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const sh = @import("shaders.zig");
|
const sh = @import("shaders.zig");
|
||||||
|
const GpuDevice = @import("GpuDevice.zig");
|
||||||
const c = @import("c.zig").c;
|
const c = @import("c.zig").c;
|
||||||
|
|
||||||
const GpuAllocator = @This();
|
const GpuAllocator = @This();
|
||||||
|
|
||||||
|
device: GpuDevice,
|
||||||
cpu_allocator: std.mem.Allocator,
|
cpu_allocator: std.mem.Allocator,
|
||||||
instance: c.WGPUInstance,
|
|
||||||
adapter: c.WGPUAdapter,
|
|
||||||
device: c.WGPUDevice,
|
|
||||||
queue: c.WGPUQueue,
|
|
||||||
|
|
||||||
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
|
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
|
||||||
|
|
||||||
pipelines: struct {
|
pipelines: struct {
|
||||||
add: c.WGPUComputePipeline,
|
add: c.WGPUComputePipeline,
|
||||||
},
|
},
|
||||||
|
|
||||||
pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator {
|
pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
|
||||||
const instance = c.wgpuCreateInstance(
|
|
||||||
&std.mem.zeroes(c.WGPUInstanceDescriptor),
|
|
||||||
) orelse return error.NoInstance;
|
|
||||||
errdefer c.wgpuInstanceRelease(instance);
|
|
||||||
|
|
||||||
var ctx = Ctx{};
|
|
||||||
_ = c.wgpuInstanceRequestAdapter(
|
|
||||||
instance,
|
|
||||||
&.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
|
|
||||||
.{ .callback = onAdapter, .userdata1 = &ctx },
|
|
||||||
);
|
|
||||||
c.wgpuInstanceProcessEvents(instance);
|
|
||||||
const adapter = ctx.adapter orelse return error.NoAdapter;
|
|
||||||
errdefer c.wgpuAdapterRelease(adapter);
|
|
||||||
|
|
||||||
// --- QUERY HARDWARE LIMITS ---
|
|
||||||
var supported_limits = std.mem.zeroes(c.WGPULimits);
|
|
||||||
supported_limits.nextInChain = null;
|
|
||||||
|
|
||||||
// Fetch what your physical graphic card can actually handle
|
|
||||||
if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
|
|
||||||
|
|
||||||
const device_descriptor = c.WGPUDeviceDescriptor{
|
|
||||||
.nextInChain = null,
|
|
||||||
.label = sv("TensorCompilerDevice"),
|
|
||||||
.requiredFeatureCount = 0,
|
|
||||||
.requiredFeatures = null,
|
|
||||||
.requiredLimits = &supported_limits,
|
|
||||||
};
|
|
||||||
|
|
||||||
_ = c.wgpuAdapterRequestDevice(
|
|
||||||
adapter,
|
|
||||||
&device_descriptor,
|
|
||||||
.{ .callback = onDevice, .userdata1 = &ctx },
|
|
||||||
);
|
|
||||||
c.wgpuInstanceProcessEvents(instance);
|
|
||||||
const device = ctx.device orelse return error.NoDevice;
|
|
||||||
|
|
||||||
return .{
|
return .{
|
||||||
.cpu_allocator = cpu_allocator,
|
|
||||||
.instance = instance,
|
|
||||||
.adapter = adapter,
|
|
||||||
.device = device,
|
.device = device,
|
||||||
.queue = c.wgpuDeviceGetQueue(device),
|
.cpu_allocator = cpu_allocator,
|
||||||
.tracked_buffers = .init(cpu_allocator),
|
.tracked_buffers = .init(cpu_allocator),
|
||||||
.pipelines = .{
|
.pipelines = .{
|
||||||
.add = try buildPipeline(device, sh.SHADER_ADD),
|
.add = try buildPipeline(device.device, sh.SHADER_ADD),
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -79,11 +34,6 @@ pub fn deinit(self: *GpuAllocator) void {
|
|||||||
c.wgpuBufferRelease(buf);
|
c.wgpuBufferRelease(buf);
|
||||||
}
|
}
|
||||||
self.tracked_buffers.deinit();
|
self.tracked_buffers.deinit();
|
||||||
|
|
||||||
c.wgpuQueueRelease(self.queue);
|
|
||||||
c.wgpuDeviceRelease(self.device);
|
|
||||||
c.wgpuAdapterRelease(self.adapter);
|
|
||||||
c.wgpuInstanceRelease(self.instance);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn registerBuffer(
|
pub fn registerBuffer(
|
||||||
@ -91,7 +41,7 @@ pub fn registerBuffer(
|
|||||||
bytes: u64,
|
bytes: u64,
|
||||||
usage: c.WGPUBufferUsage,
|
usage: c.WGPUBufferUsage,
|
||||||
) !c.WGPUBuffer {
|
) !c.WGPUBuffer {
|
||||||
const buf = c.wgpuDeviceCreateBuffer(self.device, &.{
|
const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
|
||||||
.usage = usage,
|
.usage = usage,
|
||||||
.size = bytes,
|
.size = bytes,
|
||||||
}) orelse return error.BufferAlloc;
|
}) orelse return error.BufferAlloc;
|
||||||
@ -107,59 +57,6 @@ pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Internal ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
pub fn makeBuffer(
|
|
||||||
self: *GpuAllocator,
|
|
||||||
bytes: u64,
|
|
||||||
usage: c.WGPUBufferUsage,
|
|
||||||
) !c.WGPUBuffer {
|
|
||||||
return c.wgpuDeviceCreateBuffer(self.device, &.{
|
|
||||||
.usage = usage,
|
|
||||||
.size = bytes,
|
|
||||||
}) orelse error.BufferAlloc;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Poll until GPU work completes. Use after submit if you need CPU sync.
|
|
||||||
pub fn poll(self: *GpuAllocator) void {
|
|
||||||
_ = c.wgpuDevicePoll(self.device, 1, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
const Ctx = struct {
|
|
||||||
adapter: c.WGPUAdapter = null,
|
|
||||||
device: c.WGPUDevice = null,
|
|
||||||
};
|
|
||||||
|
|
||||||
fn onAdapter(
|
|
||||||
status: c.WGPURequestAdapterStatus,
|
|
||||||
adapter: c.WGPUAdapter,
|
|
||||||
_: c.WGPUStringView,
|
|
||||||
userdata1: ?*anyopaque,
|
|
||||||
_: ?*anyopaque,
|
|
||||||
) callconv(.c) void {
|
|
||||||
if (status != c.WGPURequestAdapterStatus_Success) {
|
|
||||||
std.log.err("Adapter request failed (status={d})", .{status});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
|
||||||
ctx.adapter = adapter;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn onDevice(
|
|
||||||
status: c.WGPURequestDeviceStatus,
|
|
||||||
device: c.WGPUDevice,
|
|
||||||
_: c.WGPUStringView,
|
|
||||||
userdata1: ?*anyopaque,
|
|
||||||
_: ?*anyopaque,
|
|
||||||
) callconv(.c) void {
|
|
||||||
if (status != c.WGPURequestDeviceStatus_Success) {
|
|
||||||
std.log.err("Device request failed (status={d})", .{status});
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
|
||||||
ctx.device = device;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
|
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
|
||||||
var wgsl_src = c.WGPUShaderSourceWGSL{
|
var wgsl_src = c.WGPUShaderSourceWGSL{
|
||||||
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
|
.chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
|
||||||
|
|||||||
107
src/GpuDevice.zig
Normal file
107
src/GpuDevice.zig
Normal file
@ -0,0 +1,107 @@
|
|||||||
|
const std = @import("std");
|
||||||
|
const sh = @import("shaders.zig");
|
||||||
|
const c = @import("c.zig").c;
|
||||||
|
|
||||||
|
/// Scratch state shared with the adapter/device request callbacks.
/// A pointer to it is passed as `userdata1`; each callback fills in its
/// field on success and leaves it `null` otherwise.
const Ctx = struct {
    /// Set by `onAdapter` when the adapter request succeeds.
    adapter: c.WGPUAdapter = null,
    /// Set by `onDevice` when the device request succeeds.
    device: c.WGPUDevice = null,
};
|
||||||
|
|
||||||
|
const GpuDevice = @This();

/// WebGPU instance the adapter was requested from.
instance: c.WGPUInstance,
/// Adapter requested with the high-performance power preference.
adapter: c.WGPUAdapter,
/// Logical device requested from `adapter`.
device: c.WGPUDevice,
/// Queue obtained from `device` via `wgpuDeviceGetQueue`.
queue: c.WGPUQueue,

/// Creates the instance, requests an adapter and a device, and fetches the
/// device queue.
///
/// The device is requested with the limits reported by the adapter itself,
/// so it is not restricted to the WebGPU defaults.
///
/// Errors:
/// - `error.NoInstance` if the instance could not be created.
/// - `error.NoAdapter` if no adapter was delivered to the callback.
/// - `error.FailedToGetAdapterLimits` if the limits query did not succeed.
/// - `error.NoDevice` if no device was delivered to the callback.
///
/// On failure, handles acquired so far are released. On success the caller
/// owns the returned value and must call `deinit`.
pub fn init() !GpuDevice {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
    errdefer c.wgpuInstanceRelease(instance);

    var ctx = Ctx{};
    _ = c.wgpuInstanceRequestAdapter(
        instance,
        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
        .{ .callback = onAdapter, .userdata1 = &ctx },
    );
    // NOTE(review): this relies on the callback having fired by the time
    // event processing returns; otherwise `ctx.adapter` is still null.
    c.wgpuInstanceProcessEvents(instance);
    const adapter = ctx.adapter orelse return error.NoAdapter;
    errdefer c.wgpuAdapterRelease(adapter);

    // --- QUERY HARDWARE LIMITS ---
    // `std.mem.zeroes` already leaves `nextInChain` null, so no explicit
    // reset is needed before the query.
    var supported_limits = std.mem.zeroes(c.WGPULimits);

    // Fetch what the physical adapter can actually handle.
    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != c.WGPUStatus_Success) {
        return error.FailedToGetAdapterLimits;
    }

    const device_descriptor = c.WGPUDeviceDescriptor{
        .nextInChain = null,
        .label = sv("TensorCompilerDevice"),
        .requiredFeatureCount = 0,
        .requiredFeatures = null,
        .requiredLimits = &supported_limits,
    };

    _ = c.wgpuAdapterRequestDevice(
        adapter,
        &device_descriptor,
        .{ .callback = onDevice, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const device = ctx.device orelse return error.NoDevice;

    return .{
        .instance = instance,
        .adapter = adapter,
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
    };
}

/// Releases the queue, device, adapter and instance, in that order
/// (the reverse of acquisition).
pub fn deinit(self: GpuDevice) void {
    c.wgpuQueueRelease(self.queue);
    c.wgpuDeviceRelease(self.device);
    c.wgpuAdapterRelease(self.adapter);
    c.wgpuInstanceRelease(self.instance);
}

/// Polls the device once through `wgpuDevicePoll`, passing `1` as the second
/// argument (presumably the blocking "wait" flag; confirm against wgpu.h).
/// The call's return value is discarded.
pub fn poll(self: *const GpuDevice) void {
    _ = c.wgpuDevicePoll(self.device, 1, null);
}
|
||||||
|
|
||||||
|
/// Adapter-request callback. `userdata1` must point at a `Ctx`.
/// On success the adapter is stored in that `Ctx`; on any other status the
/// failure is logged and the `Ctx` is left untouched.
fn onAdapter(
    status: c.WGPURequestAdapterStatus,
    adapter: c.WGPUAdapter,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
) callconv(.c) void {
    if (status == c.WGPURequestAdapterStatus_Success) {
        const request_state: *Ctx = @ptrCast(@alignCast(userdata1.?));
        request_state.adapter = adapter;
    } else {
        std.log.err("Adapter request failed (status={d})", .{status});
    }
}
|
||||||
|
|
||||||
|
/// Device-request callback. `userdata1` must point at a `Ctx`.
/// On success the device is stored in that `Ctx`; on any other status the
/// failure is logged and the `Ctx` is left untouched.
fn onDevice(
    status: c.WGPURequestDeviceStatus,
    device: c.WGPUDevice,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
) callconv(.c) void {
    if (status == c.WGPURequestDeviceStatus_Success) {
        const request_state: *Ctx = @ptrCast(@alignCast(userdata1.?));
        request_state.device = device;
    } else {
        std.log.err("Device request failed (status={d})", .{status});
    }
}
|
||||||
|
|
||||||
|
/// Wraps a Zig byte slice in a `WGPUStringView` (pointer + length).
/// The view borrows `s`; no copy is made.
fn sv(s: []const u8) c.WGPUStringView {
    const view = c.WGPUStringView{
        .data = s.ptr,
        .length = s.len,
    };
    return view;
}
|
||||||
16
src/Mat.zig
16
src/Mat.zig
@ -26,7 +26,7 @@ pub fn load(
|
|||||||
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
||||||
);
|
);
|
||||||
|
|
||||||
c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes);
|
c.wgpuQueueWriteBuffer(gloc.device.queue, buf.raw, 0, data.ptr, bytes);
|
||||||
return .{ .buf = buf, .rows = rows, .cols = cols };
|
return .{ .buf = buf, .rows = rows, .cols = cols };
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -74,12 +74,12 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
|
|||||||
);
|
);
|
||||||
defer staging.deinit();
|
defer staging.deinit();
|
||||||
|
|
||||||
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder;
|
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
|
||||||
c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
|
c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
|
||||||
const cmd = c.wgpuCommandEncoderFinish(enc, null);
|
const cmd = c.wgpuCommandEncoderFinish(enc, null);
|
||||||
defer c.wgpuCommandEncoderRelease(enc);
|
defer c.wgpuCommandEncoderRelease(enc);
|
||||||
defer c.wgpuCommandBufferRelease(cmd);
|
defer c.wgpuCommandBufferRelease(cmd);
|
||||||
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
|
||||||
|
|
||||||
var mapped = false;
|
var mapped = false;
|
||||||
staging.mapAsync(
|
staging.mapAsync(
|
||||||
@ -88,7 +88,7 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
|
|||||||
bytes,
|
bytes,
|
||||||
.{ .callback = onMapped, .userdata1 = &mapped },
|
.{ .callback = onMapped, .userdata1 = &mapped },
|
||||||
);
|
);
|
||||||
while (!mapped) gloc.poll();
|
while (!mapped) gloc.device.poll();
|
||||||
|
|
||||||
const ptr: [*]const f32 = @ptrCast(@alignCast(
|
const ptr: [*]const f32 = @ptrCast(@alignCast(
|
||||||
staging.getConstMappedRange(0, bytes),
|
staging.getConstMappedRange(0, bytes),
|
||||||
@ -137,7 +137,7 @@ fn dispatch2in1out(
|
|||||||
defer info_buf.deinit();
|
defer info_buf.deinit();
|
||||||
|
|
||||||
// Write the number of elements *in this chunk* to the uniform buffer
|
// Write the number of elements *in this chunk* to the uniform buffer
|
||||||
c.wgpuQueueWriteBuffer(gloc.queue, info_buf.raw, 0, ¤t_chunk_elements, @sizeOf(u32));
|
c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, ¤t_chunk_elements, @sizeOf(u32));
|
||||||
|
|
||||||
// Bind only the sub-slice for this chunk using `.offset` and `.size`
|
// Bind only the sub-slice for this chunk using `.offset` and `.size`
|
||||||
const entries = [_]c.WGPUBindGroupEntry{
|
const entries = [_]c.WGPUBindGroupEntry{
|
||||||
@ -164,14 +164,14 @@ fn submitPass(
|
|||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
|
|
||||||
const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{
|
const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
|
||||||
.layout = bgl,
|
.layout = bgl,
|
||||||
.entries = entries.ptr,
|
.entries = entries.ptr,
|
||||||
.entryCount = entries.len,
|
.entryCount = entries.len,
|
||||||
}) orelse return error.BindGroup;
|
}) orelse return error.BindGroup;
|
||||||
defer c.wgpuBindGroupRelease(bg);
|
defer c.wgpuBindGroupRelease(bg);
|
||||||
|
|
||||||
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse
|
const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
|
||||||
return error.Encoder;
|
return error.Encoder;
|
||||||
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
||||||
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
||||||
@ -190,7 +190,7 @@ fn submitPass(
|
|||||||
const cmd = c.wgpuCommandEncoderFinish(enc, null);
|
const cmd = c.wgpuCommandEncoderFinish(enc, null);
|
||||||
defer c.wgpuCommandEncoderRelease(enc);
|
defer c.wgpuCommandEncoderRelease(enc);
|
||||||
defer c.wgpuCommandBufferRelease(cmd);
|
defer c.wgpuCommandBufferRelease(cmd);
|
||||||
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ceilDiv(n: usize, d: usize) usize {
|
fn ceilDiv(n: usize, d: usize) usize {
|
||||||
|
|||||||
16
src/main.zig
16
src/main.zig
@ -1,9 +1,13 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
|
const GpuDevice = @import("GpuDevice.zig");
|
||||||
const GpuAllocator = @import("GpuAllocator.zig");
|
const GpuAllocator = @import("GpuAllocator.zig");
|
||||||
const Mat = @import("Mat.zig");
|
const Mat = @import("Mat.zig");
|
||||||
|
|
||||||
pub fn main(init: std.process.Init) !void {
|
pub fn main(init: std.process.Init) !void {
|
||||||
var gloc = try GpuAllocator.init(init.gpa);
|
const device = try GpuDevice.init();
|
||||||
|
defer device.deinit();
|
||||||
|
|
||||||
|
var gloc = try GpuAllocator.init(init.gpa, device);
|
||||||
defer gloc.deinit();
|
defer gloc.deinit();
|
||||||
|
|
||||||
// Define the sizes you want to benchmark
|
// Define the sizes you want to benchmark
|
||||||
@ -15,11 +19,11 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
65536,
|
65536,
|
||||||
262144,
|
262144,
|
||||||
1024 * 1024,
|
1024 * 1024,
|
||||||
4 * 1024 * 1024,
|
// 4 * 1024 * 1024,
|
||||||
4 * 4 * 1024 * 1024,
|
// 4 * 4 * 1024 * 1024,
|
||||||
4 * 4 * 4 * 1024 * 1024,
|
// 4 * 4 * 4 * 1024 * 1024,
|
||||||
4 * 4 * 4 * 4 * 1024 * 1024,
|
// 4 * 4 * 4 * 4 * 1024 * 1024,
|
||||||
4 * 4 * 4 * 4 * 2 * 1024 * 1024,
|
// 4 * 4 * 4 * 4 * 2 * 1024 * 1024,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Print table header
|
// Print table header
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user