Split GpuAllocator into GpuAllocator and GpuDevice

2026-05-17 20:55:49 +02:00 · 2026-05-17 20:55:49 +02:00 · cef6155f41
commit cef6155f41
parent d57968d6df
4 changed files with 131 additions and 123 deletions
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@ -1,69 +1,24 @@
 const std = @import("std");
 const sh = @import("shaders.zig");
 const GpuDevice = @import("GpuDevice.zig");
 const c = @import("c.zig").c;
 const GpuAllocator = @This();
 device: GpuDevice,
 cpu_allocator: std.mem.Allocator,
 instance: c.WGPUInstance,
 adapter: c.WGPUAdapter,
 device: c.WGPUDevice,
 queue: c.WGPUQueue,
 tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
 pipelines: struct {
    add: c.WGPUComputePipeline,
 },
-pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator {
+pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
    errdefer c.wgpuInstanceRelease(instance);
    var ctx = Ctx{};
    _ = c.wgpuInstanceRequestAdapter(
        instance,
        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
        .{ .callback = onAdapter, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const adapter = ctx.adapter orelse return error.NoAdapter;
    errdefer c.wgpuAdapterRelease(adapter);
    // --- QUERY HARDWARE LIMITS ---
    var supported_limits = std.mem.zeroes(c.WGPULimits);
    supported_limits.nextInChain = null;
    // Fetch what your physical graphic card can actually handle
    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
    const device_descriptor = c.WGPUDeviceDescriptor{
        .nextInChain = null,
        .label = sv("TensorCompilerDevice"),
        .requiredFeatureCount = 0,
        .requiredFeatures = null,
        .requiredLimits = &supported_limits,
    };
    _ = c.wgpuAdapterRequestDevice(
        adapter,
        &device_descriptor,
        .{ .callback = onDevice, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const device = ctx.device orelse return error.NoDevice;
    return .{
        .cpu_allocator = cpu_allocator,
        .instance = instance,
        .adapter = adapter,
        .device = device,
-        .queue = c.wgpuDeviceGetQueue(device),
+        .cpu_allocator = cpu_allocator,
        .tracked_buffers = .init(cpu_allocator),
        .pipelines = .{
-            .add = try buildPipeline(device, sh.SHADER_ADD),
+            .add = try buildPipeline(device.device, sh.SHADER_ADD),
        },
    };
 }
@ -79,11 +34,6 @@ pub fn deinit(self: *GpuAllocator) void {
        c.wgpuBufferRelease(buf);
    }
    self.tracked_buffers.deinit();
    c.wgpuQueueRelease(self.queue);
    c.wgpuDeviceRelease(self.device);
    c.wgpuAdapterRelease(self.adapter);
    c.wgpuInstanceRelease(self.instance);
 }
 pub fn registerBuffer(
@ -91,7 +41,7 @@ pub fn registerBuffer(
    bytes: u64,
    usage: c.WGPUBufferUsage,
 ) !c.WGPUBuffer {
-    const buf = c.wgpuDeviceCreateBuffer(self.device, &.{
+    const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
        .usage = usage,
        .size = bytes,
    }) orelse return error.BufferAlloc;
@ -107,59 +57,6 @@ pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
    }
 }
 // ── Internal ─────────────────────────────────────────────────────────────
 pub fn makeBuffer(
    self: *GpuAllocator,
    bytes: u64,
    usage: c.WGPUBufferUsage,
 ) !c.WGPUBuffer {
    return c.wgpuDeviceCreateBuffer(self.device, &.{
        .usage = usage,
        .size = bytes,
    }) orelse error.BufferAlloc;
 }
 /// Poll until GPU work completes. Use after submit if you need CPU sync.
 pub fn poll(self: *GpuAllocator) void {
    _ = c.wgpuDevicePoll(self.device, 1, null);
 }
 const Ctx = struct {
    adapter: c.WGPUAdapter = null,
    device: c.WGPUDevice = null,
 };
 fn onAdapter(
    status: c.WGPURequestAdapterStatus,
    adapter: c.WGPUAdapter,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
 ) callconv(.c) void {
    if (status != c.WGPURequestAdapterStatus_Success) {
        std.log.err("Adapter request failed (status={d})", .{status});
        return;
    }
    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
    ctx.adapter = adapter;
 }
 fn onDevice(
    status: c.WGPURequestDeviceStatus,
    device: c.WGPUDevice,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
 ) callconv(.c) void {
    if (status != c.WGPURequestDeviceStatus_Success) {
        std.log.err("Device request failed (status={d})", .{status});
        return;
    }
    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
    ctx.device = device;
 }
 fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
    var wgsl_src = c.WGPUShaderSourceWGSL{
        .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
--- a/src/GpuDevice.zig
+++ b/src/GpuDevice.zig
@ -0,0 +1,107 @@
 const std = @import("std");
 const sh = @import("shaders.zig");
 const c = @import("c.zig").c;
 /// Scratch state shared with the adapter/device request callbacks: `init`
 /// passes a pointer to it as `userdata1`, and `onAdapter` / `onDevice` store
 /// their result in the matching field. A field that is still `null` after the
 /// request means no result was stored.
 const Ctx = struct {
    adapter: c.WGPUAdapter = null,
    device: c.WGPUDevice = null,
 };
 const GpuDevice = @This();
 /// WebGPU instance the adapter was requested from; released last in `deinit`.
 instance: c.WGPUInstance,
 /// Adapter selected with the high-performance power preference.
 adapter: c.WGPUAdapter,
 /// Logical device requested from `adapter`.
 device: c.WGPUDevice,
 /// Queue obtained from `device` via `wgpuDeviceGetQueue`.
 queue: c.WGPUQueue,
 /// Create a WebGPU instance, request a high-performance adapter, and request a
 /// device from it using the limits the adapter reports.
 ///
 /// Errors:
 /// - `error.NoInstance`: `wgpuCreateInstance` returned null.
 /// - `error.NoAdapter`: no adapter was stored by the adapter callback.
 /// - `error.FailedToGetAdapterLimits`: `wgpuAdapterGetLimits` did not succeed.
 /// - `error.NoDevice`: no device was stored by the device callback.
 ///
 /// On any error, handles acquired so far are released. On success the caller
 /// owns the result and must call `deinit`.
 pub fn init() !GpuDevice {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
    errdefer c.wgpuInstanceRelease(instance);
    // The request callbacks write their result into `ctx`. It is read right
    // after `wgpuInstanceProcessEvents`, so this code relies on the callback
    // having run by the time that call returns.
    var ctx = Ctx{};
    _ = c.wgpuInstanceRequestAdapter(
        instance,
        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
        .{ .callback = onAdapter, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const adapter = ctx.adapter orelse return error.NoAdapter;
    errdefer c.wgpuAdapterRelease(adapter);
    // --- QUERY HARDWARE LIMITS ---
    // `zeroes` already leaves `nextInChain` null, so no explicit reset is needed.
    var supported_limits = std.mem.zeroes(c.WGPULimits);
    // Fetch what the physical graphics card can actually handle.
    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != c.WGPUStatus_Success) {
        return error.FailedToGetAdapterLimits;
    }
    // Request the device with the adapter's own reported limits as the
    // required limits.
    const device_descriptor = c.WGPUDeviceDescriptor{
        .nextInChain = null,
        .label = sv("TensorCompilerDevice"),
        .requiredFeatureCount = 0,
        .requiredFeatures = null,
        .requiredLimits = &supported_limits,
    };
    _ = c.wgpuAdapterRequestDevice(
        adapter,
        &device_descriptor,
        .{ .callback = onDevice, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const device = ctx.device orelse return error.NoDevice;
    return .{
        .instance = instance,
        .adapter = adapter,
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
    };
 }
 /// Release every handle acquired by `init`: queue, device, adapter, then
 /// instance. The value must not be used afterwards.
 pub fn deinit(self: GpuDevice) void {
    c.wgpuQueueRelease(self.queue);
    c.wgpuDeviceRelease(self.device);
    c.wgpuAdapterRelease(self.adapter);
    c.wgpuInstanceRelease(self.instance);
 }
 /// Poll the device with the wait argument set to 1; the return value of
 /// `wgpuDevicePoll` is discarded. Intended for CPU-side sync after a submit.
 /// NOTE(review): presumably this blocks until submitted GPU work completes —
 /// confirm against the wgpu-native header in use.
 ///
 /// Takes `*const` because only `self.device` is read, so it can be called on
 /// a `const` device as well as through a mutable pointer.
 pub fn poll(self: *const GpuDevice) void {
    _ = c.wgpuDevicePoll(self.device, 1, null);
 }
 /// Callback handed to `wgpuInstanceRequestAdapter`. On success it stores the
 /// adapter in the `Ctx` that `userdata1` points to; on any other status it
 /// logs the status code and leaves the `Ctx` untouched.
 fn onAdapter(
    status: c.WGPURequestAdapterStatus,
    adapter: c.WGPUAdapter,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
 ) callconv(.c) void {
    if (status == c.WGPURequestAdapterStatus_Success) {
        // `userdata1` is the `*Ctx` supplied when the request was issued.
        const request_ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
        request_ctx.adapter = adapter;
    } else {
        std.log.err("Adapter request failed (status={d})", .{status});
    }
 }
 /// Callback handed to `wgpuAdapterRequestDevice`. On success it stores the
 /// device in the `Ctx` that `userdata1` points to; on any other status it
 /// logs the status code and leaves the `Ctx` untouched.
 fn onDevice(
    status: c.WGPURequestDeviceStatus,
    device: c.WGPUDevice,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
 ) callconv(.c) void {
    if (status == c.WGPURequestDeviceStatus_Success) {
        // `userdata1` is the `*Ctx` supplied when the request was issued.
        const request_ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
        request_ctx.device = device;
    } else {
        std.log.err("Device request failed (status={d})", .{status});
    }
 }
 /// Wrap a Zig byte slice in a `WGPUStringView` (pointer plus length). The
 /// bytes are not copied, so `s` must outlive every use of the returned view.
 fn sv(s: []const u8) c.WGPUStringView {
    const view: c.WGPUStringView = .{
        .data = s.ptr,
        .length = s.len,
    };
    return view;
 }
--- a/src/Mat.zig
+++ b/src/Mat.zig
@ -26,7 +26,7 @@ pub fn load(
        c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
    );
-    c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes);
+    c.wgpuQueueWriteBuffer(gloc.device.queue, buf.raw, 0, data.ptr, bytes);
    return .{ .buf = buf, .rows = rows, .cols = cols };
 }
@ -74,12 +74,12 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
    );
    defer staging.deinit();
-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder;
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandEncoderRelease(enc);
    defer c.wgpuCommandBufferRelease(cmd);
-    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
+    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
    var mapped = false;
    staging.mapAsync(
@ -88,7 +88,7 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
        bytes,
        .{ .callback = onMapped, .userdata1 = &mapped },
    );
-    while (!mapped) gloc.poll();
+    while (!mapped) gloc.device.poll();
    const ptr: [*]const f32 = @ptrCast(@alignCast(
        staging.getConstMappedRange(0, bytes),
@ -137,7 +137,7 @@ fn dispatch2in1out(
        defer info_buf.deinit();
        // Write the number of elements *in this chunk* to the uniform buffer
-        c.wgpuQueueWriteBuffer(gloc.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
+        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
        // Bind only the sub-slice for this chunk using `.offset` and `.size`
        const entries = [_]c.WGPUBindGroupEntry{
@ -164,14 +164,14 @@ fn submitPass(
    const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
    defer c.wgpuBindGroupLayoutRelease(bgl);
-    const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{
+    const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
        .layout = bgl,
        .entries = entries.ptr,
        .entryCount = entries.len,
    }) orelse return error.BindGroup;
    defer c.wgpuBindGroupRelease(bg);
-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
        return error.Encoder;
    const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
    c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
@ -190,7 +190,7 @@ fn submitPass(
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandEncoderRelease(enc);
    defer c.wgpuCommandBufferRelease(cmd);
-    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
+    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
 }
 fn ceilDiv(n: usize, d: usize) usize {
--- a/src/main.zig
+++ b/src/main.zig
@ -1,9 +1,13 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuAllocator = @import("GpuAllocator.zig");
 const Mat = @import("Mat.zig");
 pub fn main(init: std.process.Init) !void {
-    var gloc = try GpuAllocator.init(init.gpa);
+    const device = try GpuDevice.init();
    defer device.deinit();
    var gloc = try GpuAllocator.init(init.gpa, device);
    defer gloc.deinit();
    // Define the sizes you want to benchmark
@ -15,11 +19,11 @@ pub fn main(init: std.process.Init) !void {
        65536,
        262144,
        1024 * 1024,
-        4 * 1024 * 1024,
+        // 4 * 1024 * 1024,
-        4 * 4 * 1024 * 1024,
+        // 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 4 * 2 * 1024 * 1024,
+        // 4 * 4 * 4 * 4 * 2 * 1024 * 1024,
    };
    // Print table header