Added a VRAM limit to GpuAllocator

2026-05-17 23:51:51 +02:00 · 2026-05-17 23:51:51 +02:00 · 1c8e12b1e6
commit 1c8e12b1e6
parent 38538fcd80
4 changed files with 45 additions and 24 deletions
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@ -1,17 +1,17 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const c = @import("c.zig").c;
 const GpuAllocator = @This();
 device: GpuDevice,
 cpu_allocator: std.mem.Allocator,
 tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
 allocated_vram_bytes: u64 = 0,
 /// Builds a GpuAllocator bound to `device`.
 ///
 /// `cpu_allocator` is stored on the result and is also the allocator handed to
 /// the `tracked_buffers` hash map. `allocated_vram_bytes` is not set here, so it
 /// keeps its field default of 0.
 ///
 /// The return type is an error union, although no statement in this body can fail.
 pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
    return .{
        .device = device,
        .cpu_allocator = cpu_allocator,
        .tracked_buffers = .init(cpu_allocator),
    };
 }
@ -31,18 +31,27 @@ pub fn registerBuffer(
    bytes: u64,
    usage: c.WGPUBufferUsage,
 ) !c.WGPUBuffer {
    if (bytes > self.device.limits.maxBufferSize)
        return error.SingleBufferExceedsLimit;
    if (bytes + self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
        return error.ExceedsVramBudget;
    const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
        .usage = usage,
        .size = bytes,
    }) orelse return error.BufferAlloc;
    try self.tracked_buffers.put(buf, {});
    self.allocated_vram_bytes += bytes;
    return buf;
 }
-pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
+pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: GpuBuffer) void {
-    if (self.tracked_buffers.remove(buf)) {
+    if (self.tracked_buffers.remove(buf.raw)) {
        // Only reached when the handle was present in `tracked_buffers`; a handle
        // that is not tracked is left untouched and the counter is not changed.
-        c.wgpuBufferDestroy(buf);
+        c.wgpuBufferDestroy(buf.raw);
-        c.wgpuBufferRelease(buf);
+        c.wgpuBufferRelease(buf.raw);
        // Give the buffer's byte count back to the running VRAM total.
        // NOTE(review): assumes `buf.size` equals the `bytes` value that was added
        // when the buffer was registered; confirm, since a larger value would
        // underflow this unsigned subtraction.
        self.allocated_vram_bytes -= buf.size;
        self.device.poll();
    }
 }
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@ -22,7 +22,7 @@ pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffe
 /// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources.
 /// Delegates to `gloc.unregisterAndDestroyBuffer`. `self` is received by value,
 /// so this function does not modify the caller's GpuBuffer.
 pub fn deinit(self: GpuBuffer) void {
-    self.gloc.unregisterAndDestroyBuffer(self.raw);
+    self.gloc.unregisterAndDestroyBuffer(self);
 }
 /// Native mapAsync wrapper
--- a/src/GpuDevice.zig
+++ b/src/GpuDevice.zig
@ -12,6 +12,11 @@ instance: c.WGPUInstance,
 adapter: c.WGPUAdapter,
 device: c.WGPUDevice,
 queue: c.WGPUQueue,
 limits: c.WGPULimits,
 /// Tunable settings. Every field has a default, so `.{}` is a valid value.
 config: struct {
    /// VRAM budget in bytes. GpuAllocator's registerBuffer compares the requested
    /// size plus its running `allocated_vram_bytes` total against this value.
    vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GiB (10 * 2^30 bytes)
 } = .{},
 pub fn init() !GpuAllocator {
    const instance = c.wgpuCreateInstance(
@ -57,6 +62,7 @@ pub fn init() !GpuAllocator {
        .adapter = adapter,
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
        .limits = supported_limits,
    };
 }
--- a/src/bench.zig
+++ b/src/bench.zig
@ -34,6 +34,7 @@ pub fn main(init: std.process.Init) !void {
    const sizes = [_]usize{
        1,
        256,
        1024,
        4 * 1024,
        4 * 4 * 1024,
@ -44,13 +45,14 @@ pub fn main(init: std.process.Init) !void {
        4 * 4 * 1024 * 1024,
        4 * 4 * 4 * 1024 * 1024,
        4 * 4 * 4 * 4 * 1024 * 1024,
        4 * 4 * 4 * 4 * 4 * 1024 * 1024,
    };
-    const iterations = 5;
+    const iterations = 10;
-    // Print clear structural table headers
+    // Updated headers to include VRAM footprint info
-    std.debug.print("\n| Size (MB) | Phase             | Time (ms)  |   GB/s   |\n", .{});
+    std.debug.print("\n| Size (MB) | Phase             | Time (ms)  |   GB/s   | VRAM Peak |\n", .{});
-    std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{});
+    std.debug.print("|----------:|:------------------|-----------:|---------:|----------:|\n", .{});
    for (sizes) |size| {
        // --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
@ -69,13 +71,13 @@ pub fn main(init: std.process.Init) !void {
        var min_transfer_ns: u64 = std.math.maxInt(u64);
        var min_compute_ns: u64 = std.math.maxInt(u64);
        // Track peak VRAM usage observed during the iterations
        var peak_vram_bytes: usize = 0;
        for (0..iterations) |_| {
            // --- 1. GPU ALLOCATION PHASE ---
            // Assumes Vec.init or similar handles uninitialized device allocation if exposed,
            // otherwise we isolate data movement directly inside the step.
            const alloc_start = std.Io.Clock.awake.now(init.io);
            // (If your Vec API allocates and loads simultaneously, this step doubles as your Host->Device allocation footprint)
            const a = try Vec.initLoad(&gloc, data_a);
            defer a.deinit();
            const b = try Vec.initLoad(&gloc, data_b);
@ -91,6 +93,12 @@ pub fn main(init: std.process.Init) !void {
            const sum = try a.run(&gloc, b, add_pip);
            defer sum.deinit();
            // All 3 buffers (a, b, sum) are currently resident in VRAM here.
            // Querying now catches the true peak allocation step.
            if (gloc.allocated_vram_bytes > peak_vram_bytes) {
                peak_vram_bytes = gloc.allocated_vram_bytes;
            }
            _ = c.wgpuDevicePoll(device.device, 1, null);
            const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io));
@ -119,19 +127,17 @@ pub fn main(init: std.process.Init) !void {
        const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0;
        // Bandwidth Calculations
        // Alloc phase moves 2 buffers worth of data from Host -> GPU
        const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0);
        // Compute phase performs 2 reads and 1 write completely on VRAM
        const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0);
        // Transfer phase pulls 1 buffer back from GPU -> Host
        const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0);
-        // Print Results per Size Block
+        // Convert Peak VRAM bytes to Megabytes for clean display
-        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, alloc_ms, alloc_gb_s });
+        const peak_vram_mb = @as(f64, @floatFromInt(peak_vram_bytes)) / (1024.0 * 1024.0);
-        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} |\n", .{ compute_ms, compute_gb_s });
+
-        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ transfer_ms, transfer_gb_s });
+        // Print Results per Size Block with VRAM column aligned
-        std.debug.print("|-----------|-------------------|------------|---------:|\n", .{});
+        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |           |\n", .{ mb, alloc_ms, alloc_gb_s });
        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} | {d:7.2} MB|\n", .{ compute_ms, compute_gb_s, peak_vram_mb });
        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |           |\n", .{ transfer_ms, transfer_gb_s });
        std.debug.print("|-----------|-------------------|------------|---------:|----------:|\n", .{});
    }
 }