From 1c8e12b1e60d2e7a1747f3b876b3152561bcf236 Mon Sep 17 00:00:00 2001
From: adrien <adrien@bouvais.lu>
Date: Sun, 17 May 2026 23:51:51 +0200
Subject: [PATCH] Added a VRAM limit to GpuAllocator

---
 src/GpuAllocator.zig | 21 +++++++++++++++------
 src/GpuBuffer.zig    |  2 +-
 src/GpuDevice.zig    |  6 ++++++
 src/bench.zig        | 40 +++++++++++++++++++++++-----------------
 4 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/src/GpuAllocator.zig b/src/GpuAllocator.zig
index efc005d..5f9274d 100644
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@@ -1,17 +1,17 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
+const GpuBuffer = @import("GpuBuffer.zig");
 const c = @import("c.zig").c;
 
 const GpuAllocator = @This();
 
 device: GpuDevice,
-cpu_allocator: std.mem.Allocator,
 tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
+allocated_vram_bytes: u64 = 0,
 
 pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
     return .{
         .device = device,
-        .cpu_allocator = cpu_allocator,
         .tracked_buffers = .init(cpu_allocator),
     };
 }
@@ -31,18 +31,40 @@ pub fn registerBuffer(
     bytes: u64,
     usage: c.WGPUBufferUsage,
 ) !c.WGPUBuffer {
+    // A single buffer may never exceed what the device reports it can create.
+    if (bytes > self.device.limits.maxBufferSize)
+        return error.SingleBufferExceedsLimit;
+
+    // Saturating add: the budget check itself must not overflow u64 (a trap in
+    // safe builds, undefined behaviour in ReleaseFast) when the sum is huge.
+    if (bytes +| self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
+        return error.ExceedsVramBudget;
+
     const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
         .usage = usage,
         .size = bytes,
     }) orelse return error.BufferAlloc;
+    // If the buffer cannot be tracked it would never be destroyed; tear it
+    // down on the error path, mirroring unregisterAndDestroyBuffer.
+    errdefer {
+        c.wgpuBufferDestroy(buf);
+        c.wgpuBufferRelease(buf);
+    }
 
     try self.tracked_buffers.put(buf, {});
+    // Count the bytes only once the buffer is tracked.
+    self.allocated_vram_bytes += bytes;
     return buf;
 }
 
-pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
-    if (self.tracked_buffers.remove(buf)) {
-        c.wgpuBufferDestroy(buf);
-        c.wgpuBufferRelease(buf);
+/// Stops tracking `buf.raw`, destroys and releases the WGPU buffer, subtracts
+/// `buf.size` from the tracked VRAM total and polls the device.
+/// Does nothing when `buf.raw` is not tracked by this allocator.
+pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: GpuBuffer) void {
+    if (self.tracked_buffers.remove(buf.raw)) {
+        c.wgpuBufferDestroy(buf.raw);
+        c.wgpuBufferRelease(buf.raw);
+        self.allocated_vram_bytes -= buf.size;
+        self.device.poll();
     }
 }
diff --git a/src/GpuBuffer.zig b/src/GpuBuffer.zig
index 90aeae7..a000c10 100644
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@@ -22,7 +22,7 @@ pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffe
 
 /// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources
 pub fn deinit(self: GpuBuffer) void {
-    self.gloc.unregisterAndDestroyBuffer(self.raw);
+    self.gloc.unregisterAndDestroyBuffer(self);
 }
 
 /// Native mapAsync wrapper
diff --git a/src/GpuDevice.zig b/src/GpuDevice.zig
index 601cd02..b8aa03b 100644
--- a/src/GpuDevice.zig
+++ b/src/GpuDevice.zig
@@ -12,6 +12,11 @@ instance: c.WGPUInstance,
 adapter: c.WGPUAdapter,
 device: c.WGPUDevice,
 queue: c.WGPUQueue,
+limits: c.WGPULimits,
+
+config: struct {
+    vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GiB
+} = .{},
 
 pub fn init() !GpuAllocator {
     const instance = c.wgpuCreateInstance(
@@ -57,6 +62,7 @@ pub fn init() !GpuAllocator {
         .adapter = adapter,
         .device = device,
         .queue = c.wgpuDeviceGetQueue(device),
+        .limits = supported_limits,
     };
 }
 
diff --git a/src/bench.zig b/src/bench.zig
index a163889..3c5ee01 100644
--- a/src/bench.zig
+++ b/src/bench.zig
@@ -34,6 +34,7 @@ pub fn main(init: std.process.Init) !void {
 
     const sizes = [_]usize{
         1,
+        256,
         1024,
         4 * 1024,
         4 * 4 * 1024,
@@ -44,13 +45,14 @@ pub fn main(init: std.process.Init) !void {
         4 * 4 * 1024 * 1024,
         4 * 4 * 4 * 1024 * 1024,
         4 * 4 * 4 * 4 * 1024 * 1024,
+        4 * 4 * 4 * 4 * 4 * 1024 * 1024,
     };
 
-    const iterations = 5;
+    const iterations = 10;
 
-    // Print clear structural table headers
-    std.debug.print("\n| Size (MB) | Phase             | Time (ms)  |   GB/s   |\n", .{});
-    std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{});
+    // Table header; the last column reports the peak VRAM tracked by the allocator
+    std.debug.print("\n| Size (MB) | Phase             | Time (ms)  |   GB/s   | VRAM Peak |\n", .{});
+    std.debug.print("|----------:|:------------------|-----------:|---------:|----------:|\n", .{});
 
     for (sizes) |size| {
         // --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
@@ -69,13 +71,13 @@ pub fn main(init: std.process.Init) !void {
         var min_transfer_ns: u64 = std.math.maxInt(u64);
         var min_compute_ns: u64 = std.math.maxInt(u64);
 
+        // Peak of the allocator's tracked VRAM bytes across iterations (u64, like allocated_vram_bytes)
+        var peak_vram_bytes: u64 = 0;
+
         for (0..iterations) |_| {
             // --- 1. GPU ALLOCATION PHASE ---
-            // Assumes Vec.init or similar handles uninitialized device allocation if exposed,
-            // otherwise we isolate data movement directly inside the step.
             const alloc_start = std.Io.Clock.awake.now(init.io);
 
-            // (If your Vec API allocates and loads simultaneously, this step doubles as your Host->Device allocation footprint)
             const a = try Vec.initLoad(&gloc, data_a);
             defer a.deinit();
             const b = try Vec.initLoad(&gloc, data_b);
@@ -91,6 +93,12 @@ pub fn main(init: std.process.Init) !void {
             const sum = try a.run(&gloc, b, add_pip);
             defer sum.deinit();
 
+            // a, b and sum are all still alive here, so sample the allocator's tracked
+            // byte count now (this is the allocator's bookkeeping, not driver-reported VRAM).
+            if (gloc.allocated_vram_bytes > peak_vram_bytes) {
+                peak_vram_bytes = gloc.allocated_vram_bytes;
+            }
+
             _ = c.wgpuDevicePoll(device.device, 1, null);
 
             const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io));
@@ -119,19 +127,17 @@ pub fn main(init: std.process.Init) !void {
         const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0;
 
         // Bandwidth Calculations
-        // Alloc phase moves 2 buffers worth of data from Host -> GPU
         const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0);
-
-        // Compute phase performs 2 reads and 1 write completely on VRAM
         const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0);
-
-        // Transfer phase pulls 1 buffer back from GPU -> Host
         const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0);
 
-        // Print Results per Size Block
-        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, alloc_ms, alloc_gb_s });
-        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} |\n", .{ compute_ms, compute_gb_s });
-        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ transfer_ms, transfer_gb_s });
-        std.debug.print("|-----------|-------------------|------------|---------:|\n", .{});
+        // Convert peak VRAM bytes to MiB (1024 * 1024 bytes); the table labels the unit "MB"
+        const peak_vram_mb = @as(f64, @floatFromInt(peak_vram_bytes)) / (1024.0 * 1024.0);
+
+        // Print Results per Size Block with VRAM column aligned
+        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |           |\n", .{ mb, alloc_ms, alloc_gb_s });
+        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} | {d:7.2} MB|\n", .{ compute_ms, compute_gb_s, peak_vram_mb });
+        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |           |\n", .{ transfer_ms, transfer_gb_s });
+        std.debug.print("|-----------|-------------------|------------|---------:|----------:|\n", .{});
     }
 }