From 1c8e12b1e60d2e7a1747f3b876b3152561bcf236 Mon Sep 17 00:00:00 2001 From: adrien Date: Sun, 17 May 2026 23:51:51 +0200 Subject: [PATCH] Added a VRAM limit to GpuAllocator --- src/GpuAllocator.zig | 21 +++++++++++++++------ src/GpuBuffer.zig | 2 +- src/GpuDevice.zig | 6 ++++++ src/bench.zig | 40 +++++++++++++++++++++++----------------- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/src/GpuAllocator.zig b/src/GpuAllocator.zig index efc005d..5f9274d 100644 --- a/src/GpuAllocator.zig +++ b/src/GpuAllocator.zig @@ -1,17 +1,17 @@ const std = @import("std"); const GpuDevice = @import("GpuDevice.zig"); +const GpuBuffer = @import("GpuBuffer.zig"); const c = @import("c.zig").c; const GpuAllocator = @This(); device: GpuDevice, -cpu_allocator: std.mem.Allocator, tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void), +allocated_vram_bytes: u64 = 0, pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator { return .{ .device = device, - .cpu_allocator = cpu_allocator, .tracked_buffers = .init(cpu_allocator), }; } @@ -31,18 +31,27 @@ pub fn registerBuffer( bytes: u64, usage: c.WGPUBufferUsage, ) !c.WGPUBuffer { + if (bytes > self.device.limits.maxBufferSize) + return error.SingleBufferExceedsLimit; + + if (bytes + self.allocated_vram_bytes > self.device.config.vram_bytes_limit) + return error.ExceedsVramBudget; + const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{ .usage = usage, .size = bytes, }) orelse return error.BufferAlloc; try self.tracked_buffers.put(buf, {}); + self.allocated_vram_bytes += bytes; return buf; } -pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void { - if (self.tracked_buffers.remove(buf)) { - c.wgpuBufferDestroy(buf); - c.wgpuBufferRelease(buf); +pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: GpuBuffer) void { + if (self.tracked_buffers.remove(buf.raw)) { + c.wgpuBufferDestroy(buf.raw); + c.wgpuBufferRelease(buf.raw); + self.allocated_vram_bytes -= buf.size; + self.device.poll(); } } diff --git a/src/GpuBuffer.zig b/src/GpuBuffer.zig index 90aeae7..a000c10 100644 --- a/src/GpuBuffer.zig +++ b/src/GpuBuffer.zig @@ -22,7 +22,7 @@ pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffe /// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources pub fn deinit(self: GpuBuffer) void { - self.gloc.unregisterAndDestroyBuffer(self.raw); + self.gloc.unregisterAndDestroyBuffer(self); } /// Native mapAsync wrapper diff --git a/src/GpuDevice.zig b/src/GpuDevice.zig index 601cd02..b8aa03b 100644 --- a/src/GpuDevice.zig +++ b/src/GpuDevice.zig @@ -12,6 +12,11 @@ instance: c.WGPUInstance, adapter: c.WGPUAdapter, device: c.WGPUDevice, queue: c.WGPUQueue, +limits: c.WGPULimits, + +config: struct { + vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GB +} = .{}, pub fn init() !GpuAllocator { const instance = c.wgpuCreateInstance( @@ -57,6 +62,7 @@ pub fn init() !GpuAllocator { .adapter = adapter, .device = device, .queue = c.wgpuDeviceGetQueue(device), + .limits = supported_limits, }; } diff --git a/src/bench.zig b/src/bench.zig index a163889..3c5ee01 100644 --- a/src/bench.zig +++ b/src/bench.zig @@ -34,6 +34,7 @@ pub fn main(init: std.process.Init) !void { const sizes = [_]usize{ 1, + 256, 1024, 4 * 1024, 4 * 4 * 1024, @@ -44,13 +45,14 @@ pub fn main(init: std.process.Init) !void { 4 * 4 * 1024 * 1024, 4 * 4 * 4 * 1024 * 1024, 4 * 4 * 4 * 4 * 1024 * 1024, + 4 * 4 * 4 * 4 * 4 * 1024 * 1024, }; - const iterations = 5; + const iterations = 10; - // Print clear structural table headers - std.debug.print("\n| Size (MB) | Phase | Time (ms) | GB/s |\n", .{}); - std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{}); + // Updated headers to include VRAM footprint info + std.debug.print("\n| Size (MB) | Phase | Time (ms) | GB/s | VRAM Peak |\n", .{}); + std.debug.print("|----------:|:------------------|-----------:|---------:|----------:|\n", .{}); for (sizes) |size| { // --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) --- @@ -69,13 +71,13 @@ pub fn main(init: std.process.Init) !void { var min_transfer_ns: u64 = std.math.maxInt(u64); var min_compute_ns: u64 = std.math.maxInt(u64); + // Track peak VRAM usage observed during the iterations + var peak_vram_bytes: usize = 0; + for (0..iterations) |_| { // --- 1. GPU ALLOCATION PHASE --- - // Assumes Vec.init or similar handles uninitialized device allocation if exposed, - // otherwise we isolate data movement directly inside the step. const alloc_start = std.Io.Clock.awake.now(init.io); - // (If your Vec API allocates and loads simultaneously, this step doubles as your Host->Device allocation footprint) const a = try Vec.initLoad(&gloc, data_a); defer a.deinit(); const b = try Vec.initLoad(&gloc, data_b); @@ -91,6 +93,12 @@ pub fn main(init: std.process.Init) !void { const sum = try a.run(&gloc, b, add_pip); defer sum.deinit(); + // All 3 buffers (a, b, sum) are currently resident in VRAM here. + // Querying now catches the true peak allocation step. + if (gloc.allocated_vram_bytes > peak_vram_bytes) { + peak_vram_bytes = gloc.allocated_vram_bytes; + } + _ = c.wgpuDevicePoll(device.device, 1, null); const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io)); @@ -119,19 +127,17 @@ pub fn main(init: std.process.Init) !void { const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0; // Bandwidth Calculations - // Alloc phase moves 2 buffers worth of data from Host -> GPU const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0); - - // Compute phase performs 2 reads and 1 write completely on VRAM const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0); - - // Transfer phase pulls 1 buffer back from GPU -> Host const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0); - // Print Results per Size Block - std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, alloc_ms, alloc_gb_s }); - std.debug.print("| | 2. Compute | {d:10.3} | {d:8.2} |\n", .{ compute_ms, compute_gb_s }); - std.debug.print("| | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ transfer_ms, transfer_gb_s }); - std.debug.print("|-----------|-------------------|------------|---------:|\n", .{}); + // Convert Peak VRAM bytes to Megabytes for clean display + const peak_vram_mb = @as(f64, @floatFromInt(peak_vram_bytes)) / (1024.0 * 1024.0); + + // Print Results per Size Block with VRAM column aligned + std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} | |\n", .{ mb, alloc_ms, alloc_gb_s }); + std.debug.print("| | 2. Compute | {d:10.3} | {d:8.2} | {d:7.2} MB|\n", .{ compute_ms, compute_gb_s, peak_vram_mb }); + std.debug.print("| | 3. Transfer (D->H)| {d:10.3} | {d:8.2} | |\n", .{ transfer_ms, transfer_gb_s }); + std.debug.print("|-----------|-------------------|------------|---------:|----------:|\n", .{}); } }