Added a VRAM limit to GpuAllocator

This commit is contained in:
adrien 2026-05-17 23:51:51 +02:00
parent 38538fcd80
commit 1c8e12b1e6
4 changed files with 45 additions and 24 deletions

View File

@ -1,17 +1,17 @@
const std = @import("std"); const std = @import("std");
const GpuDevice = @import("GpuDevice.zig"); const GpuDevice = @import("GpuDevice.zig");
const GpuBuffer = @import("GpuBuffer.zig");
const c = @import("c.zig").c; const c = @import("c.zig").c;
const GpuAllocator = @This(); const GpuAllocator = @This();
device: GpuDevice, device: GpuDevice,
cpu_allocator: std.mem.Allocator,
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void), tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
allocated_vram_bytes: u64 = 0,
/// Builds a GpuAllocator bound to `device`.
///
/// `cpu_allocator` is stored on the result and also backs the hash map that
/// tracks live buffer handles. The VRAM byte counter starts at zero.
pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
    const tracked = std.AutoHashMap(c.WGPUBuffer, void).init(cpu_allocator);
    return GpuAllocator{
        .device = device,
        .cpu_allocator = cpu_allocator,
        .tracked_buffers = tracked,
        .allocated_vram_bytes = 0,
    };
}
@ -31,18 +31,27 @@ pub fn registerBuffer(
bytes: u64, bytes: u64,
usage: c.WGPUBufferUsage, usage: c.WGPUBufferUsage,
) !c.WGPUBuffer { ) !c.WGPUBuffer {
if (bytes > self.device.limits.maxBufferSize)
return error.SingleBufferExceedsLimit;
if (bytes + self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
return error.ExceedsVramBudget;
const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{ const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
.usage = usage, .usage = usage,
.size = bytes, .size = bytes,
}) orelse return error.BufferAlloc; }) orelse return error.BufferAlloc;
try self.tracked_buffers.put(buf, {}); try self.tracked_buffers.put(buf, {});
self.allocated_vram_bytes += bytes;
return buf; return buf;
} }
/// Removes `buf` from the tracked set and, if it was tracked, destroys and
/// releases the underlying WGPU buffer, returns its bytes to the VRAM budget
/// and polls the device.
///
/// A buffer that is not currently tracked is ignored, so calling this a second
/// time for the same buffer does nothing.
pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: GpuBuffer) void {
    if (!self.tracked_buffers.remove(buf.raw)) return;

    c.wgpuBufferDestroy(buf.raw);
    c.wgpuBufferRelease(buf.raw);

    // Teardown must not fail: if the recorded size ever disagrees with what
    // was registered, report it and clamp at zero rather than trapping (safe
    // builds) or wrapping the counter (ReleaseFast), which would make every
    // later budget check fail.
    if (buf.size > self.allocated_vram_bytes) {
        std.log.warn(
            "GpuAllocator: releasing {d} bytes but only {d} are accounted for",
            .{ buf.size, self.allocated_vram_bytes },
        );
    }
    self.allocated_vram_bytes -|= buf.size;

    self.device.poll();
}

View File

@ -22,7 +22,7 @@ pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffe
/// Hands this buffer back to the GpuAllocator it was created from, which
/// unregisters it and destroys the underlying GPU resources.
pub fn deinit(self: GpuBuffer) void {
    const owner = self.gloc;
    owner.unregisterAndDestroyBuffer(self);
}
/// Native mapAsync wrapper /// Native mapAsync wrapper

View File

@ -12,6 +12,11 @@ instance: c.WGPUInstance,
adapter: c.WGPUAdapter, adapter: c.WGPUAdapter,
device: c.WGPUDevice, device: c.WGPUDevice,
queue: c.WGPUQueue, queue: c.WGPUQueue,
limits: c.WGPULimits,
config: struct {
vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GiB
} = .{},
pub fn init() !GpuAllocator { pub fn init() !GpuAllocator {
const instance = c.wgpuCreateInstance( const instance = c.wgpuCreateInstance(
@ -57,6 +62,7 @@ pub fn init() !GpuAllocator {
.adapter = adapter, .adapter = adapter,
.device = device, .device = device,
.queue = c.wgpuDeviceGetQueue(device), .queue = c.wgpuDeviceGetQueue(device),
.limits = supported_limits,
}; };
} }

View File

@ -34,6 +34,7 @@ pub fn main(init: std.process.Init) !void {
const sizes = [_]usize{ const sizes = [_]usize{
1, 1,
256,
1024, 1024,
4 * 1024, 4 * 1024,
4 * 4 * 1024, 4 * 4 * 1024,
@ -44,13 +45,14 @@ pub fn main(init: std.process.Init) !void {
4 * 4 * 1024 * 1024, 4 * 4 * 1024 * 1024,
4 * 4 * 4 * 1024 * 1024, 4 * 4 * 4 * 1024 * 1024,
4 * 4 * 4 * 4 * 1024 * 1024, 4 * 4 * 4 * 4 * 1024 * 1024,
4 * 4 * 4 * 4 * 4 * 1024 * 1024,
}; };
const iterations = 5; const iterations = 10;
// Print clear structural table headers // Updated headers to include VRAM footprint info
std.debug.print("\n| Size (MB) | Phase | Time (ms) | GB/s |\n", .{}); std.debug.print("\n| Size (MB) | Phase | Time (ms) | GB/s | VRAM Peak |\n", .{});
std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{}); std.debug.print("|----------:|:------------------|-----------:|---------:|----------:|\n", .{});
for (sizes) |size| { for (sizes) |size| {
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) --- // --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
@ -69,13 +71,13 @@ pub fn main(init: std.process.Init) !void {
var min_transfer_ns: u64 = std.math.maxInt(u64); var min_transfer_ns: u64 = std.math.maxInt(u64);
var min_compute_ns: u64 = std.math.maxInt(u64); var min_compute_ns: u64 = std.math.maxInt(u64);
// Track peak VRAM usage observed during the iterations
var peak_vram_bytes: usize = 0;
for (0..iterations) |_| { for (0..iterations) |_| {
// --- 1. GPU ALLOCATION PHASE --- // --- 1. GPU ALLOCATION PHASE ---
// Assumes Vec.init or similar handles uninitialized device allocation if exposed,
// otherwise we isolate data movement directly inside the step.
const alloc_start = std.Io.Clock.awake.now(init.io); const alloc_start = std.Io.Clock.awake.now(init.io);
// (If your Vec API allocates and loads simultaneously, this step doubles as your Host->Device allocation footprint)
const a = try Vec.initLoad(&gloc, data_a); const a = try Vec.initLoad(&gloc, data_a);
defer a.deinit(); defer a.deinit();
const b = try Vec.initLoad(&gloc, data_b); const b = try Vec.initLoad(&gloc, data_b);
@ -91,6 +93,12 @@ pub fn main(init: std.process.Init) !void {
const sum = try a.run(&gloc, b, add_pip); const sum = try a.run(&gloc, b, add_pip);
defer sum.deinit(); defer sum.deinit();
// All 3 buffers (a, b, sum) are currently resident in VRAM here.
// Querying now catches the true peak allocation step.
if (gloc.allocated_vram_bytes > peak_vram_bytes) {
peak_vram_bytes = gloc.allocated_vram_bytes;
}
_ = c.wgpuDevicePoll(device.device, 1, null); _ = c.wgpuDevicePoll(device.device, 1, null);
const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io)); const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io));
@ -119,19 +127,17 @@ pub fn main(init: std.process.Init) !void {
const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0; const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0;
// Bandwidth Calculations // Bandwidth Calculations
// Alloc phase moves 2 buffers worth of data from Host -> GPU
const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0); const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0);
// Compute phase performs 2 reads and 1 write completely on VRAM
const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0); const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0);
// Transfer phase pulls 1 buffer back from GPU -> Host
const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0); const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0);
// Print Results per Size Block // Convert Peak VRAM bytes to Megabytes for clean display
std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, alloc_ms, alloc_gb_s }); const peak_vram_mb = @as(f64, @floatFromInt(peak_vram_bytes)) / (1024.0 * 1024.0);
std.debug.print("| | 2. Compute | {d:10.3} | {d:8.2} |\n", .{ compute_ms, compute_gb_s });
std.debug.print("| | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ transfer_ms, transfer_gb_s }); // Print Results per Size Block with VRAM column aligned
std.debug.print("|-----------|-------------------|------------|---------:|\n", .{}); std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} | |\n", .{ mb, alloc_ms, alloc_gb_s });
std.debug.print("| | 2. Compute | {d:10.3} | {d:8.2} | {d:7.2} MB|\n", .{ compute_ms, compute_gb_s, peak_vram_mb });
std.debug.print("| | 3. Transfer (D->H)| {d:10.3} | {d:8.2} | |\n", .{ transfer_ms, transfer_gb_s });
std.debug.print("|-----------|-------------------|------------|---------:|----------:|\n", .{});
} }
} }