Added a VRAM limit to GpuAllocator
This commit is contained in:
parent
38538fcd80
commit
1c8e12b1e6
@ -1,17 +1,17 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const GpuDevice = @import("GpuDevice.zig");
|
const GpuDevice = @import("GpuDevice.zig");
|
||||||
|
const GpuBuffer = @import("GpuBuffer.zig");
|
||||||
const c = @import("c.zig").c;
|
const c = @import("c.zig").c;
|
||||||
|
|
||||||
const GpuAllocator = @This();
|
const GpuAllocator = @This();
|
||||||
|
|
||||||
device: GpuDevice,
|
device: GpuDevice,
|
||||||
cpu_allocator: std.mem.Allocator,
|
|
||||||
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
|
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
|
||||||
|
allocated_vram_bytes: u64 = 0,
|
||||||
|
|
||||||
pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
|
pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator {
|
||||||
return .{
|
return .{
|
||||||
.device = device,
|
.device = device,
|
||||||
.cpu_allocator = cpu_allocator,
|
|
||||||
.tracked_buffers = .init(cpu_allocator),
|
.tracked_buffers = .init(cpu_allocator),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -31,18 +31,27 @@ pub fn registerBuffer(
|
|||||||
bytes: u64,
|
bytes: u64,
|
||||||
usage: c.WGPUBufferUsage,
|
usage: c.WGPUBufferUsage,
|
||||||
) !c.WGPUBuffer {
|
) !c.WGPUBuffer {
|
||||||
|
if (bytes > self.device.limits.maxBufferSize)
|
||||||
|
return error.SingleBufferExceedsLimit;
|
||||||
|
|
||||||
|
if (bytes + self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
|
||||||
|
return error.ExceedsVramBudget;
|
||||||
|
|
||||||
const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
|
const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
|
||||||
.usage = usage,
|
.usage = usage,
|
||||||
.size = bytes,
|
.size = bytes,
|
||||||
}) orelse return error.BufferAlloc;
|
}) orelse return error.BufferAlloc;
|
||||||
|
|
||||||
try self.tracked_buffers.put(buf, {});
|
try self.tracked_buffers.put(buf, {});
|
||||||
|
self.allocated_vram_bytes += bytes;
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
|
pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: GpuBuffer) void {
|
||||||
if (self.tracked_buffers.remove(buf)) {
|
if (self.tracked_buffers.remove(buf.raw)) {
|
||||||
c.wgpuBufferDestroy(buf);
|
c.wgpuBufferDestroy(buf.raw);
|
||||||
c.wgpuBufferRelease(buf);
|
c.wgpuBufferRelease(buf.raw);
|
||||||
|
self.allocated_vram_bytes -= buf.size;
|
||||||
|
self.device.poll();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -22,7 +22,7 @@ pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !GpuBuffe
|
|||||||
|
|
||||||
/// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources
|
/// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources
|
||||||
pub fn deinit(self: GpuBuffer) void {
|
pub fn deinit(self: GpuBuffer) void {
|
||||||
self.gloc.unregisterAndDestroyBuffer(self.raw);
|
self.gloc.unregisterAndDestroyBuffer(self);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Native mapAsync wrapper
|
/// Native mapAsync wrapper
|
||||||
|
|||||||
@ -12,6 +12,11 @@ instance: c.WGPUInstance,
|
|||||||
adapter: c.WGPUAdapter,
|
adapter: c.WGPUAdapter,
|
||||||
device: c.WGPUDevice,
|
device: c.WGPUDevice,
|
||||||
queue: c.WGPUQueue,
|
queue: c.WGPUQueue,
|
||||||
|
limits: c.WGPULimits,
|
||||||
|
|
||||||
|
config: struct {
|
||||||
|
vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GB
|
||||||
|
} = .{},
|
||||||
|
|
||||||
pub fn init() !GpuAllocator {
|
pub fn init() !GpuAllocator {
|
||||||
const instance = c.wgpuCreateInstance(
|
const instance = c.wgpuCreateInstance(
|
||||||
@ -57,6 +62,7 @@ pub fn init() !GpuAllocator {
|
|||||||
.adapter = adapter,
|
.adapter = adapter,
|
||||||
.device = device,
|
.device = device,
|
||||||
.queue = c.wgpuDeviceGetQueue(device),
|
.queue = c.wgpuDeviceGetQueue(device),
|
||||||
|
.limits = supported_limits,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -34,6 +34,7 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
|
|
||||||
const sizes = [_]usize{
|
const sizes = [_]usize{
|
||||||
1,
|
1,
|
||||||
|
256,
|
||||||
1024,
|
1024,
|
||||||
4 * 1024,
|
4 * 1024,
|
||||||
4 * 4 * 1024,
|
4 * 4 * 1024,
|
||||||
@ -44,13 +45,14 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
4 * 4 * 1024 * 1024,
|
4 * 4 * 1024 * 1024,
|
||||||
4 * 4 * 4 * 1024 * 1024,
|
4 * 4 * 4 * 1024 * 1024,
|
||||||
4 * 4 * 4 * 4 * 1024 * 1024,
|
4 * 4 * 4 * 4 * 1024 * 1024,
|
||||||
|
4 * 4 * 4 * 4 * 4 * 1024 * 1024,
|
||||||
};
|
};
|
||||||
|
|
||||||
const iterations = 5;
|
const iterations = 10;
|
||||||
|
|
||||||
// Print clear structural table headers
|
// Updated headers to include VRAM footprint info
|
||||||
std.debug.print("\n| Size (MB) | Phase | Time (ms) | GB/s |\n", .{});
|
std.debug.print("\n| Size (MB) | Phase | Time (ms) | GB/s | VRAM Peak |\n", .{});
|
||||||
std.debug.print("|----------:|:------------------|-----------:|---------:|\n", .{});
|
std.debug.print("|----------:|:------------------|-----------:|---------:|----------:|\n", .{});
|
||||||
|
|
||||||
for (sizes) |size| {
|
for (sizes) |size| {
|
||||||
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
|
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
|
||||||
@ -69,13 +71,13 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
var min_transfer_ns: u64 = std.math.maxInt(u64);
|
var min_transfer_ns: u64 = std.math.maxInt(u64);
|
||||||
var min_compute_ns: u64 = std.math.maxInt(u64);
|
var min_compute_ns: u64 = std.math.maxInt(u64);
|
||||||
|
|
||||||
|
// Track peak VRAM usage observed during the iterations
|
||||||
|
var peak_vram_bytes: usize = 0;
|
||||||
|
|
||||||
for (0..iterations) |_| {
|
for (0..iterations) |_| {
|
||||||
// --- 1. GPU ALLOCATION PHASE ---
|
// --- 1. GPU ALLOCATION PHASE ---
|
||||||
// Assumes Vec.init or similar handles uninitialized device allocation if exposed,
|
|
||||||
// otherwise we isolate data movement directly inside the step.
|
|
||||||
const alloc_start = std.Io.Clock.awake.now(init.io);
|
const alloc_start = std.Io.Clock.awake.now(init.io);
|
||||||
|
|
||||||
// (If your Vec API allocates and loads simultaneously, this step doubles as your Host->Device allocation footprint)
|
|
||||||
const a = try Vec.initLoad(&gloc, data_a);
|
const a = try Vec.initLoad(&gloc, data_a);
|
||||||
defer a.deinit();
|
defer a.deinit();
|
||||||
const b = try Vec.initLoad(&gloc, data_b);
|
const b = try Vec.initLoad(&gloc, data_b);
|
||||||
@ -91,6 +93,12 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
const sum = try a.run(&gloc, b, add_pip);
|
const sum = try a.run(&gloc, b, add_pip);
|
||||||
defer sum.deinit();
|
defer sum.deinit();
|
||||||
|
|
||||||
|
// All 3 buffers (a, b, sum) are currently resident in VRAM here.
|
||||||
|
// Querying now catches the true peak allocation step.
|
||||||
|
if (gloc.allocated_vram_bytes > peak_vram_bytes) {
|
||||||
|
peak_vram_bytes = gloc.allocated_vram_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
_ = c.wgpuDevicePoll(device.device, 1, null);
|
_ = c.wgpuDevicePoll(device.device, 1, null);
|
||||||
|
|
||||||
const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io));
|
const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io));
|
||||||
@ -119,19 +127,17 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0;
|
const transfer_ms = @as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000.0;
|
||||||
|
|
||||||
// Bandwidth Calculations
|
// Bandwidth Calculations
|
||||||
// Alloc phase moves 2 buffers worth of data from Host -> GPU
|
|
||||||
const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0);
|
const alloc_gb_s = (element_bytes * 2.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_alloc_ns)) / 1_000_000_000.0);
|
||||||
|
|
||||||
// Compute phase performs 2 reads and 1 write completely on VRAM
|
|
||||||
const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0);
|
const compute_gb_s = (element_bytes * 3.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_compute_ns)) / 1_000_000_000.0);
|
||||||
|
|
||||||
// Transfer phase pulls 1 buffer back from GPU -> Host
|
|
||||||
const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0);
|
const transfer_gb_s = (element_bytes * 1.0 / 1_000_000_000.0) / (@as(f64, @floatFromInt(min_transfer_ns)) / 1_000_000_000.0);
|
||||||
|
|
||||||
// Print Results per Size Block
|
// Convert Peak VRAM bytes to Megabytes for clean display
|
||||||
std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |\n", .{ mb, alloc_ms, alloc_gb_s });
|
const peak_vram_mb = @as(f64, @floatFromInt(peak_vram_bytes)) / (1024.0 * 1024.0);
|
||||||
std.debug.print("| | 2. Compute | {d:10.3} | {d:8.2} |\n", .{ compute_ms, compute_gb_s });
|
|
||||||
std.debug.print("| | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |\n", .{ transfer_ms, transfer_gb_s });
|
// Print Results per Size Block with VRAM column aligned
|
||||||
std.debug.print("|-----------|-------------------|------------|---------:|\n", .{});
|
std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} | |\n", .{ mb, alloc_ms, alloc_gb_s });
|
||||||
|
std.debug.print("| | 2. Compute | {d:10.3} | {d:8.2} | {d:7.2} MB|\n", .{ compute_ms, compute_gb_s, peak_vram_mb });
|
||||||
|
std.debug.print("| | 3. Transfer (D->H)| {d:10.3} | {d:8.2} | |\n", .{ transfer_ms, transfer_gb_s });
|
||||||
|
std.debug.print("|-----------|-------------------|------------|---------:|----------:|\n", .{});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user