const std = @import("std");
const gpu = @import("gpu");

const GpuDevice = gpu.GpuDevice;
const GpuArenaAllocator = gpu.GpuArenaAllocator;
const GpuAllocator = gpu.GpuAllocator;
const GpuBuffer = gpu.GpuBuffer;
const GpuCompute = gpu.GpuCompute;

pub const std_options = std.Options{ .log_level = .info };

/// Minimal f16 vector backed by a single GPU buffer.
///
/// `len` is the element count; the buffer is created with room for
/// `len * @sizeOf(f16)` bytes. Every `Vec` returned by an `init*` function or
/// by `run` is owned by the caller and must be released with `deinit`.
const Vec = struct {
    buf: GpuBuffer,
    len: usize,

    /// Creates a vector of `len` elements using `glloc`.
    /// The buffer is requested with Storage, CopyDst and CopySrc usage.
    /// NOTE(review): presumably the contents start zeroed, as the name
    /// suggests; that depends on `GpuBuffer.init` - confirm.
    pub fn initZero(glloc: GpuAllocator, len: usize) !Vec {
        return .{
            .buf = try GpuBuffer.init(glloc, .{
                .size = len * @sizeOf(f16),
                .usage = .initMany(&.{ .Storage, .CopyDst, .CopySrc }),
            }),
            .len = len,
        };
    }

    /// Creates a vector of `data.len` elements and uploads `data` into it.
    /// If the upload fails, the freshly created buffer is released before
    /// the error is returned.
    pub fn initLoad(glloc: GpuAllocator, data: []const f16) !Vec {
        const vec = try initZero(glloc, data.len);
        // Without this, a failing `load` would leak the buffer created above.
        errdefer vec.deinit();
        try vec.load(data);
        return vec;
    }

    /// Releases the underlying buffer.
    pub fn deinit(self: Vec) void {
        self.buf.deinit();
    }

    /// CPU to GPU: uploads `data` into the buffer.
    pub fn load(self: Vec, data: []const f16) !void {
        try self.buf.load(f16, data);
    }

    /// Runs `process` with the buffers `(self, other, result)` and returns
    /// the newly created `result` vector, which the caller owns.
    /// Both inputs must have the same length (checked by assertion).
    pub fn run(self: Vec, glloc: GpuAllocator, other: Vec, process: GpuCompute) !Vec {
        std.debug.assert(self.len == other.len);

        const result = try Vec.initZero(glloc, self.len);
        errdefer result.deinit();

        try process.run(glloc, .{ self.buf, other.buf, result.buf });
        return result;
    }

    /// GPU to CPU: reads the buffer back as f16 values using `alloc`.
    /// The benchmark below frees the returned slice with the same allocator.
    pub fn read(self: Vec, alloc: std.mem.Allocator) ![]f16 {
        return self.buf.read(alloc, f16);
    }
};

/// Converts a nanosecond count to milliseconds.
fn nsToMs(ns: u64) f64 {
    return @as(f64, @floatFromInt(ns)) / 1_000_000.0;
}

/// Throughput in GB/s (10^9 bytes per second) for `bytes` moved in `ns` nanoseconds.
/// A zero duration yields infinity rather than trapping (float division).
fn gigabytesPerSecond(bytes: f64, ns: u64) f64 {
    return (bytes / 1_000_000_000.0) / (@as(f64, @floatFromInt(ns)) / 1_000_000_000.0);
}

/// Benchmarks element-wise f16 vector addition on the GPU.
///
/// For each size, the three phases (allocate + upload, compute, read back)
/// are timed over several iterations; the best time per phase is reported
/// together with the largest `allocated_vram_bytes` value observed.
pub fn main(init: std.process.Init) !void {
    const device = try GpuDevice.init(.{ .vram_bytes_limit = 4 * 1024 * 1024 * 1024 });
    defer device.deinit();

    var grena = GpuArenaAllocator.init(init.gpa, device.gpuAllocator());
    defer grena.deinit();
    const glloc = grena.gpuAllocator();

    const add_pip = try GpuCompute.init(glloc, @embedFile("shaders/add.wgsl"), .{
        .bindings = &.{
            .{ .element_size = @sizeOf(f16) },
            .{ .element_size = @sizeOf(f16) },
            .{ .element_size = @sizeOf(f16) },
        },
    });
    defer add_pip.deinit();

    const allocator = init.gpa;

    // Element counts to benchmark (not bytes).
    const sizes = [_]usize{
        1,
        256,
        1024,
        4 * 1024,
        4 * 4 * 1024,
        4 * 4 * 4 * 1024,
        4 * 4 * 4 * 4 * 1024,
        1024 * 1024,
        4 * 1024 * 1024,
        4 * 4 * 1024 * 1024,
        4 * 4 * 4 * 1024 * 1024,
        // Larger sizes are currently disabled:
        // 4 * 4 * 4 * 4 * 1024 * 1024,
        // 4 * 4 * 4 * 4 * 4 * 1024 * 1024,
    };
    const iterations = 10;

    // Cell padding matches the column widths of the separator rows.
    std.debug.print("\n| Size (MB) | Phase             |  Time (ms) |     GB/s | VRAM Peak |\n", .{});
    std.debug.print("|----------:|:------------------|-----------:|---------:|----------:|\n", .{});

    for (sizes) |size| {
        // --- Host preparation (outside the timed iterations) ---
        const data_a = try allocator.alloc(f16, size);
        defer allocator.free(data_a);
        const data_b = try allocator.alloc(f16, size);
        defer allocator.free(data_b);

        for (0..size) |i| {
            data_a[i] = @floatFromInt(i);
            data_b[i] = @floatFromInt(size - 1 - i);
        }

        // Best (smallest) time per phase across all iterations.
        var min_alloc_ns: u64 = std.math.maxInt(u64);
        var min_transfer_ns: u64 = std.math.maxInt(u64);
        var min_compute_ns: u64 = std.math.maxInt(u64);

        // Largest VRAM usage observed during the iterations.
        var peak_vram_bytes: u64 = 0;

        for (0..iterations) |_| {
            // --- 1. GPU allocation + upload phase ---
            const alloc_start = std.Io.Clock.awake.now(init.io);

            const a = try Vec.initLoad(glloc, data_a);
            defer a.deinit();
            const b = try Vec.initLoad(glloc, data_b);
            defer b.deinit();

            const alloc_duration = alloc_start.durationTo(std.Io.Clock.awake.now(init.io));
            const alloc_ns: u64 = @intCast(alloc_duration.toNanoseconds());
            min_alloc_ns = @min(min_alloc_ns, alloc_ns);

            // --- 2. Compute phase ---
            // NOTE(review): this interval also covers the allocation of the
            // result buffer inside `Vec.run`.
            const compute_start = std.Io.Clock.awake.now(init.io);

            const sum = try a.run(glloc, b, add_pip);
            defer sum.deinit();

            // a, b and sum are all alive here, so this is where the
            // per-iteration VRAM usage is sampled.
            if (grena.allocated_vram_bytes > peak_vram_bytes) {
                peak_vram_bytes = grena.allocated_vram_bytes;
            }

            // NOTE(review): presumably waits for the submitted GPU work to
            // finish; confirm against GpuDevice.poll.
            device.poll();

            const compute_duration = compute_start.durationTo(std.Io.Clock.awake.now(init.io));
            const compute_ns: u64 = @intCast(compute_duration.toNanoseconds());
            min_compute_ns = @min(min_compute_ns, compute_ns);

            // --- 3. Transfer phase (device -> host) ---
            const transfer_start = std.Io.Clock.awake.now(init.io);

            const out = try sum.read(allocator);
            defer allocator.free(out);

            const transfer_duration = transfer_start.durationTo(std.Io.Clock.awake.now(init.io));
            const transfer_ns: u64 = @intCast(transfer_duration.toNanoseconds());
            min_transfer_ns = @min(min_transfer_ns, transfer_ns);
        }

        // --- Metrics ---
        const f_size: f64 = @floatFromInt(size);
        const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f16)));
        // The size column uses binary megabytes (2^20 bytes).
        const mb = element_bytes / (1024.0 * 1024.0);

        const alloc_ms = nsToMs(min_alloc_ns);
        const compute_ms = nsToMs(min_compute_ns);
        const transfer_ms = nsToMs(min_transfer_ns);

        // Bytes per phase: upload moves 2 vectors, compute touches 3
        // (two inputs, one output), read-back moves 1.
        const alloc_gb_s = gigabytesPerSecond(element_bytes * 2.0, min_alloc_ns);
        const compute_gb_s = gigabytesPerSecond(element_bytes * 3.0, min_compute_ns);
        const transfer_gb_s = gigabytesPerSecond(element_bytes * 1.0, min_transfer_ns);

        const peak_vram_mb = @as(f64, @floatFromInt(peak_vram_bytes)) / (1024.0 * 1024.0);

        // One three-row group per size; the VRAM peak is shown on the compute row.
        std.debug.print("| {d:9.2} | 1. GPU Alloc/Load | {d:10.3} | {d:8.2} |           |\n", .{ mb, alloc_ms, alloc_gb_s });
        std.debug.print("|           | 2. Compute        | {d:10.3} | {d:8.2} | {d:7.2} MB|\n", .{ compute_ms, compute_gb_s, peak_vram_mb });
        std.debug.print("|           | 3. Transfer (D->H)| {d:10.3} | {d:8.2} |           |\n", .{ transfer_ms, transfer_gb_s });
        std.debug.print("|-----------|-------------------|------------|---------:|----------:|\n", .{});
    }
}