diff --git a/src/GpuAllocator.zig b/src/GpuAllocator.zig index 77df6eb..9600710 100644 --- a/src/GpuAllocator.zig +++ b/src/GpuAllocator.zig @@ -1,69 +1,24 @@ const std = @import("std"); const sh = @import("shaders.zig"); +const GpuDevice = @import("GpuDevice.zig"); const c = @import("c.zig").c; const GpuAllocator = @This(); +device: GpuDevice, cpu_allocator: std.mem.Allocator, -instance: c.WGPUInstance, -adapter: c.WGPUAdapter, -device: c.WGPUDevice, -queue: c.WGPUQueue, - tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void), - pipelines: struct { add: c.WGPUComputePipeline, }, -pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator { - const instance = c.wgpuCreateInstance( - &std.mem.zeroes(c.WGPUInstanceDescriptor), - ) orelse return error.NoInstance; - errdefer c.wgpuInstanceRelease(instance); - - var ctx = Ctx{}; - _ = c.wgpuInstanceRequestAdapter( - instance, - &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, - .{ .callback = onAdapter, .userdata1 = &ctx }, - ); - c.wgpuInstanceProcessEvents(instance); - const adapter = ctx.adapter orelse return error.NoAdapter; - errdefer c.wgpuAdapterRelease(adapter); - - // --- QUERY HARDWARE LIMITS --- - var supported_limits = std.mem.zeroes(c.WGPULimits); - supported_limits.nextInChain = null; - - // Fetch what your physical graphic card can actually handle - if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits; - - const device_descriptor = c.WGPUDeviceDescriptor{ - .nextInChain = null, - .label = sv("TensorCompilerDevice"), - .requiredFeatureCount = 0, - .requiredFeatures = null, - .requiredLimits = &supported_limits, - }; - - _ = c.wgpuAdapterRequestDevice( - adapter, - &device_descriptor, - .{ .callback = onDevice, .userdata1 = &ctx }, - ); - c.wgpuInstanceProcessEvents(instance); - const device = ctx.device orelse return error.NoDevice; - +pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator { return .{ - .cpu_allocator = 
cpu_allocator, - .instance = instance, - .adapter = adapter, .device = device, - .queue = c.wgpuDeviceGetQueue(device), + .cpu_allocator = cpu_allocator, .tracked_buffers = .init(cpu_allocator), .pipelines = .{ - .add = try buildPipeline(device, sh.SHADER_ADD), + .add = try buildPipeline(device.device, sh.SHADER_ADD), }, }; } @@ -79,11 +34,6 @@ pub fn deinit(self: *GpuAllocator) void { c.wgpuBufferRelease(buf); } self.tracked_buffers.deinit(); - - c.wgpuQueueRelease(self.queue); - c.wgpuDeviceRelease(self.device); - c.wgpuAdapterRelease(self.adapter); - c.wgpuInstanceRelease(self.instance); } pub fn registerBuffer( @@ -91,7 +41,7 @@ pub fn registerBuffer( bytes: u64, usage: c.WGPUBufferUsage, ) !c.WGPUBuffer { - const buf = c.wgpuDeviceCreateBuffer(self.device, &.{ + const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{ .usage = usage, .size = bytes, }) orelse return error.BufferAlloc; @@ -107,59 +57,6 @@ pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void { } } -// ── Internal ───────────────────────────────────────────────────────────── - -pub fn makeBuffer( - self: *GpuAllocator, - bytes: u64, - usage: c.WGPUBufferUsage, -) !c.WGPUBuffer { - return c.wgpuDeviceCreateBuffer(self.device, &.{ - .usage = usage, - .size = bytes, - }) orelse error.BufferAlloc; -} - -/// Poll until GPU work completes. Use after submit if you need CPU sync. 
-pub fn poll(self: *GpuAllocator) void { - _ = c.wgpuDevicePoll(self.device, 1, null); -} - -const Ctx = struct { - adapter: c.WGPUAdapter = null, - device: c.WGPUDevice = null, -}; - -fn onAdapter( - status: c.WGPURequestAdapterStatus, - adapter: c.WGPUAdapter, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestAdapterStatus_Success) { - std.log.err("Adapter request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.adapter = adapter; -} - -fn onDevice( - status: c.WGPURequestDeviceStatus, - device: c.WGPUDevice, - _: c.WGPUStringView, - userdata1: ?*anyopaque, - _: ?*anyopaque, -) callconv(.c) void { - if (status != c.WGPURequestDeviceStatus_Success) { - std.log.err("Device request failed (status={d})", .{status}); - return; - } - const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); - ctx.device = device; -} - fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline { var wgsl_src = c.WGPUShaderSourceWGSL{ .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL }, diff --git a/src/GpuDevice.zig b/src/GpuDevice.zig new file mode 100644 index 0000000..be4026c --- /dev/null +++ b/src/GpuDevice.zig @@ -0,0 +1,107 @@ +const std = @import("std"); +const sh = @import("shaders.zig"); +const c = @import("c.zig").c; + +const Ctx = struct { + adapter: c.WGPUAdapter = null, + device: c.WGPUDevice = null, +}; + +const GpuAllocator = @This(); + +instance: c.WGPUInstance, +adapter: c.WGPUAdapter, +device: c.WGPUDevice, +queue: c.WGPUQueue, + +pub fn init() !GpuAllocator { + const instance = c.wgpuCreateInstance( + &std.mem.zeroes(c.WGPUInstanceDescriptor), + ) orelse return error.NoInstance; + errdefer c.wgpuInstanceRelease(instance); + + var ctx = Ctx{}; + _ = c.wgpuInstanceRequestAdapter( + instance, + &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance }, + .{ .callback = onAdapter, .userdata1 = &ctx }, + ); + 
c.wgpuInstanceProcessEvents(instance); + const adapter = ctx.adapter orelse return error.NoAdapter; + errdefer c.wgpuAdapterRelease(adapter); + + // --- QUERY HARDWARE LIMITS --- + var supported_limits = std.mem.zeroes(c.WGPULimits); + supported_limits.nextInChain = null; + + // Fetch what your physical graphic card can actually handle + if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits; + + const device_descriptor = c.WGPUDeviceDescriptor{ + .nextInChain = null, + .label = sv("TensorCompilerDevice"), + .requiredFeatureCount = 0, + .requiredFeatures = null, + .requiredLimits = &supported_limits, + }; + + _ = c.wgpuAdapterRequestDevice( + adapter, + &device_descriptor, + .{ .callback = onDevice, .userdata1 = &ctx }, + ); + c.wgpuInstanceProcessEvents(instance); + const device = ctx.device orelse return error.NoDevice; + + return .{ + .instance = instance, + .adapter = adapter, + .device = device, + .queue = c.wgpuDeviceGetQueue(device), + }; +} + +pub fn deinit(self: GpuAllocator) void { + c.wgpuQueueRelease(self.queue); + c.wgpuDeviceRelease(self.device); + c.wgpuAdapterRelease(self.adapter); + c.wgpuInstanceRelease(self.instance); +} + +pub fn poll(self: *GpuAllocator) void { + _ = c.wgpuDevicePoll(self.device, 1, null); +} + +fn onAdapter( + status: c.WGPURequestAdapterStatus, + adapter: c.WGPUAdapter, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestAdapterStatus_Success) { + std.log.err("Adapter request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?)); + ctx.adapter = adapter; +} + +fn onDevice( + status: c.WGPURequestDeviceStatus, + device: c.WGPUDevice, + _: c.WGPUStringView, + userdata1: ?*anyopaque, + _: ?*anyopaque, +) callconv(.c) void { + if (status != c.WGPURequestDeviceStatus_Success) { + std.log.err("Device request failed (status={d})", .{status}); + return; + } + const ctx: *Ctx = 
@ptrCast(@alignCast(userdata1.?)); + ctx.device = device; +} + +fn sv(s: []const u8) c.WGPUStringView { + return .{ .data = s.ptr, .length = s.len }; +} diff --git a/src/Mat.zig b/src/Mat.zig index c3cde6f..144132a 100644 --- a/src/Mat.zig +++ b/src/Mat.zig @@ -26,7 +26,7 @@ pub fn load( c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc, ); - c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes); + c.wgpuQueueWriteBuffer(gloc.device.queue, buf.raw, 0, data.ptr, bytes); return .{ .buf = buf, .rows = rows, .cols = cols }; } @@ -74,12 +74,12 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 { ); defer staging.deinit(); - const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder; + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder; c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes); const cmd = c.wgpuCommandEncoderFinish(enc, null); defer c.wgpuCommandEncoderRelease(enc); defer c.wgpuCommandBufferRelease(cmd); - c.wgpuQueueSubmit(gloc.queue, 1, &cmd); + c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd); var mapped = false; staging.mapAsync( @@ -88,7 +88,7 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 { bytes, .{ .callback = onMapped, .userdata1 = &mapped }, ); - while (!mapped) gloc.poll(); + while (!mapped) gloc.device.poll(); const ptr: [*]const f32 = @ptrCast(@alignCast( staging.getConstMappedRange(0, bytes), @@ -137,7 +137,7 @@ fn dispatch2in1out( defer info_buf.deinit(); // Write the number of elements *in this chunk* to the uniform buffer - c.wgpuQueueWriteBuffer(gloc.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32)); + c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32)); // Bind only the sub-slice for this chunk using `.offset` and `.size` const entries = [_]c.WGPUBindGroupEntry{ @@ -164,14 +164,14 @@ 
fn submitPass( const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0); defer c.wgpuBindGroupLayoutRelease(bgl); - const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{ + const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{ .layout = bgl, .entries = entries.ptr, .entryCount = entries.len, }) orelse return error.BindGroup; defer c.wgpuBindGroupRelease(bg); - const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse + const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder; const pass = c.wgpuCommandEncoderBeginComputePass(enc, null); c.wgpuComputePassEncoderSetPipeline(pass, pipeline); @@ -190,7 +190,7 @@ fn submitPass( const cmd = c.wgpuCommandEncoderFinish(enc, null); defer c.wgpuCommandEncoderRelease(enc); defer c.wgpuCommandBufferRelease(cmd); - c.wgpuQueueSubmit(gloc.queue, 1, &cmd); + c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd); } fn ceilDiv(n: usize, d: usize) usize { diff --git a/src/main.zig b/src/main.zig index 9ed4e20..8f7748a 100644 --- a/src/main.zig +++ b/src/main.zig @@ -1,9 +1,13 @@ const std = @import("std"); +const GpuDevice = @import("GpuDevice.zig"); const GpuAllocator = @import("GpuAllocator.zig"); const Mat = @import("Mat.zig"); pub fn main(init: std.process.Init) !void { - var gloc = try GpuAllocator.init(init.gpa); + const device = try GpuDevice.init(); + defer device.deinit(); + + var gloc = try GpuAllocator.init(init.gpa, device); defer gloc.deinit(); // Define the sizes you want to benchmark @@ -15,11 +19,11 @@ pub fn main(init: std.process.Init) !void { 65536, 262144, 1024 * 1024, - 4 * 1024 * 1024, - 4 * 4 * 1024 * 1024, - 4 * 4 * 4 * 1024 * 1024, - 4 * 4 * 4 * 4 * 1024 * 1024, - 4 * 4 * 4 * 4 * 2 * 1024 * 1024, + // 4 * 1024 * 1024, + // 4 * 4 * 1024 * 1024, + // 4 * 4 * 4 * 1024 * 1024, + // 4 * 4 * 4 * 4 * 1024 * 1024, + // 4 * 4 * 4 * 4 * 2 * 1024 * 1024, }; // Print table header