diff --git a/src/GpuAllocator.zig b/src/GpuAllocator.zig
index 77df6eb..9600710 100644
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@@ -1,69 +1,24 @@
 const std = @import("std");
 const sh = @import("shaders.zig");
+const GpuDevice = @import("GpuDevice.zig");
 const c = @import("c.zig").c;
 
 const GpuAllocator = @This();
 
+device: GpuDevice,
 cpu_allocator: std.mem.Allocator,
-instance: c.WGPUInstance,
-adapter: c.WGPUAdapter,
-device: c.WGPUDevice,
-queue: c.WGPUQueue,
-
 tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
-
 pipelines: struct {
     add: c.WGPUComputePipeline,
 },
 
-pub fn init(cpu_allocator: std.mem.Allocator) !GpuAllocator {
-    const instance = c.wgpuCreateInstance(
-        &std.mem.zeroes(c.WGPUInstanceDescriptor),
-    ) orelse return error.NoInstance;
-    errdefer c.wgpuInstanceRelease(instance);
-
-    var ctx = Ctx{};
-    _ = c.wgpuInstanceRequestAdapter(
-        instance,
-        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
-        .{ .callback = onAdapter, .userdata1 = &ctx },
-    );
-    c.wgpuInstanceProcessEvents(instance);
-    const adapter = ctx.adapter orelse return error.NoAdapter;
-    errdefer c.wgpuAdapterRelease(adapter);
-
-    // --- QUERY HARDWARE LIMITS ---
-    var supported_limits = std.mem.zeroes(c.WGPULimits);
-    supported_limits.nextInChain = null;
-
-    // Fetch what your physical graphic card can actually handle
-    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
-
-    const device_descriptor = c.WGPUDeviceDescriptor{
-        .nextInChain = null,
-        .label = sv("TensorCompilerDevice"),
-        .requiredFeatureCount = 0,
-        .requiredFeatures = null,
-        .requiredLimits = &supported_limits,
-    };
-
-    _ = c.wgpuAdapterRequestDevice(
-        adapter,
-        &device_descriptor,
-        .{ .callback = onDevice, .userdata1 = &ctx },
-    );
-    c.wgpuInstanceProcessEvents(instance);
-    const device = ctx.device orelse return error.NoDevice;
-
+pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) !GpuAllocator { // keeps a by-value copy of `device`; this function no longer creates any WebGPU handles itself
     return .{
-        .cpu_allocator = cpu_allocator,
-        .instance = instance,
-        .adapter = adapter,
         .device = device,
-        .queue = c.wgpuDeviceGetQueue(device),
+        .cpu_allocator = cpu_allocator, // also handed to tracked_buffers below
         .tracked_buffers = .init(cpu_allocator),
         .pipelines = .{
-            .add = try buildPipeline(device, sh.SHADER_ADD),
+            .add = try buildPipeline(device.device, sh.SHADER_ADD), // buildPipeline takes the raw c.WGPUDevice handle, not the GpuDevice wrapper
         },
     };
 }
@@ -79,11 +34,6 @@ pub fn deinit(self: *GpuAllocator) void {
         c.wgpuBufferRelease(buf);
     }
     self.tracked_buffers.deinit();
-
-    c.wgpuQueueRelease(self.queue);
-    c.wgpuDeviceRelease(self.device);
-    c.wgpuAdapterRelease(self.adapter);
-    c.wgpuInstanceRelease(self.instance);
 }
 
 pub fn registerBuffer(
@@ -91,7 +41,7 @@ pub fn registerBuffer(
     bytes: u64,
     usage: c.WGPUBufferUsage,
 ) !c.WGPUBuffer {
-    const buf = c.wgpuDeviceCreateBuffer(self.device, &.{
+    const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
         .usage = usage,
         .size = bytes,
     }) orelse return error.BufferAlloc;
@@ -107,59 +57,6 @@ pub fn unregisterAndDestroyBuffer(self: *GpuAllocator, buf: c.WGPUBuffer) void {
     }
 }
 
-// ── Internal ─────────────────────────────────────────────────────────────
-
-pub fn makeBuffer(
-    self: *GpuAllocator,
-    bytes: u64,
-    usage: c.WGPUBufferUsage,
-) !c.WGPUBuffer {
-    return c.wgpuDeviceCreateBuffer(self.device, &.{
-        .usage = usage,
-        .size = bytes,
-    }) orelse error.BufferAlloc;
-}
-
-/// Poll until GPU work completes. Use after submit if you need CPU sync.
-pub fn poll(self: *GpuAllocator) void {
-    _ = c.wgpuDevicePoll(self.device, 1, null);
-}
-
-const Ctx = struct {
-    adapter: c.WGPUAdapter = null,
-    device: c.WGPUDevice = null,
-};
-
-fn onAdapter(
-    status: c.WGPURequestAdapterStatus,
-    adapter: c.WGPUAdapter,
-    _: c.WGPUStringView,
-    userdata1: ?*anyopaque,
-    _: ?*anyopaque,
-) callconv(.c) void {
-    if (status != c.WGPURequestAdapterStatus_Success) {
-        std.log.err("Adapter request failed (status={d})", .{status});
-        return;
-    }
-    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
-    ctx.adapter = adapter;
-}
-
-fn onDevice(
-    status: c.WGPURequestDeviceStatus,
-    device: c.WGPUDevice,
-    _: c.WGPUStringView,
-    userdata1: ?*anyopaque,
-    _: ?*anyopaque,
-) callconv(.c) void {
-    if (status != c.WGPURequestDeviceStatus_Success) {
-        std.log.err("Device request failed (status={d})", .{status});
-        return;
-    }
-    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
-    ctx.device = device;
-}
-
 fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
     var wgsl_src = c.WGPUShaderSourceWGSL{
         .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
diff --git a/src/GpuDevice.zig b/src/GpuDevice.zig
new file mode 100644
index 0000000..be4026c
--- /dev/null
+++ b/src/GpuDevice.zig
@@ -0,0 +1,130 @@
+//! Owns the WebGPU instance, adapter, device and queue handles.
+//! Create one with `init` and release it with `deinit`.
+const std = @import("std");
+const sh = @import("shaders.zig");
+const c = @import("c.zig").c;
+
+/// Scratch slot passed to the request callbacks as `userdata1`. A field that is
+/// still null after the events were processed means that request did not succeed.
+const Ctx = struct {
+    adapter: c.WGPUAdapter = null,
+    device: c.WGPUDevice = null,
+};
+
+const GpuDevice = @This();
+
+instance: c.WGPUInstance,
+adapter: c.WGPUAdapter,
+device: c.WGPUDevice,
+queue: c.WGPUQueue,
+
+/// Creates a WebGPU instance, requests a high-performance adapter, requests a
+/// device with the limits that adapter reports, and fetches the device queue.
+///
+/// Errors: `NoInstance`, `NoAdapter`, `FailedToGetAdapterLimits`, `NoDevice`,
+/// one per step that can come back empty. The instance and adapter acquired
+/// before a failing step are released again. On success, release everything
+/// with `deinit`.
+pub fn init() !GpuDevice {
+    const instance = c.wgpuCreateInstance(
+        &std.mem.zeroes(c.WGPUInstanceDescriptor),
+    ) orelse return error.NoInstance;
+    errdefer c.wgpuInstanceRelease(instance);
+
+    // The callbacks report through `ctx`; it is read right after the events
+    // have been processed.
+    var ctx = Ctx{};
+    _ = c.wgpuInstanceRequestAdapter(
+        instance,
+        &.{ .powerPreference = c.WGPUPowerPreference_HighPerformance },
+        .{ .callback = onAdapter, .userdata1 = &ctx },
+    );
+    c.wgpuInstanceProcessEvents(instance);
+    const adapter = ctx.adapter orelse return error.NoAdapter;
+    errdefer c.wgpuAdapterRelease(adapter);
+
+    // Ask the adapter for its limits and request a device with exactly those.
+    // `zeroes` already leaves `nextInChain` null.
+    var supported_limits = std.mem.zeroes(c.WGPULimits);
+    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != c.WGPUStatus_Success) {
+        return error.FailedToGetAdapterLimits;
+    }
+
+    const device_descriptor = c.WGPUDeviceDescriptor{
+        .nextInChain = null,
+        .label = sv("TensorCompilerDevice"),
+        .requiredFeatureCount = 0,
+        .requiredFeatures = null,
+        .requiredLimits = &supported_limits,
+    };
+
+    _ = c.wgpuAdapterRequestDevice(
+        adapter,
+        &device_descriptor,
+        .{ .callback = onDevice, .userdata1 = &ctx },
+    );
+    c.wgpuInstanceProcessEvents(instance);
+    const device = ctx.device orelse return error.NoDevice;
+
+    return .{
+        .instance = instance,
+        .adapter = adapter,
+        .device = device,
+        .queue = c.wgpuDeviceGetQueue(device),
+    };
+}
+
+/// Releases the queue, device, adapter and instance, in that order.
+pub fn deinit(self: GpuDevice) void {
+    c.wgpuQueueRelease(self.queue);
+    c.wgpuDeviceRelease(self.device);
+    c.wgpuAdapterRelease(self.adapter);
+    c.wgpuInstanceRelease(self.instance);
+}
+
+/// Poll until GPU work completes. Use after submit if you need CPU sync.
+/// Takes a const pointer so it can also be called on a `const` GpuDevice.
+pub fn poll(self: *const GpuDevice) void {
+    // NOTE(review): the literal 1 is presumably wgpu-native's `wait` flag; confirm against wgpu.h.
+    _ = c.wgpuDevicePoll(self.device, 1, null);
+}
+
+/// Adapter-request callback: on success stores `adapter` in the `Ctx` behind
+/// `userdata1`; on any other status logs it and leaves the slot untouched.
+fn onAdapter(
+    status: c.WGPURequestAdapterStatus,
+    adapter: c.WGPUAdapter,
+    _: c.WGPUStringView,
+    userdata1: ?*anyopaque,
+    _: ?*anyopaque,
+) callconv(.c) void {
+    if (status != c.WGPURequestAdapterStatus_Success) {
+        std.log.err("Adapter request failed (status={d})", .{status});
+        return;
+    }
+    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
+    ctx.adapter = adapter;
+}
+
+/// Device-request callback: on success stores `device` in the `Ctx` behind
+/// `userdata1`; on any other status logs it and leaves the slot untouched.
+fn onDevice(
+    status: c.WGPURequestDeviceStatus,
+    device: c.WGPUDevice,
+    _: c.WGPUStringView,
+    userdata1: ?*anyopaque,
+    _: ?*anyopaque,
+) callconv(.c) void {
+    if (status != c.WGPURequestDeviceStatus_Success) {
+        std.log.err("Device request failed (status={d})", .{status});
+        return;
+    }
+    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
+    ctx.device = device;
+}
+
+/// Wraps a Zig slice as a `WGPUStringView` (pointer + length, no copy).
+/// The view borrows `s`, so `s` must outlive any use of the result.
+fn sv(s: []const u8) c.WGPUStringView {
+    return .{ .data = s.ptr, .length = s.len };
+}
diff --git a/src/Mat.zig b/src/Mat.zig
index c3cde6f..144132a 100644
--- a/src/Mat.zig
+++ b/src/Mat.zig
@@ -26,7 +26,7 @@ pub fn load(
         c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
     );
 
-    c.wgpuQueueWriteBuffer(gloc.queue, buf.raw, 0, data.ptr, bytes);
+    c.wgpuQueueWriteBuffer(gloc.device.queue, buf.raw, 0, data.ptr, bytes);
     return .{ .buf = buf, .rows = rows, .cols = cols };
 }
 
@@ -74,12 +74,12 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
     );
     defer staging.deinit();
 
-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse return error.Encoder;
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
     c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.buf.raw, 0, staging.raw, 0, bytes);
     const cmd = c.wgpuCommandEncoderFinish(enc, null);
     defer c.wgpuCommandEncoderRelease(enc);
     defer c.wgpuCommandBufferRelease(cmd);
-    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
+    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
 
     var mapped = false;
     staging.mapAsync(
@@ -88,7 +88,7 @@ pub fn read(self: Mat, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
         bytes,
         .{ .callback = onMapped, .userdata1 = &mapped },
     );
-    while (!mapped) gloc.poll();
+    while (!mapped) gloc.device.poll();
 
     const ptr: [*]const f32 = @ptrCast(@alignCast(
         staging.getConstMappedRange(0, bytes),
@@ -137,7 +137,7 @@ fn dispatch2in1out(
         defer info_buf.deinit();
 
         // Write the number of elements *in this chunk* to the uniform buffer
-        c.wgpuQueueWriteBuffer(gloc.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
+        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
 
         // Bind only the sub-slice for this chunk using `.offset` and `.size`
         const entries = [_]c.WGPUBindGroupEntry{
@@ -164,14 +164,14 @@ fn submitPass(
     const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
     defer c.wgpuBindGroupLayoutRelease(bgl);
 
-    const bg = c.wgpuDeviceCreateBindGroup(gloc.device, &.{
+    const bg = c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
         .layout = bgl,
         .entries = entries.ptr,
         .entryCount = entries.len,
     }) orelse return error.BindGroup;
     defer c.wgpuBindGroupRelease(bg);
 
-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device, null) orelse
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
         return error.Encoder;
     const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
     c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
@@ -190,7 +190,7 @@ fn submitPass(
     const cmd = c.wgpuCommandEncoderFinish(enc, null);
     defer c.wgpuCommandEncoderRelease(enc);
     defer c.wgpuCommandBufferRelease(cmd);
-    c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
+    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
 }
 
 fn ceilDiv(n: usize, d: usize) usize {
diff --git a/src/main.zig b/src/main.zig
index 9ed4e20..8f7748a 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -1,9 +1,13 @@
 const std = @import("std");
+const GpuDevice = @import("GpuDevice.zig");
 const GpuAllocator = @import("GpuAllocator.zig");
 const Mat = @import("Mat.zig");
 
 pub fn main(init: std.process.Init) !void {
-    var gloc = try GpuAllocator.init(init.gpa);
+    const device = try GpuDevice.init();
+    defer device.deinit();
+
+    var gloc = try GpuAllocator.init(init.gpa, device);
     defer gloc.deinit();
 
     // Define the sizes you want to benchmark
@@ -15,11 +19,11 @@ pub fn main(init: std.process.Init) !void {
         65536,
         262144,
         1024 * 1024,
-        4 * 1024 * 1024,
-        4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 4 * 2 * 1024 * 1024,
+        // 4 * 1024 * 1024,
+        // 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 4 * 2 * 1024 * 1024,
     };
 
     // Print table header