Added optional label in definitions and added logs when alloc, free and device info

2026-05-21 10:26:20 +02:00 · 2026-05-21 10:26:20 +02:00 · d503ce7dea
commit d503ce7dea
parent 69f9cad2c1
11 changed files with 203 additions and 73 deletions
--- a/examples/bench_cp.zig
+++ b/examples/bench_cp.zig
@ -6,6 +6,8 @@ const GpuAllocator = gpu.GpuAllocator;
 const GpuBuffer = gpu.GpuBuffer;
 const GpuCompute = gpu.GpuCompute;
 pub const std_options = std.Options{ .log_level = .info };
 /// Minimal implementation of a f16 Vector
 const Vec = struct {
    buf: GpuBuffer,
@ -14,11 +16,10 @@ const Vec = struct {
    // Changed: gloc is passed by value (const)
    pub fn initZero(gloc: GpuAllocator, len: usize) !Vec {
        return .{
-            .buf = try GpuBuffer.init(
+            .buf = try GpuBuffer.init(gloc, .{
-                gloc,
+                .size = len * @sizeOf(f16),
-                len * @sizeOf(f16),
+                .usage = .initMany(&.{ .Storage, .CopyDst, .CopySrc }),
-                .initMany(&.{ .Storage, .CopyDst, .CopySrc }),
+            }),
            ),
            .len = len,
        };
    }
--- a/examples/circle.zig
+++ b/examples/circle.zig
@ -18,7 +18,9 @@ pub fn main(init: std.process.Init) !void {
    defer device.deinit();
    // 2. Init VRAM Arena
-    const gloc = device.gpuAllocator();
+    var grena = GpuArenaAllocator.init(allocator, device.gpuAllocator());
    defer grena.deinit();
    const gloc = grena.gpuAllocator();
    // 3. Load Render Pipeline
    const circle_rp = try GpuRender.init(
--- a/examples/compute.zig
+++ b/examples/compute.zig
@ -21,11 +21,14 @@ pub fn main(init: std.process.Init) !void {
    const add_cp = try GpuCompute.init(
        gloc,
        @embedFile("shaders/add.wgsl"),
-        .{ .bindings = &.{
+        .{
-            .{ .element_size = @sizeOf(f16) },
+            .label = "add",
-            .{ .element_size = @sizeOf(f16) },
+            .bindings = &.{
-            .{ .element_size = @sizeOf(f16) },
+                .{ .element_size = @sizeOf(f16) },
-        } },
+                .{ .element_size = @sizeOf(f16) },
                .{ .element_size = @sizeOf(f16) },
            },
        },
    );
    // 4. Setup CPU data
@ -42,9 +45,9 @@ pub fn main(init: std.process.Init) !void {
    // 5. Initialize raw GPU Buffers
    const byte_size = len * @sizeOf(f16);
-    const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
+    const buf_a = try GpuBuffer.init(gloc, .{ .label = "a", .size = byte_size, .usage = .initMany(&.{ .Storage, .CopyDst, .CopySrc }) });
-    const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
+    const buf_b = try GpuBuffer.init(gloc, .{ .label = "b", .size = byte_size, .usage = .initMany(&.{ .Storage, .CopyDst, .CopySrc }) });
-    const buf_out = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
+    const buf_out = try GpuBuffer.init(gloc, .{ .label = "out", .size = byte_size, .usage = .initMany(&.{ .Storage, .CopyDst, .CopySrc }) });
    // Note: Buffers are safely tied to the GpuArenaAllocator which will automatically
    // release them at the end. You can also manually call buf_x.deinit() if desired.
--- a/src/GpuArenaAllocator.zig
+++ b/src/GpuArenaAllocator.zig
@ -3,6 +3,7 @@ const GpuDevice = @import("GpuDevice.zig");
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 const c = @import("utils.zig").c;
 const viewStr = @import("utils.zig").viewStr;
 child_allocator: GpuAllocator, // I use Zig naming child_allocator, but that should be a parent for me. Likely something idk
 tracked_buffers: std.AutoHashMap(c.WGPUBuffer, c.WGPUBufferDescriptor),
@ -24,30 +25,40 @@ pub fn init(cpu_allocator: std.mem.Allocator, child_allocator: GpuAllocator) @Th
 }
 pub fn deinit(self: *@This()) void {
    std.log.debug("Freeing GpuArenaAllocator (Used VRAM: {}/{} MB)", .{
        self.allocated_vram_bytes / 1024 / 1024,
        self.child_allocator.device.def.vram_bytes_limit / 1024 / 1024,
    });
    var it_buffer = self.tracked_buffers.keyIterator();
    while (it_buffer.next()) |buf_ptr|
-        self.child_allocator.freeBuffer(buf_ptr.*);
+        freeBuffer(self, buf_ptr.*);
    self.tracked_buffers.deinit();
    var it_tex = self.tracked_textures.keyIterator();
    while (it_tex.next()) |buf_ptr|
-        self.child_allocator.freeTexture(buf_ptr.*);
+        freeTexture(self, buf_ptr.*);
    self.tracked_textures.deinit();
    var it_view = self.tracked_views.keyIterator();
    while (it_view.next()) |buf_ptr|
-        self.child_allocator.freeTextureView(buf_ptr.*);
+        freeTextureView(self, buf_ptr.*);
    self.tracked_views.deinit();
    var it_render = self.tracked_renders.keyIterator();
    while (it_render.next()) |buf_ptr|
-        self.child_allocator.freeRenderPipeline(buf_ptr.*);
+        freeRenderPipeline(self, buf_ptr.*);
    self.tracked_renders.deinit();
    var it_compute = self.tracked_computes.keyIterator();
    while (it_compute.next()) |buf_ptr|
-        self.child_allocator.freeComputePipeline(buf_ptr.*);
+        freeComputePipeline(self, buf_ptr.*);
    self.tracked_computes.deinit();
    std.log.debug("Freed GpuArenaAllocator (Used VRAM: {}/{} MB)", .{
        self.allocated_vram_bytes / 1024 / 1024,
        self.child_allocator.device.def.vram_bytes_limit / 1024 / 1024,
    });
 }
 /// Returns the type-erased immutable interface wrapper
@ -79,6 +90,14 @@ fn allocBuffer(ctx: *anyopaque, desc: c.WGPUBufferDescriptor) anyerror!c.WGPUBuf
    const raw = try self.child_allocator.allocBuffer(desc);
    self.tracked_buffers.putAssumeCapacity(raw, desc);
    self.allocated_vram_bytes += desc.size;
    std.log.debug("Allocated Buffer '{s}': {d} B (Total VRAM: {}/{} MB)", .{
        viewStr(desc.label),
        desc.size,
        self.allocated_vram_bytes / 1024 / 1024,
        self.child_allocator.device.def.vram_bytes_limit / 1024 / 1024,
    });
    return raw;
 }
@ -87,6 +106,12 @@ fn freeBuffer(ctx: *anyopaque, raw: c.WGPUBuffer) void {
    if (self.tracked_buffers.fetchRemove(raw)) |kv| {
        self.child_allocator.freeBuffer(raw);
        self.allocated_vram_bytes -= kv.value.size;
        std.log.debug("Freed Buffer '{s}' (Total VRAM: {}/{} MB)", .{
            viewStr(kv.value.label),
            self.allocated_vram_bytes / 1024 / 1024,
            self.child_allocator.device.def.vram_bytes_limit / 1024 / 1024,
        });
    }
 }
@ -97,13 +122,21 @@ fn allocTexture(ctx: *anyopaque, desc: c.WGPUTextureDescriptor) anyerror!c.WGPUT
    const format: GpuTextureFormat = @enumFromInt(desc.format);
    const bytes_size = desc.size.width * desc.size.height * format.bytesPerPixel();
-    if (bytes_size + self.allocated_vram_bytes > self.child_allocator.device.config.vram_bytes_limit)
+    if (bytes_size + self.allocated_vram_bytes > self.child_allocator.device.def.vram_bytes_limit)
        return error.ExceedsVramBudget;
    const raw = try self.child_allocator.allocTexture(desc);
    self.tracked_textures.putAssumeCapacity(raw, desc);
    self.allocated_vram_bytes += bytes_size;
    std.log.debug("Allocated Texture '{s}': {d} B (Total VRAM: {}/{} MB)", .{
        viewStr(desc.label),
        bytes_size,
        self.allocated_vram_bytes / 1024 / 1024,
        self.child_allocator.device.def.vram_bytes_limit / 1024 / 1024,
    });
    return raw;
 }
@ -117,6 +150,12 @@ fn freeTexture(ctx: *anyopaque, raw: c.WGPUTexture) void {
        const format: GpuTextureFormat = @enumFromInt(desc.format);
        const bytes_size = desc.size.width * desc.size.height * format.bytesPerPixel();
        self.allocated_vram_bytes -= bytes_size;
        std.log.debug("Freed Texture '{s}' (Total VRAM: {}/{} MB)", .{
            viewStr(desc.label),
            self.allocated_vram_bytes / 1024 / 1024,
            self.child_allocator.device.def.vram_bytes_limit / 1024 / 1024,
        });
    }
 }
@ -125,13 +164,17 @@ fn allocTextureView(ctx: *anyopaque, texture: c.WGPUTexture, desc: c.WGPUTexture
    try self.tracked_views.ensureTotalCapacity(self.tracked_views.count() + 1);
    const raw = try self.child_allocator.allocTextureView(texture, desc);
    self.tracked_views.putAssumeCapacity(raw, desc);
    std.log.debug("Allocated Texture View '{s}'", .{viewStr(desc.label)});
    return raw;
 }
 fn freeTextureView(ctx: *anyopaque, raw: c.WGPUTextureView) void {
    const self: *@This() = @ptrCast(@alignCast(ctx));
-    if (self.tracked_views.remove(raw))
+    if (self.tracked_views.fetchRemove(raw)) |kv| {
        self.child_allocator.freeTextureView(raw);
        const desc = kv.value;
        std.log.debug("Freed Texture View '{s}'", .{viewStr(desc.label)});
    }
 }
 fn allocRenderPipeline(ctx: *anyopaque, desc: c.WGPURenderPipelineDescriptor) anyerror!c.WGPURenderPipeline {
@ -139,13 +182,17 @@ fn allocRenderPipeline(ctx: *anyopaque, desc: c.WGPURenderPipelineDescriptor) an
    try self.tracked_renders.ensureTotalCapacity(self.tracked_renders.count() + 1);
    const raw = try self.child_allocator.allocRenderPipeline(desc);
    self.tracked_renders.putAssumeCapacity(raw, desc);
    std.log.debug("Allocated Render Pipeline '{s}'", .{viewStr(desc.label)});
    return raw;
 }
 fn freeRenderPipeline(ctx: *anyopaque, raw: c.WGPURenderPipeline) void {
    const self: *@This() = @ptrCast(@alignCast(ctx));
-    if (self.tracked_renders.remove(raw))
+    if (self.tracked_renders.fetchRemove(raw)) |kv| {
        self.child_allocator.freeRenderPipeline(raw);
        const desc = kv.value;
        std.log.debug("Freed Render Pipeline '{s}'", .{viewStr(desc.label)});
    }
 }
 fn allocComputePipeline(ctx: *anyopaque, desc: c.WGPUComputePipelineDescriptor) anyerror!c.WGPUComputePipeline {
@ -153,11 +200,15 @@ fn allocComputePipeline(ctx: *anyopaque, desc: c.WGPUComputePipelineDescriptor)
    try self.tracked_computes.ensureTotalCapacity(self.tracked_computes.count() + 1);
    const raw = try self.child_allocator.allocComputePipeline(desc);
    self.tracked_computes.putAssumeCapacity(raw, desc);
    std.log.debug("Allocated Compute Pipeline '{s}'", .{viewStr(desc.label)});
    return raw;
 }
 fn freeComputePipeline(ctx: *anyopaque, raw: c.WGPUComputePipeline) void {
    const self: *@This() = @ptrCast(@alignCast(ctx));
-    if (self.tracked_computes.remove(raw))
+    if (self.tracked_computes.fetchRemove(raw)) |kv| {
        self.child_allocator.freeComputePipeline(raw);
        const desc = kv.value;
        std.log.debug("Freed Compute Pipeline '{s}'", .{viewStr(desc.label)});
    }
 }
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@ -1,6 +1,7 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
 const svOpt = @import("utils.zig").svOpt;
 raw: c.WGPUBuffer,
 size: u64,
@ -21,15 +22,25 @@ const BufferUsage = enum(u64) {
    QueryResolve = 0x0000000000000200,
 };
-pub fn init(gloc: GpuAllocator, size: u64, usage: std.EnumSet(BufferUsage)) !@This() {
+const GpuBufferDef = struct {
    label: ?[]const u8 = null,
    size: u64,
    usage: std.EnumSet(BufferUsage),
 };
 pub fn init(gloc: GpuAllocator, def: GpuBufferDef) !@This() {
    var use: u64 = 0;
-    var iter = usage.iterator();
+    var iter = def.usage.iterator();
    while (iter.next()) |flag| use |= @intFromEnum(flag);
    // Automatically align the buffer size forward to a multiple of 4 bytes under the hood
-    const aligned_size = std.mem.alignForward(u64, size, 4);
+    const aligned_size = std.mem.alignForward(u64, def.size, 4);
-    const raw_handle = try gloc.allocBuffer(.{ .size = aligned_size, .usage = use });
+    const raw_handle = try gloc.allocBuffer(.{
        .size = aligned_size,
        .usage = use,
        .label = svOpt(def.label),
    });
    return .{
        .raw = raw_handle,
        .size = aligned_size,
@ -91,11 +102,11 @@ pub fn load(
 pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T {
    const out = try alloc.alloc(T, @divExact(self.size, @sizeOf(T)));
-    const staging = try init(
+    const staging = try init(self.gloc, .{
-        self.gloc,
+        .size = self.size,
-        self.size,
+        .usage = .initMany(&.{ .MapRead, .CopyDst }),
-        .initMany(&.{ .MapRead, .CopyDst }),
+        .label = "staging_read_buffer",
-    );
+    });
    defer staging.deinit();
    const enc = c.wgpuDeviceCreateCommandEncoder(self.gloc.device.device, null) orelse return error.Encoder;
--- a/src/GpuCompute.zig
+++ b/src/GpuCompute.zig
@ -1,5 +1,6 @@
 const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 const svOpt = @import("utils.zig").svOpt;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuDevice = @import("GpuDevice.zig");
@ -11,6 +12,7 @@ pub const Binding = struct {
 };
 pub const ComputeDef = struct {
    label: ?[]const u8 = null,
    bindings: []const Binding,
    workgroup_size: u32 = 256,
    max_workgroups: u32 = 65535,
@ -33,7 +35,10 @@ pub fn init(gloc: GpuAllocator, wgsl: []const u8, def: ComputeDef) !@This() {
    }) orelse return error.Shader;
    defer c.wgpuShaderModuleRelease(shader);
-    const pip = try gloc.allocComputePipeline(.{ .compute = .{ .module = shader, .entryPoint = sv("main") } });
+    const pip = try gloc.allocComputePipeline(.{
        .label = svOpt(def.label),
        .compute = .{ .module = shader, .entryPoint = sv("main") },
    });
    return .{
        .gloc = gloc,
@ -108,11 +113,11 @@ pub fn run(
    defer if (info_buf) |b| b.deinit();
    if (self.def.append_info_buffer) {
-        info_buf = try GpuBuffer.init(
+        info_buf = try GpuBuffer.init(gloc, .{
-            gloc,
+            .size = @sizeOf(u32),
-            @sizeOf(u32),
+            .usage = .initMany(&.{ .Uniform, .CopyDst }),
-            .initMany(&.{ .Uniform, .CopyDst }),
+            .label = "compute_info_buffer",
-        );
+        });
        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.?.raw, 0, &elements_count, @sizeOf(u32));
        entries_buf[entry_count] = .{
--- a/src/GpuDevice.zig
+++ b/src/GpuDevice.zig
@ -1,20 +1,17 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 const svOpt = @import("utils.zig").svOpt;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 // TODO: Make Allocator more zig like
 //  - GpuDevice can return a GpuAllocator that just allocate and nothing else
 //  - From this GpuAllocator, can create a GpuArena like std.heap.ArenaAllocator.init(allocator)
 //  - Rename GpuArenaAllocator too
 const Ctx = struct {
    adapter: c.WGPUAdapter = null,
    device: c.WGPUDevice = null,
 };
-const GpuDeviceConfig = struct {
+const GpuDeviceDef = struct {
    label: ?[]const u8 = null,
    /// VRAM limit. Default 2 GB
    vram_bytes_limit: u64 = 2 * 1024 * 1024 * 1024,
    power_preference: enum(c_uint) {
@ -31,9 +28,9 @@ device: c.WGPUDevice,
 queue: c.WGPUQueue,
 limits: c.WGPULimits,
-config: GpuDeviceConfig,
+def: GpuDeviceDef,
-pub fn init(config: GpuDeviceConfig) !@This() {
+pub fn init(def: GpuDeviceDef) !@This() {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
@ -42,13 +39,25 @@ pub fn init(config: GpuDeviceConfig) !@This() {
    var ctx = Ctx{};
    _ = c.wgpuInstanceRequestAdapter(
        instance,
-        &.{ .powerPreference = @intFromEnum(config.power_preference) },
+        &.{ .powerPreference = @intFromEnum(def.power_preference) },
        .{ .callback = onAdapter, .userdata1 = &ctx },
    );
    c.wgpuInstanceProcessEvents(instance);
    const adapter = ctx.adapter orelse return error.NoAdapter;
    errdefer c.wgpuAdapterRelease(adapter);
    var adapter_info = std.mem.zeroes(c.WGPUAdapterInfo);
    _ = c.wgpuAdapterGetInfo(adapter, &adapter_info);
    std.log.info("=== WebGPU Device Initialized ===", .{});
    if (adapter_info.device.length > 0 and adapter_info.device.data != null) {
        std.log.info("  Device Name  : {s}", .{adapter_info.device.data[0..adapter_info.device.length]});
    }
    if (adapter_info.architecture.length > 0 and adapter_info.architecture.data != null) {
        std.log.info("  Architecture : {s}", .{adapter_info.architecture.data[0..adapter_info.architecture.length]});
    }
    std.log.info("  Backend Type : {d}", .{adapter_info.backendType});
    var supported_features = std.mem.zeroes(c.WGPUSupportedFeatures);
    c.wgpuAdapterGetFeatures(adapter, &supported_features);
@ -56,6 +65,11 @@ pub fn init(config: GpuDeviceConfig) !@This() {
    supported_limits.nextInChain = null;
    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
    std.log.info("  Max Buf Size : {d} MB", .{supported_limits.maxBufferSize / 1024 / 1024});
    std.log.info("  Max Storage  : {d} MB", .{supported_limits.maxStorageBufferBindingSize / 1024 / 1024});
    std.log.info("  Max Workgroup: X: {d}, Y: {d}, Z: {d}", .{ supported_limits.maxComputeWorkgroupSizeX, supported_limits.maxComputeWorkgroupSizeY, supported_limits.maxComputeWorkgroupSizeZ });
    std.log.info("  VRAM Budget  : {d} MB", .{def.vram_bytes_limit / 1024 / 1024});
    var has_f16 = false;
    for (0..supported_features.featureCount) |i| {
        if (supported_features.features[i] == c.WGPUFeatureName_ShaderF16) {
@ -63,6 +77,8 @@ pub fn init(config: GpuDeviceConfig) !@This() {
            break;
        }
    }
    std.log.info("  Shader F16   : {}", .{has_f16});
    std.log.info("=================================", .{});
    var feature_buf = [_]c.WGPUFeatureName{c.WGPUFeatureName_ShaderF16};
    const required_features: []const c.WGPUFeatureName =
@ -70,7 +86,7 @@ pub fn init(config: GpuDeviceConfig) !@This() {
    const device_descriptor = c.WGPUDeviceDescriptor{
        .nextInChain = null,
-        .label = sv("TensorCompilerDevice"),
+        .label = svOpt(def.label),
        .requiredFeatureCount = required_features.len,
        .requiredFeatures = if (required_features.len > 0) required_features.ptr else null,
        .requiredLimits = &supported_limits,
@ -89,7 +105,7 @@ pub fn init(config: GpuDeviceConfig) !@This() {
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
        .limits = supported_limits,
-        .config = config,
+        .def = def,
    };
 }
--- a/src/GpuRender.zig
+++ b/src/GpuRender.zig
@ -1,6 +1,7 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 const svOpt = @import("utils.zig").svOpt;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuDevice = @import("GpuDevice.zig");
@ -12,6 +13,7 @@ pub const Binding = struct {
 };
 pub const GpuRenderDef = struct {
    label: ?[]const u8 = null,
    bindings: []const Binding = &.{},
    /// The surface texture format we are rendering to (e.g., BGRA8Unorm)
    texture_format: GpuTextureFormat,
@ -68,6 +70,7 @@ pub fn init(gloc: GpuAllocator, wgsl: []const u8, def: GpuRenderDef) !@This() {
    // 3. Compile the Complete Render Pipeline
    const pip = try gloc.allocRenderPipeline(.{
        .label = svOpt(def.label),
        .vertex = .{
            .module = shader,
            .entryPoint = sv(def.vertex_entry),
--- a/src/GpuTexture.zig
+++ b/src/GpuTexture.zig
@ -1,11 +1,13 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const svOpt = @import("utils.zig").svOpt;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 const GpuTextureUsage = @import("lib.zig").GpuTextureUsage;
 pub const GpuTextureDef = struct {
    label: ?[]const u8 = null,
    size: c.WGPUExtent3D,
    usage: std.EnumSet(GpuTextureUsage),
    format: GpuTextureFormat,
@ -21,6 +23,7 @@ pub fn init(gloc: GpuAllocator, def: GpuTextureDef) !@This() {
    while (iter.next()) |flag| use |= @intFromEnum(flag);
    const desc = c.WGPUTextureDescriptor{
        .label = svOpt(def.label),
        .usage = use,
        .dimension = c.WGPUTextureDimension_2D,
        .size = def.size,
@ -51,7 +54,11 @@ pub fn bytesSizeRow(self: @This()) u32 {
 /// Return a GpuBuffer containing a copy of the texture.
 pub fn buffCopy(self: @This(), gloc: GpuAllocator) !GpuBuffer {
-    const buf = try GpuBuffer.init(gloc, self.bytesSize(), .initMany(&.{ .CopyDst, .CopySrc }));
+    const buf = try GpuBuffer.init(gloc, .{
        .size = self.bytesSize(),
        .usage = .initMany(&.{ .CopyDst, .CopySrc }),
        .label = "texture_copy_buffer",
    });
    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
    defer c.wgpuCommandEncoderRelease(enc);
@ -102,38 +109,52 @@ pub fn load(
 ) !void {
    const bytes = data.len * @sizeOf(T);
-    if (bytes == self.size) {
+    c.wgpuQueueWriteTexture(
-        // Aligned path: direct download
+        self.gloc.device.queue,
-        c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size);
+        &.{
-    } else {
+            .texture = self.raw,
-        // Unaligned path: Split the write into an aligned chunk and a padded remainder
+            .mipLevel = 0,
-        // to support arbitrary lengths without any allocations or large stack arrays.
+            .origin = .{ .x = 0, .y = 0, .z = 0 },
-        const aligned_part = (bytes / 4) * 4;
+            .aspect = c.WGPUTextureAspect_All,
-        if (aligned_part > 0) {
+        },
-            c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, aligned_part);
+        data.ptr,
-        }
+        bytes,
-
+        &.{
-        var remainder_buf: [4]u8 = .{ 0, 0, 0, 0 };
+            .offset = 0,
-        const data_bytes = std.mem.sliceAsBytes(data);
+            .bytesPerRow = self.bytesSizeRow(),
-        @memcpy(remainder_buf[0 .. bytes - aligned_part], data_bytes[aligned_part..bytes]);
+            .rowsPerImage = self.def.size.height,
-
+        },
-        c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, aligned_part, &remainder_buf, 4);
+        &self.def.size,
-    }
+    );
 }
 // GPU to CPU
 pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T {
    const out = try alloc.alloc(T, @divExact(self.size, @sizeOf(T)));
-    const staging = try init(
+    const staging = try init(self.gloc, .{
-        self.gloc,
+        .size = self.size,
-        self.size,
+        .usage = .initMany(&.{ .MapRead, .CopyDst }),
-        .initMany(&.{ .MapRead, .CopyDst }),
+        .label = "texture_read_staging",
-    );
+    });
    defer staging.deinit();
    const enc = c.wgpuDeviceCreateCommandEncoder(self.gloc.device.device, null) orelse return error.Encoder;
-    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.raw, 0, staging.raw, 0, self.size);
+    const src_copy = c.WGPUTexelCopyTextureInfo{
        .texture = self.raw,
        .mipLevel = 0,
        .origin = .{ .x = 0, .y = 0, .z = 0 },
        .aspect = c.WGPUTextureAspect_All,
    };
    const dst_copy = c.WGPUTexelCopyBufferInfo{
        .buffer = staging.raw,
        .layout = .{
            .offset = 0,
            .bytesPerRow = self.bytesSizeRow(),
            .rowsPerImage = self.def.size.height,
        },
    };
    c.wgpuCommandEncoderCopyTextureToBuffer(enc, &src_copy, &dst_copy, &self.def.size);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandEncoderRelease(enc);
    defer c.wgpuCommandBufferRelease(cmd);
--- a/src/GpuTextureView.zig
+++ b/src/GpuTextureView.zig
@ -1,11 +1,13 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const svOpt = @import("utils.zig").svOpt;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuTexture = @import("lib.zig").GpuTexture;
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 const GpuTextureUsage = @import("lib.zig").GpuTextureUsage;
 pub const GpuViewDef = struct {
    label: ?[]const u8 = null,
    usage: std.EnumSet(GpuTextureUsage) = .empty,
    format: GpuTextureFormat = .Undefined,
 };
@ -19,6 +21,7 @@ pub fn init(gloc: GpuAllocator, texture: GpuTexture, def: GpuViewDef) !@This() {
    while (iter.next()) |flag| use |= @intFromEnum(flag);
    const raw = try gloc.allocTextureView(texture.raw, .{
        .label = svOpt(def.label),
        .format = @intFromEnum(def.format),
        .usage = use,
        .mipLevelCount = 1,
--- a/src/utils.zig
+++ b/src/utils.zig
@ -3,3 +3,17 @@ pub const c = @cImport(@cInclude("wgpu.h"));
 pub fn sv(s: []const u8) c.WGPUStringView {
    return .{ .data = s.ptr, .length = s.len };
 }
 /// Allows safely passing an optional Zig string to a WebGPU string view.
 pub fn svOpt(s: ?[]const u8) c.WGPUStringView {
    if (s) |str| return sv(str);
    return .{ .data = null, .length = 0 };
 }
 /// Helper to print a WGPUStringView in your logs.
 pub fn viewStr(view: c.WGPUStringView) []const u8 {
    if (view.data != null and view.length > 0) {
        return view.data[0..view.length];
    }
    return "unnamed";
 }