11 changed files with 25 additions and 763 deletions
--- a/build.zig
+++ b/build.zig
@ -56,39 +56,4 @@ pub fn build(b: *std.Build) !void {
        const run_cmd = b.addRunArtifact(exe);
        run_step.dependOn(&run_cmd.step);
    }
    const exe = b.addExecutable(.{
        .name = "circle",
        .root_module = b.createModule(.{
            .root_source_file = b.path("src/circle.zig"),
            .target = target,
            .optimize = optimize,
            .imports = &.{},
        }),
    });
    exe.root_module.addIncludePath(b.path("libs/wgpu-native/include"));
    exe.root_module.addLibraryPath(b.path("libs/wgpu-native/lib"));
    exe.root_module.addObjectFile(b.path("libs/wgpu-native/lib/libwgpu_native.a"));
    // Platform-specific system frameworks needed by wgpu-native
    if (t.os.tag == .macos) {
        exe.root_module.linkFramework("Metal", .{});
        exe.root_module.linkFramework("QuartzCore", .{});
        exe.root_module.linkFramework("Foundation", .{});
        exe.root_module.linkFramework("CoreGraphics", .{});
    } else if (t.os.tag == .windows) {
        exe.root_module.linkSystemLibrary("d3d12", .{});
        exe.root_module.linkSystemLibrary("dxgi", .{});
        exe.root_module.linkSystemLibrary("user32", .{});
    } else {
        exe.root_module.linkSystemLibrary("vulkan", .{});
        exe.root_module.linkSystemLibrary("gcc_s", .{});
    }
    b.installArtifact(exe);
    const run_step = b.step("circle", "Run circle");
    const run_cmd = b.addRunArtifact(exe);
    run_step.dependOn(&run_cmd.step);
 }
--- a/circle.ppm
+++ b/circle.ppm
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@ -2,28 +2,18 @@ const GpuDevice = @import("GpuDevice.zig");
 const c = @import("utils.zig").c;
 // Function table supplied by a concrete allocator implementation. Every entry
 // takes the type-erased `ctx` pointer as its first argument.
 pub const VTable = struct {
-    allocBuffer: *const fn (ctx: *anyopaque, desc: c.WGPUBufferDescriptor) anyerror!c.WGPUBuffer,
+    alloc: *const fn (ctx: *anyopaque, bytes: u64, usage: c.WGPUBufferUsage) anyerror!c.WGPUBuffer,
-    freeBuffer: *const fn (ctx: *anyopaque, buf_raw: c.WGPUBuffer) void,
+    free: *const fn (ctx: *anyopaque, buf_raw: c.WGPUBuffer, size: u64) void,
    // Texture entries: creation takes the full descriptor, freeing takes only the handle.
    allocTexture: *const fn (ctx: *anyopaque, desc: c.WGPUTextureDescriptor) anyerror!c.WGPUTexture,
    freeTexture: *const fn (ctx: *anyopaque, buf_raw: c.WGPUTexture) void,
 };
 device: GpuDevice,
 ptr: *anyopaque,
 vtable: *const VTable,
-pub fn allocBuffer(self: @This(), desc: c.WGPUBufferDescriptor) !c.WGPUBuffer {
+pub fn allocBuffer(self: @This(), bytes: u64, usage: c.WGPUBufferUsage) !c.WGPUBuffer {
    // Thin forwarder: hands the request and the stored context pointer to the
    // vtable's buffer-allocation entry and propagates its error, if any.
-    return self.vtable.allocBuffer(self.ptr, desc);
+    return self.vtable.alloc(self.ptr, bytes, usage);
 }
-pub fn freeBuffer(self: @This(), buf_raw: c.WGPUBuffer) void {
+pub fn freeBuffer(self: @This(), buf_raw: c.WGPUBuffer, size: u64) void {
    // Thin forwarder to the vtable's buffer-free entry; cannot fail.
-    self.vtable.freeBuffer(self.ptr, buf_raw);
+    self.vtable.free(self.ptr, buf_raw, size);
 }
 /// Creates a texture through the backing implementation.
 /// `desc` and the stored context pointer are passed to the vtable's
 /// `allocTexture` entry; any error it reports is returned unchanged.
 pub fn allocTexture(self: @This(), desc: c.WGPUTextureDescriptor) !c.WGPUTexture {
    const create = self.vtable.allocTexture;
    return create(self.ptr, desc);
 }
 /// Returns `buf_raw` to the backing implementation through the vtable's
 /// `freeTexture` entry. Cannot fail.
 pub fn freeTexture(self: @This(), buf_raw: c.WGPUTexture) void {
    const release = self.vtable.freeTexture;
    release(self.ptr, buf_raw);
 }
--- a/src/GpuArena.zig
+++ b/src/GpuArena.zig
@ -1,34 +1,26 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 const c = @import("utils.zig").c;
 device: GpuDevice,
-tracked_buffers: std.AutoHashMap(c.WGPUBuffer, c.WGPUBufferDescriptor),
+tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
 tracked_textures: std.AutoHashMap(c.WGPUTexture, c.WGPUTextureDescriptor),
 allocated_vram_bytes: u64 = 0,
 /// Builds an arena bound to `device` with empty tracking tables.
 /// `cpu_allocator` backs both hash maps; `allocated_vram_bytes` keeps its
 /// field default.
 pub fn init(cpu_allocator: std.mem.Allocator, device: GpuDevice) @This() {
    const Self = @This();
    return Self{
        .tracked_textures = .init(cpu_allocator),
        .tracked_buffers = .init(cpu_allocator),
        .device = device,
    };
 }
 /// Tears the arena down: every buffer still tracked is destroyed and released,
 /// every texture still tracked is released, and both tracking maps are freed.
 pub fn deinit(self: *@This()) void {
-    var it_buffer = self.tracked_buffers.keyIterator();
+    var it = self.tracked_buffers.keyIterator();
-    while (it_buffer.next()) |buf_ptr| {
+    while (it.next()) |buf_ptr| {
        c.wgpuBufferDestroy(buf_ptr.*);
        c.wgpuBufferRelease(buf_ptr.*);
    }
    self.tracked_buffers.deinit();
    // Textures are only released here; no explicit destroy call is made for them.
    var it_texture = self.tracked_textures.keyIterator();
    while (it_texture.next()) |tex_ptr|
        c.wgpuTextureRelease(tex_ptr.*);
    self.tracked_textures.deinit();
 }
 /// Returns the type-erased immutable interface wrapper
@ -37,71 +29,41 @@ pub fn gpuAllocator(self: *@This()) GpuAllocator {
        .device = self.device,
        .ptr = self,
        .vtable = &.{
-            .allocBuffer = allocBuffer,
+            .alloc = alloc,
-            .freeBuffer = freeBuffer,
+            .free = free,
            .allocTexture = allocTexture,
            .freeTexture = freeTexture,
        },
    };
 }
-fn allocBuffer(ctx: *anyopaque, desc: c.WGPUBufferDescriptor) anyerror!c.WGPUBuffer {
+fn alloc(ctx: *anyopaque, bytes: u64, usage: c.WGPUBufferUsage) anyerror!c.WGPUBuffer {
    // Recover the arena from the type-erased vtable context.
    const self: *@This() = @ptrCast(@alignCast(ctx));
    // Reject the request before touching the GPU if it is larger than the
    // device's per-buffer limit or would push the arena past its VRAM budget.
-    if (desc.size > self.device.limits.maxBufferSize)
+    if (bytes > self.device.limits.maxBufferSize)
        return error.SingleBufferExceedsLimit;
-    if (desc.size + self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
+    if (bytes + self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
        return error.ExceedsVramBudget;
-    const buf = c.wgpuDeviceCreateBuffer(self.device.device, &desc) orelse return error.BufferAlloc;
+    const buf = c.wgpuDeviceCreateBuffer(self.device.device, &.{
        .usage = usage,
        .size = bytes,
    }) orelse return error.BufferAlloc;
    // If recording the handle below fails, give the buffer back instead of leaking it.
    errdefer {
        c.wgpuBufferDestroy(buf);
        c.wgpuBufferRelease(buf);
    }
-    try self.tracked_buffers.put(buf, desc);
+    try self.tracked_buffers.put(buf, {});
    // The budget counter is only bumped once the handle is successfully tracked.
-    self.allocated_vram_bytes += desc.size;
+    self.allocated_vram_bytes += bytes;
    return buf;
 }
-fn freeBuffer(ctx: *anyopaque, buf_raw: c.WGPUBuffer) void {
+fn free(ctx: *anyopaque, buf_raw: c.WGPUBuffer, size: u64) void {
    const self: *@This() = @ptrCast(@alignCast(ctx));
    // Only handles this arena is tracking are destroyed; any other handle is ignored.
-    if (self.tracked_buffers.fetchRemove(buf_raw)) |kv| {
+    if (self.tracked_buffers.remove(buf_raw)) {
        c.wgpuBufferDestroy(buf_raw);
        c.wgpuBufferRelease(buf_raw);
        // NOTE(review): the removed line subtracted the size recorded at allocation;
        // the added line subtracts the caller-supplied `size`, and nothing here checks
        // it against what was allocated. Confirm every caller passes the same value,
        // otherwise the VRAM counter drifts or underflows.
-        self.allocated_vram_bytes -= kv.value.size;
+        self.allocated_vram_bytes -= size;
    }
 }
 /// Budget estimate for the texture described by `desc`:
 /// width * height * bytes-per-pixel of its format. Array layers, mip levels
 /// and sample count are not included. The product is formed with saturating
 /// u64 arithmetic so that large extents cannot overflow the 32-bit fields.
 fn textureByteSize(desc: c.WGPUTextureDescriptor) u64 {
    const format: GpuTextureFormat = @enumFromInt(desc.format);
    const width: u64 = desc.size.width;
    const height: u64 = desc.size.height;
    const bytes_per_pixel: u64 = format.bytesPerPixel();
    return width *| height *| bytes_per_pixel;
 }
 /// Vtable entry: creates a texture, records it in `tracked_textures` and
 /// charges its estimated size against the arena's VRAM budget.
 ///
 /// Errors:
 /// - `SingleBufferExceedsLimit` when the estimate exceeds `limits.maxBufferSize`.
 /// - `ExceedsVramBudget` when the estimate would pass `config.vram_bytes_limit`.
 /// - `Texture` when the device returns no handle.
 /// - any error from inserting into the tracking map.
 fn allocTexture(ctx: *anyopaque, desc: c.WGPUTextureDescriptor) anyerror!c.WGPUTexture {
    const self: *@This() = @ptrCast(@alignCast(ctx));
    const bytes_size = textureByteSize(desc);
    if (bytes_size > self.device.limits.maxBufferSize)
        return error.SingleBufferExceedsLimit;
    if (bytes_size +| self.allocated_vram_bytes > self.device.config.vram_bytes_limit)
        return error.ExceedsVramBudget;
    const texture = c.wgpuDeviceCreateTexture(self.device.device, &desc) orelse return error.Texture;
    // Without this the handle would leak whenever the map insertion below fails.
    errdefer c.wgpuTextureRelease(texture);
    try self.tracked_textures.put(texture, desc);
    self.allocated_vram_bytes += bytes_size;
    return texture;
 }
 /// Vtable entry: releases a texture this arena is tracking and refunds the
 /// amount that `allocTexture` charged for it. Untracked handles are ignored.
 fn freeTexture(ctx: *anyopaque, texture_raw: c.WGPUTexture) void {
    const self: *@This() = @ptrCast(@alignCast(ctx));
    const entry = self.tracked_textures.fetchRemove(texture_raw) orelse return;
    c.wgpuTextureRelease(texture_raw);
    // Recomputed from the stored descriptor, so it matches what was charged.
    self.allocated_vram_bytes -= textureByteSize(entry.value);
 }
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@ -30,7 +30,7 @@ pub fn init(gloc: GpuAllocator, size: u64, usage: std.EnumSet(BufferUsage)) !@Th
    // Automatically align the buffer size forward to a multiple of 4 bytes under the hood
    const aligned_size = std.mem.alignForward(u64, size, 4);
-    const raw_handle = try gloc.allocBuffer(.{ .size = aligned_size, .usage = use });
+    const raw_handle = try gloc.allocBuffer(aligned_size, use);
    return .{
        .raw = raw_handle,
        .size = aligned_size,
@ -41,7 +41,7 @@ pub fn init(gloc: GpuAllocator, size: u64, usage: std.EnumSet(BufferUsage)) !@Th
 /// Unregisters from the parent GpuAllocator and cleanly destroys GPU resources
 pub fn deinit(self: @This()) void {
-    self.gloc.freeBuffer(self.raw);
+    self.gloc.freeBuffer(self.raw, self.size);
 }
 /// Native getConstMappedRange wrapper
--- a/src/GpuRender.zig
+++ b/src/GpuRender.zig
@ -1,167 +0,0 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuDevice = @import("GpuDevice.zig");
 /// Describes one buffer slot expected by a render pipeline.
 pub const Binding = struct {
    // NOTE(review): not read anywhere in this file; `draw` only compares the
    // number of bindings against the number of arguments. Presumably the size
    // of one element of the bound buffer; confirm the intended use.
    element_size: u32 = 0,
 };
 /// Static configuration of a render pipeline, consumed by `init` and kept on
 /// the resulting value as `def`.
 pub const RenderDef = struct {
    /// One entry per buffer that `draw` must receive; defaults to none.
    bindings: []const Binding = &.{},
    /// Format of the color target the fragment stage writes to (e.g., BGRA8Unorm).
    texture_format: c.WGPUTextureFormat,
    /// Names of the vertex and fragment entry points inside the WGSL source.
    vertex_entry: []const u8 = "vs_main",
    fragment_entry: []const u8 = "fs_main",
    /// Primitive topology; defaults to a triangle list.
    topology: c.WGPUPrimitiveTopology = c.WGPUPrimitiveTopology_TriangleList,
 };
 pip: c.WGPURenderPipeline,
 def: RenderDef,
 /// Compiles `wgsl` and builds the render pipeline described by `def`.
 ///
 /// The pipeline has a single color target of `def.texture_format` with
 /// source-alpha blending, no culling, one sample per pixel and no explicit
 /// pipeline layout. The shader module is released before returning; only the
 /// pipeline handle is kept.
 ///
 /// Errors: `Shader` when the module cannot be created, `Pipeline` when the
 /// pipeline cannot be created.
 pub fn init(device: GpuDevice, wgsl: []const u8, def: RenderDef) !@This() {
    // Shader module: the WGSL source is chained onto an otherwise default descriptor.
    var wgsl_source = c.WGPUShaderSourceWGSL{
        .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
        .code = sv(wgsl),
    };
    const module_desc = c.WGPUShaderModuleDescriptor{
        .nextInChain = @ptrCast(&wgsl_source),
    };
    const module = c.wgpuDeviceCreateShaderModule(device.device, &module_desc) orelse return error.Shader;
    defer c.wgpuShaderModuleRelease(module);

    // Fragment stage: one color target, color blended by source alpha,
    // alpha channel taken from the source as-is.
    const blend_state = c.WGPUBlendState{
        .color = .{ .operation = c.WGPUBlendOperation_Add, .srcFactor = c.WGPUBlendFactor_SrcAlpha, .dstFactor = c.WGPUBlendFactor_OneMinusSrcAlpha },
        .alpha = .{ .operation = c.WGPUBlendOperation_Add, .srcFactor = c.WGPUBlendFactor_One, .dstFactor = c.WGPUBlendFactor_Zero },
    };
    const target = c.WGPUColorTargetState{
        .format = def.texture_format,
        .blend = &blend_state,
        .writeMask = c.WGPUColorWriteMask_All,
    };
    const fragment = c.WGPUFragmentState{
        .module = module,
        .entryPoint = sv(def.fragment_entry),
        .targetCount = 1,
        .targets = &target,
    };

    // Full pipeline description; both stages come from the same module.
    const pipeline_desc = c.WGPURenderPipelineDescriptor{
        .vertex = .{
            .module = module,
            .entryPoint = sv(def.vertex_entry),
        },
        .primitive = .{
            .topology = def.topology,
            .stripIndexFormat = c.WGPUIndexFormat_Undefined,
            .frontFace = c.WGPUFrontFace_CCW,
            .cullMode = c.WGPUCullMode_None,
        },
        .multisample = .{
            .count = 1,
            .mask = 0xFFFFFFFF,
            .alphaToCoverageEnabled = 0,
        },
        .fragment = &fragment,
    };
    const pipeline = c.wgpuDeviceCreateRenderPipeline(device.device, &pipeline_desc) orelse return error.Pipeline;

    return .{
        .def = def,
        .pip = pipeline,
    };
 }
 /// Releases the pipeline handle created by `init`.
 pub fn deinit(self: @This()) void {
    const pipeline = self.pip;
    c.wgpuRenderPipelineRelease(pipeline);
 }
 /// Records and submits one render pass that draws `vertex_count` vertices
 /// (a single instance) into `target_view` using this pipeline.
 ///
 /// `args` must be a tuple of `GpuBuffer`s, one per entry of `def.bindings`.
 /// Element `i` is bound in full (offset 0, whole size) at `@binding(i)` of
 /// bind group 0. With an empty tuple no bind group is created or set.
 /// The target is cleared to (0.1, 0.1, 0.1, 1.0) before drawing.
 ///
 /// Errors: `InvalidArgumentCount` when the tuple length differs from
 /// `def.bindings.len`; `BindGroup`, `Encoder`, `RenderPass` or `CommandBuffer`
 /// when the corresponding GPU object cannot be created.
 pub fn draw(
    self: @This(),
    gloc: GpuAllocator,
    target_view: c.WGPUTextureView,
    vertex_count: u32,
    args: anytype,
 ) !void {
    const type_info = @typeInfo(@TypeOf(args));
    if (type_info != .@"struct" or !type_info.@"struct".is_tuple)
        @compileError("Expected a tuple of GpuBuffers for args. E.g. .{ uniform_buf }");

    const fields = type_info.@"struct".fields;
    if (fields.len != self.def.bindings.len)
        return error.InvalidArgumentCount;

    // The pipeline is built without an explicit layout, so a shader without
    // bindings has no group 0 at all; asking for that layout would be a
    // validation error. Only build the bind group when there is something to bind.
    const bind_group: c.WGPUBindGroup = if (fields.len == 0) null else blk: {
        // Sized from the tuple, so there is no fixed cap on the binding count.
        var entries: [fields.len]c.WGPUBindGroupEntry = undefined;
        inline for (fields, 0..) |field, i| {
            const buf = @field(args, field.name);
            if (@TypeOf(buf) != GpuBuffer) {
                @compileError("All arguments in the tuple must be of type GpuBuffer");
            }
            entries[i] = .{
                .binding = @intCast(i),
                .buffer = buf.raw,
                .offset = 0,
                .size = buf.size,
            };
        }

        const layout = c.wgpuRenderPipelineGetBindGroupLayout(self.pip, 0);
        defer c.wgpuBindGroupLayoutRelease(layout);

        break :blk c.wgpuDeviceCreateBindGroup(gloc.device.device, &.{
            .layout = layout,
            .entries = &entries,
            .entryCount = entries.len,
        }) orelse return error.BindGroup;
    };
    defer if (bind_group) |bg| c.wgpuBindGroupRelease(bg);

    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
    defer c.wgpuCommandEncoderRelease(enc);

    const color_attachment = c.WGPURenderPassColorAttachment{
        .view = target_view,
        .resolveTarget = null,
        .loadOp = c.WGPULoadOp_Clear,
        .storeOp = c.WGPUStoreOp_Store,
        .clearValue = .{ .r = 0.1, .g = 0.1, .b = 0.1, .a = 1.0 },
        .depthSlice = c.WGPU_DEPTH_SLICE_UNDEFINED,
    };
    const pass_desc = c.WGPURenderPassDescriptor{
        .colorAttachmentCount = 1,
        .colorAttachments = &color_attachment,
        .depthStencilAttachment = null,
    };

    {
        const pass = c.wgpuCommandEncoderBeginRenderPass(enc, &pass_desc) orelse return error.RenderPass;
        defer c.wgpuRenderPassEncoderRelease(pass);

        c.wgpuRenderPassEncoderSetPipeline(pass, self.pip);
        if (bind_group) |bg| {
            c.wgpuRenderPassEncoderSetBindGroup(pass, 0, bg, 0, null);
        }
        c.wgpuRenderPassEncoderDraw(pass, vertex_count, 1, 0, 0);
        c.wgpuRenderPassEncoderEnd(pass);
    }

    // Kept as the optional handle type so `&cmd` matches the C array parameter.
    const cmd: c.WGPUCommandBuffer = c.wgpuCommandEncoderFinish(enc, null) orelse return error.CommandBuffer;
    defer c.wgpuCommandBufferRelease(cmd);
    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
 }
--- a/src/GpuTexture.zig
+++ b/src/GpuTexture.zig
@ -1,137 +0,0 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 /// Texture usage flags. Each tag's integer value is a single bit; `init`
 /// ORs the values of the selected tags into one `u64` mask.
 const TextureUsage = enum(u64) {
    None = 0x0000000000000000,
    CopySrc = 0x0000000000000001,
    CopyDst = 0x0000000000000002,
    TextureBinding = 0x0000000000000004,
    StorageBinding = 0x0000000000000008,
    RenderAttachment = 0x0000000000000010,
    TransientAttachment = 0x0000000000000020,
 };
 raw: c.WGPUTexture,
 size: c.WGPUExtent3D,
 usage: c.WGPUTextureUsage,
 format: GpuTextureFormat,
 gloc: GpuAllocator,
 /// Creates a 2D texture (one mip level, one sample) of the given `format`
 /// and `size` through `gloc`, which also registers the handle.
 /// `usage` is folded into a single bit mask by OR-ing the selected flags.
 pub fn init(gloc: GpuAllocator, format: GpuTextureFormat, size: c.WGPUExtent3D, usage: std.EnumSet(TextureUsage)) !@This() {
    var usage_bits: u64 = 0;
    var flags = usage.iterator();
    while (flags.next()) |flag| {
        usage_bits |= @intFromEnum(flag);
    }

    const handle = try gloc.allocTexture(.{
        .usage = usage_bits,
        .dimension = c.WGPUTextureDimension_2D,
        .size = size,
        .format = @intCast(@intFromEnum(format)),
        .mipLevelCount = 1,
        .sampleCount = 1,
    });

    return .{
        .raw = handle,
        .size = size,
        .usage = usage_bits,
        .format = format,
        .gloc = gloc,
    };
 }
 /// Hands the texture back to the allocator it was created from, which
 /// unregisters and releases it.
 pub fn deinit(self: @This()) void {
    const owner = self.gloc;
    owner.freeTexture(self.raw);
 }
 /// Forwards to `wgpuBufferGetConstMappedRange` with `self.raw`.
 /// NOTE(review): `raw` is declared as `c.WGPUTexture` in this file, while the
 /// call is a buffer entry point. This looks carried over from the buffer
 /// wrapper; confirm whether it is meant to exist on textures at all.
 pub fn getConstMappedRange(self: @This(), offset: u64, size: u64) ?*const anyopaque {
    return c.wgpuBufferGetConstMappedRange(self.raw, offset, size);
 }
 /// Forwards to `wgpuBufferMapAsync` with `self.raw`; the call's return value
 /// is discarded and completion is reported only through `callback_info`.
 /// NOTE(review): `raw` is declared as `c.WGPUTexture` in this file, while the
 /// call is a buffer entry point. This looks carried over from the buffer
 /// wrapper; confirm.
 pub fn mapAsync(
    self: @This(),
    mode: c.WGPUMapMode,
    offset: u64,
    size: u64,
    callback_info: c.WGPUBufferMapCallbackInfo,
 ) void {
    _ = c.wgpuBufferMapAsync(self.raw, mode, offset, size, callback_info);
 }
 /// Forwards to `wgpuBufferUnmap` with `self.raw`.
 /// NOTE(review): `raw` is declared as `c.WGPUTexture` in this file, while the
 /// call is a buffer entry point. This looks carried over from the buffer
 /// wrapper; confirm.
 pub fn unmap(self: @This()) void {
    c.wgpuBufferUnmap(self.raw);
 }
 /// CPU to GPU: uploads `data` as the complete, tightly packed contents of
 /// mip level 0 of this texture.
 ///
 /// `data` is reinterpreted as bytes and must hold exactly
 /// `width * height * depthOrArrayLayers * bytesPerPixel(format)` bytes, with
 /// rows stored back to back. An empty texture is a no-op.
 ///
 /// Errors:
 /// - `UnsupportedFormat` when the format has no per-texel size
 ///   (`bytesPerPixel()` reports 0).
 /// - `SizeMismatch` when `data` does not match the texture's byte size.
 pub fn load(
    self: @This(),
    T: type,
    data: []const T,
 ) !void {
    const bytes = std.mem.sliceAsBytes(data);

    const texel_bytes: u64 = self.format.bytesPerPixel();
    if (texel_bytes == 0) return error.UnsupportedFormat;

    // Sizes are formed in u64 (saturating) so large extents cannot overflow.
    const row_bytes = texel_bytes *| self.size.width;
    const rows = @as(u64, self.size.height) *| self.size.depthOrArrayLayers;
    if (bytes.len != row_bytes *| rows) return error.SizeMismatch;
    if (bytes.len == 0) return;

    // The handle is a texture, so the upload goes through the texture write
    // entry point; the layout describes how `bytes` is arranged in memory.
    c.wgpuQueueWriteTexture(
        self.gloc.device.queue,
        &.{
            .texture = self.raw,
            .mipLevel = 0,
            .origin = .{ .x = 0, .y = 0, .z = 0 },
            .aspect = c.WGPUTextureAspect_All,
        },
        bytes.ptr,
        bytes.len,
        &.{
            .offset = 0,
            .bytesPerRow = std.math.cast(u32, row_bytes) orelse return error.SizeMismatch,
            .rowsPerImage = self.size.height,
        },
        &self.size,
    );
 }
 /// GPU to CPU: intended to copy the resource into a freshly allocated `[]T`
 /// owned by the caller, via a temporary staging resource that is mapped for
 /// reading.
 ///
 /// NOTE(review): this body looks carried over from the buffer wrapper and does
 /// not match this file's declarations:
 /// - `self.size` is a `c.WGPUExtent3D` here, yet it is used as a byte count.
 /// - `init` in this file takes (allocator, format, extent, texture usage), yet
 ///   it is called with a size and buffer-style usage flags.
 /// - `self.raw` is a `c.WGPUTexture`, yet it is passed to a buffer-to-buffer copy.
 /// Confirm the intended texture readback path before relying on this function.
 pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T {
    // NOTE(review): `out` has no errdefer, so it is not freed if a later step fails.
    const out = try alloc.alloc(T, @divExact(self.size, @sizeOf(T)));
    const staging = try init(
        self.gloc,
        self.size,
        .initMany(&.{ .MapRead, .CopyDst }),
    );
    defer staging.deinit();
    // Record and submit the copy into the staging resource.
    const enc = c.wgpuDeviceCreateCommandEncoder(self.gloc.device.device, null) orelse return error.Encoder;
    c.wgpuCommandEncoderCopyBufferToBuffer(enc, self.raw, 0, staging.raw, 0, self.size);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandEncoderRelease(enc);
    defer c.wgpuCommandBufferRelease(cmd);
    c.wgpuQueueSubmit(self.gloc.device.queue, 1, &cmd);
    // Request the mapping, then poll the device until the callback sets the flag.
    // NOTE(review): `onMapped` only sets the flag on success, so a failed
    // mapping leaves this loop without an exit condition.
    var mapped = false;
    staging.mapAsync(
        c.WGPUMapMode_Read,
        0,
        self.size,
        .{ .callback = onMapped, .userdata1 = &mapped },
    );
    while (!mapped) self.gloc.device.poll();
    const ptr: [*]const T = @ptrCast(@alignCast(
        staging.getConstMappedRange(0, self.size),
    ));
    @memcpy(out[0..out.len], ptr[0..out.len]);
    staging.unmap();
    return out;
 }
 /// Map-completion callback with C calling convention.
 /// `userdata1` must point at a `bool`; it is set to whether `status`
 /// reports success. The message and second user pointer are ignored.
 fn onMapped(
    status: c.WGPUMapAsyncStatus,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
 ) callconv(.c) void {
    const done: *bool = @ptrCast(@alignCast(userdata1.?));
    const succeeded = status == c.WGPUMapAsyncStatus_Success;
    done.* = succeeded;
 }
--- a/src/circle.zig
+++ b/src/circle.zig
@ -1,113 +0,0 @@
 const std = @import("std");
 const gpu = @import("lib.zig");
 const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 const GpuDevice = gpu.GpuDevice;
 const GpuArena = gpu.GpuArena;
 const GpuBuffer = gpu.GpuBuffer;
 const GpuRender = gpu.GpuRender;
 const GpuTexture = gpu.GpuTexture;
 /// Renders a circle into an offscreen 512x512 RGBA8 texture, copies the
 /// pixels back to the CPU and writes them to `circle.ppm` in the current
 /// working directory.
 pub fn main(init: std.process.Init) !void {
    const allocator = init.gpa;

    // GPU device plus an arena through which every GPU allocation below is made.
    const device = try GpuDevice.init(.{});
    defer device.deinit();
    var grena = GpuArena.init(allocator, device);
    defer grena.deinit();
    const gloc = grena.gpuAllocator();

    const width: u32 = 512;
    const height: u32 = 512;
    const render_format = c.WGPUTextureFormat_RGBA8Unorm;

    // Render pipeline: the shader generates its own quad, so no bindings are
    // needed and the four vertices are drawn as a triangle strip.
    const circle_rp = try GpuRender.init(
        device,
        @embedFile("shaders/circle.wgsl"),
        .{
            .bindings = &.{},
            .texture_format = render_format,
            .topology = c.WGPUPrimitiveTopology_TriangleStrip,
        },
    );
    defer circle_rp.deinit();

    // Offscreen render target; CopySrc so it can be copied out afterwards.
    const texture = try GpuTexture.init(
        gloc,
        .RGBA8Unorm,
        .{ .width = width, .height = height, .depthOrArrayLayers = 1 },
        .initMany(&.{ .RenderAttachment, .CopySrc }),
    );
    defer texture.deinit();
    const target_view = c.wgpuTextureCreateView(texture.raw, null) orelse return error.View;
    defer c.wgpuTextureViewRelease(target_view);

    // Buffer that receives the texture contents: 4 bytes per pixel (RGBA8).
    const row_bytes = width * 4;
    const buffer_bytes = row_bytes * height;
    comptime {
        // Texture-to-buffer copies require the row pitch to be a multiple of 256 bytes.
        if (row_bytes % 256 != 0) @compileError("row_bytes must be a multiple of 256");
    }
    const cpu_staging_buf = try GpuBuffer.init(gloc, buffer_bytes, .initMany(&.{ .CopyDst, .CopySrc }));
    // Released explicitly like every other resource here, instead of relying
    // on the arena teardown to pick it up.
    defer cpu_staging_buf.deinit();

    // Draw the circle into the texture view.
    try circle_rp.draw(gloc, target_view, 4, .{});

    // Copy the rendered texture into the buffer.
    const enc = c.wgpuDeviceCreateCommandEncoder(device.device, null) orelse return error.Encoder;
    defer c.wgpuCommandEncoderRelease(enc);
    const src_copy = c.WGPUTexelCopyTextureInfo{
        .texture = texture.raw,
        .mipLevel = 0,
        .origin = .{ .x = 0, .y = 0, .z = 0 },
        .aspect = c.WGPUTextureAspect_All,
    };
    const dst_copy = c.WGPUTexelCopyBufferInfo{
        .buffer = cpu_staging_buf.raw,
        .layout = .{
            .offset = 0,
            .bytesPerRow = row_bytes,
            .rowsPerImage = height,
        },
    };
    const copy_size = c.WGPUExtent3D{ .width = width, .height = height, .depthOrArrayLayers = 1 };
    c.wgpuCommandEncoderCopyTextureToBuffer(enc, &src_copy, &dst_copy, &copy_size);
    const cmd = c.wgpuCommandEncoderFinish(enc, null);
    defer c.wgpuCommandBufferRelease(cmd);
    c.wgpuQueueSubmit(device.queue, 1, &cmd);

    // Read the raw RGBA bytes back to the CPU; the slice is owned by us.
    const pixels = try cpu_staging_buf.read(allocator, u8);
    defer allocator.free(pixels);

    // Write the image out as a binary PPM so it can be opened in an image viewer.
    try savePpm(init.io, "circle.ppm", width, height, pixels);
    std.debug.print("Successfully rendered circle to circle.ppm!\n", .{});
 }
 /// Writes `rgba_pixels` to `filename` (relative to the current working
 /// directory) as a binary PPM (`P6`) image of `w` x `h` pixels.
 ///
 /// `rgba_pixels` must hold exactly `w * h` tightly packed RGBA8 pixels; the
 /// alpha byte of each pixel is dropped because PPM stores RGB only.
 ///
 /// Errors: `InvalidPixelData` when the slice length does not match `w * h * 4`,
 /// plus any file creation or write error.
 fn savePpm(io: std.Io, filename: []const u8, w: u32, h: u32, rgba_pixels: []const u8) !void {
    // Check before creating the file so bad input leaves no half-written image.
    const expected_len = @as(u64, w) * h * 4;
    if (rgba_pixels.len != expected_len) return error.InvalidPixelData;

    const file = try std.Io.Dir.cwd().createFile(io, filename, .{});
    defer file.close(io);
    var buf: [255]u8 = undefined;
    var writer = file.writer(io, &buf);
    const out = &writer.interface;

    // PPM header: P6 means raw RGB bytes with the given maximum channel value.
    try out.print("P6\n{d} {d}\n255\n", .{ w, h });

    // Emit the first three bytes (RGB) of every 4-byte pixel.
    var i: usize = 0;
    while (i < rgba_pixels.len) : (i += 4) {
        try out.writeAll(rgba_pixels[i .. i + 3]);
    }

    // The writer is buffered: without this the last buffered bytes never
    // reach the file.
    try out.flush();
 }
--- a/src/lib.zig
+++ b/src/lib.zig
@ -3,174 +3,3 @@ pub const GpuArena = @import("GpuArena.zig");
 pub const GpuBuffer = @import("GpuBuffer.zig");
 pub const GpuDevice = @import("GpuDevice.zig");
 pub const GpuCompute = @import("GpuCompute.zig");
 pub const GpuRender = @import("GpuRender.zig");
 pub const GpuTexture = @import("GpuTexture.zig");
 pub const GpuTextureFormat = enum(c_int) {
    Undefined = 0,
    R8Unorm = 1,
    R8Snorm = 2,
    R8Uint = 3,
    R8Sint = 4,
    R16Unorm = 5,
    R16Snorm = 6,
    R16Uint = 7,
    R16Sint = 8,
    R16Float = 9,
    RG8Unorm = 10,
    RG8Snorm = 11,
    RG8Uint = 12,
    RG8Sint = 13,
    R32Float = 14,
    R32Uint = 15,
    R32Sint = 16,
    RG16Unorm = 17,
    RG16Snorm = 18,
    RG16Uint = 19,
    RG16Sint = 20,
    RG16Float = 21,
    RGBA8Unorm = 22,
    RGBA8UnormSrgb = 23,
    RGBA8Snorm = 24,
    RGBA8Uint = 25,
    RGBA8Sint = 26,
    BGRA8Unorm = 27,
    BGRA8UnormSrgb = 28,
    RGB10A2Uint = 29,
    RGB10A2Unorm = 30,
    RG11B10Ufloat = 31,
    RGB9E5Ufloat = 32,
    RG32Float = 33,
    RG32Uint = 34,
    RG32Sint = 35,
    RGBA16Unorm = 36,
    RGBA16Snorm = 37,
    RGBA16Uint = 38,
    RGBA16Sint = 39,
    RGBA16Float = 40,
    RGBA32Float = 41,
    RGBA32Uint = 42,
    RGBA32Sint = 43,
    Stencil8 = 44,
    Depth16Unorm = 45,
    Depth24Plus = 46,
    Depth24PlusStencil8 = 47,
    Depth32Float = 48,
    Depth32FloatStencil8 = 49,
    BC1RGBAUnorm = 50,
    BC1RGBAUnormSrgb = 51,
    BC2RGBAUnorm = 52,
    BC2RGBAUnormSrgb = 53,
    BC3RGBAUnorm = 54,
    BC3RGBAUnormSrgb = 55,
    BC4RUnorm = 56,
    BC4RSnorm = 57,
    BC5RGUnorm = 58,
    BC5RGSnorm = 59,
    BC6HRGBUfloat = 60,
    BC6HRGBFloat = 61,
    BC7RGBAUnorm = 62,
    BC7RGBAUnormSrgb = 63,
    ETC2RGB8Unorm = 64,
    ETC2RGB8UnormSrgb = 65,
    ETC2RGB8A1Unorm = 66,
    ETC2RGB8A1UnormSrgb = 67,
    ETC2RGBA8Unorm = 68,
    ETC2RGBA8UnormSrgb = 69,
    EACR11Unorm = 70,
    EACR11Snorm = 71,
    EACRG11Unorm = 72,
    EACRG11Snorm = 73,
    ASTC4x4Unorm = 74,
    ASTC4x4UnormSrgb = 75,
    ASTC5x4Unorm = 76,
    ASTC5x4UnormSrgb = 77,
    ASTC5x5Unorm = 78,
    ASTC5x5UnormSrgb = 79,
    ASTC6x5Unorm = 80,
    ASTC6x5UnormSrgb = 81,
    ASTC6x6Unorm = 82,
    ASTC6x6UnormSrgb = 83,
    ASTC8x5Unorm = 84,
    ASTC8x5UnormSrgb = 85,
    ASTC8x6Unorm = 86,
    ASTC8x6UnormSrgb = 87,
    ASTC8x8Unorm = 88,
    ASTC8x8UnormSrgb = 89,
    ASTC10x5Unorm = 90,
    ASTC10x5UnormSrgb = 91,
    ASTC10x6Unorm = 92,
    ASTC10x6UnormSrgb = 93,
    ASTC10x8Unorm = 94,
    ASTC10x8UnormSrgb = 95,
    ASTC10x10Unorm = 96,
    ASTC10x10UnormSrgb = 97,
    ASTC12x10Unorm = 98,
    ASTC12x10UnormSrgb = 99,
    ASTC12x12Unorm = 100,
    ASTC12x12UnormSrgb = 101,
    Force32 = 2147483647,
    /// Size in bytes of a single texel of `format`.
    /// Returns 0 for every format without an arm below: `Undefined`, `Force32`
    /// and the block-compressed families, which need separate handling.
    pub fn bytesPerPixel(format: GpuTextureFormat) u32 {
        const texel_bytes: u32 = switch (format) {
            // 128 bits per texel
            .RGBA32Float, .RGBA32Uint, .RGBA32Sint => 16,

            // 64 bits per texel; the combined depth/stencil formats are counted as 8 bytes
            .RG32Float, .RG32Uint, .RG32Sint => 8,
            .RGBA16Unorm, .RGBA16Snorm, .RGBA16Uint, .RGBA16Sint, .RGBA16Float => 8,
            .Depth24PlusStencil8, .Depth32FloatStencil8 => 8,

            // 32 bits per texel
            .R32Float, .R32Uint, .R32Sint => 4,
            .RG16Unorm, .RG16Snorm, .RG16Uint, .RG16Sint, .RG16Float => 4,
            .RGBA8Unorm, .RGBA8UnormSrgb, .RGBA8Snorm, .RGBA8Uint, .RGBA8Sint => 4,
            .BGRA8Unorm, .BGRA8UnormSrgb => 4,
            .RGB10A2Uint, .RGB10A2Unorm, .RG11B10Ufloat, .RGB9E5Ufloat => 4,
            .Depth24Plus, .Depth32Float => 4,

            // 16 bits per texel
            .R16Unorm, .R16Snorm, .R16Uint, .R16Sint, .R16Float => 2,
            .RG8Unorm, .RG8Snorm, .RG8Uint, .RG8Sint => 2,
            .Depth16Unorm => 2,

            // 8 bits per texel
            .R8Unorm, .R8Snorm, .R8Uint, .R8Sint, .Stencil8 => 1,

            // Everything else (handled separately by callers)
            else => 0,
        };
        return texel_bytes;
    }
 };
--- a/src/shaders/add.wgsl
+++ b/src/shaders/add.wgsl
@ -1,24 +0,0 @@
 enable f16;
@group(0) @binding(0) var<storage, read> A: array<f16>;
@group(0) @binding(1) var<storage, read> B: array<f16>;
@group(0) @binding(2) var<storage, read_write> C: array<f16>;
@group(0) @binding(3) var<uniform> size: u32; 
@compute @workgroup_size(256)
 fn main(
    @builtin(global_invocation_id) global_id : vec3<u32>,
    @builtin(num_workgroups) num_workgroups: vec3<u32>
 ) {
    // Number of invocations launched along x: workgroups times the workgroup size.
    let stride = num_workgroups.x * 256u;
    // Each invocation starts at its own global id and then advances by the
    // total invocation count until it runs past `size`.
    for (var i = global_id.x; i < size; i += stride) {
        C[i] = A[i] + B[i];
    }
 }
--- a/src/shaders/circle.wgsl
+++ b/src/shaders/circle.wgsl
@ -1,39 +0,0 @@
 // Values handed from the vertex stage to the fragment stage.
 struct VertexOutput {
    // Position written by the vertex stage.
    @builtin(position) position: vec4f,
    // Copy of the quad corner coordinate, interpolated across the quad.
    @location(0) uv: vec2f,
 };
@vertex
 fn vs_main(@builtin(vertex_index) vertex_index: u32) -> VertexOutput {
    // Corners of a fullscreen quad in triangle-strip order:
    // 0 top-left, 1 bottom-left, 2 top-right, 3 bottom-right.
    var corners = array<vec2f, 4>(
        vec2f(-1.0,  1.0),
        vec2f(-1.0, -1.0),
        vec2f( 1.0,  1.0),
        vec2f( 1.0, -1.0)
    );
    let corner = corners[vertex_index];

    // The same coordinate serves as position and as uv, so uv spans -1.0 to 1.0.
    var result: VertexOutput;
    result.uv = corner;
    result.position = vec4f(corner, 0.0, 1.0);
    return result;
 }
@fragment
 fn fs_main(input: VertexOutput) -> @location(0) vec4f {
    let radius = 0.5;
    // Half-width of the band over which the edge fades out.
    let edge_softness = 0.005;

    // How far this fragment is from the quad centre (0,0).
    let dist = length(input.uv);

    // Coverage ramps from 0 inside the circle to 1 outside it; alpha is its inverse.
    let outside = smoothstep(radius - edge_softness, radius + edge_softness, dist);
    let alpha = 1.0 - outside;

    // Fully transparent fragments are dropped.
    if (alpha <= 0.0) {
        discard;
    }

    // Red circle with a softened edge.
    return vec4f(1.0, 0.3, 0.3, alpha);
 }