diff --git a/build.zig b/build.zig
index 140a513..4c0cf8d 100644
--- a/build.zig
+++ b/build.zig
@@ -56,39 +56,4 @@ pub fn build(b: *std.Build) !void {
         const run_cmd = b.addRunArtifact(exe);
         run_step.dependOn(&run_cmd.step);
     }
-
-    const exe = b.addExecutable(.{
-        .name = "circle",
-        .root_module = b.createModule(.{
-            .root_source_file = b.path("src/circle.zig"),
-            .target = target,
-            .optimize = optimize,
-            .imports = &.{},
-        }),
-    });
-
-    exe.root_module.addIncludePath(b.path("libs/wgpu-native/include"));
-    exe.root_module.addLibraryPath(b.path("libs/wgpu-native/lib"));
-    exe.root_module.addObjectFile(b.path("libs/wgpu-native/lib/libwgpu_native.a"));
-
-    // Platform-specific system frameworks needed by wgpu-native
-    if (t.os.tag == .macos) {
-        exe.root_module.linkFramework("Metal", .{});
-        exe.root_module.linkFramework("QuartzCore", .{});
-        exe.root_module.linkFramework("Foundation", .{});
-        exe.root_module.linkFramework("CoreGraphics", .{});
-    } else if (t.os.tag == .windows) {
-        exe.root_module.linkSystemLibrary("d3d12", .{});
-        exe.root_module.linkSystemLibrary("dxgi", .{});
-        exe.root_module.linkSystemLibrary("user32", .{});
-    } else {
-        exe.root_module.linkSystemLibrary("vulkan", .{});
-        exe.root_module.linkSystemLibrary("gcc_s", .{});
-    }
-
-    b.installArtifact(exe);
-
-    const run_step = b.step("circle", "Run circle");
-    const run_cmd = b.addRunArtifact(exe);
-    run_step.dependOn(&run_cmd.step);
 }
diff --git a/examples/bench.zig b/examples/bench_cp.zig
similarity index 100%
rename from examples/bench.zig
rename to examples/bench_cp.zig
diff --git a/examples/circle.zig b/examples/circle.zig
new file mode 100644
index 0000000..a508626
--- /dev/null
+++ b/examples/circle.zig
@@ -0,0 +1,76 @@
+const std = @import("std");
+const gpu = @import("gpu");
+const GpuDevice = gpu.GpuDevice;
+const GpuArena = gpu.GpuArena;
+const GpuBuffer = gpu.GpuBuffer;
+const GpuRender = gpu.GpuRender;
+const GpuTexture = gpu.GpuTexture;
+const GpuTextureView = gpu.GpuTextureView;
+
+const width: u32 = 512;
+const height: u32 = 512;
+
+pub fn main(init: std.process.Init) !void {
+    const allocator = init.gpa;
+
+    // 1. Open GPU Device
+    const device = try GpuDevice.init(.{});
+    defer device.deinit();
+
+    // 2. Init VRAM Arena
+    var grena = GpuArena.init(allocator, device);
+    defer grena.deinit();
+    const gloc = grena.gpuAllocator();
+
+    // 3. Load Render Pipeline
+    const circle_rp = try GpuRender.init(
+        device, // Change to gloc + track them
+        @embedFile("shaders/circle.wgsl"),
+        .{ .bindings = &.{}, .texture_format = .RGBA8Unorm, .topology = .TriangleStrip },
+    );
+    defer circle_rp.deinit();
+
+    // 4. Create VRAM texture to render into
+    const texture = try GpuTexture.init(gloc, .{
+        .format = .RGBA8Unorm,
+        .size = .{ .width = width, .height = height, .depthOrArrayLayers = 1 },
+        .usage = .initMany(&.{ .RenderAttachment, .CopySrc }),
+    });
+    defer texture.deinit();
+
+    // 5. Create a view from texture
+    const view = try GpuTextureView.init(gloc, texture, .{});
+    defer view.deinit();
+
+    // 6. Run the rendering pipeline
+    try circle_rp.draw(gloc, view, 4, .{});
+
+    // 7. Load Texture into GpuBuffer
+    const cpu_staging_cpu = try texture.buffCopy(gloc);
+    defer cpu_staging_cpu.deinit();
+
+    // 8. Read GpuBuffer to CPU
+    const pixels = try cpu_staging_cpu.read(allocator, u8);
+    defer allocator.free(pixels);
+
+    // 9. Write a simple ppm image
+    try savePpm(init.io, "circle.ppm", width, height, pixels);
+    std.debug.print("Successfully rendered circle to circle.ppm!\n", .{});
+}
+
+fn savePpm(io: std.Io, filename: []const u8, w: u32, h: u32, rgba_pixels: []const u8) !void {
+    const file = try std.Io.Dir.cwd().createFile(io, filename, .{});
+    defer file.close(io);
+
+    var buf: [255]u8 = undefined;
+    var writer = file.writer(io, &buf);
+
+    // PPM Header: P6 format means raw RGB bytes
+    try writer.interface.print("P6\n{d} {d}\n255\n", .{ w, h });
+
+    // Strip Alpha channel when writing out to standard RGB PPM format
+    var i: usize = 0;
+    while (i < rgba_pixels.len) : (i += 4) {
+        try writer.interface.writeAll(rgba_pixels[i .. i + 3]);
+    }
+}
diff --git a/examples/add.zig b/examples/compute.zig
similarity index 100%
rename from examples/add.zig
rename to examples/compute.zig
diff --git a/examples/digit.zig b/examples/digit.zig
deleted file mode 100644
index 20f35a6..0000000
--- a/examples/digit.zig
+++ /dev/null
@@ -1,77 +0,0 @@
-// I am using this mnist reduced dataset https://www.kaggle.com/datasets/mohamedgamal07/reduced-mnist
-
-const std = @import("std");
-const gpu = @import("gpu");
-const GpuDevice = gpu.GpuDevice;
-const GpuArena = gpu.GpuArena;
-const GpuBuffer = gpu.GpuBuffer;
-const GpuProcess = gpu.GpuProcess;
-
-const BATCHSIZE = 10;
-const EPOCH = 10;
-
-pub fn main(init: std.process.Init) !void {
-    const allocator = init.gpa;
-    const io = init.io;
-
-    // 1. Open GPU Device
-    const device = try GpuDevice.init(.{});
-    defer device.deinit();
-
-    // 2. Create a GPU Arena to manage VRAM
-    var grena = GpuArena.init(allocator, device);
-    defer grena.deinit();
-    const gloc = grena.gpuAllocator();
-
-    // 3. Load the WGSL compute pipeline
-    const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
-    defer add_process.deinit();
-
-    var train_dir = try std.Io.Dir.cwd().openDir(io, "mnist/train", .{});
-
-    var images: [BATCHSIZE * 28 * 28]f16 = undefined;
-    for (EPOCH) |epoch| {
-        // Load random images from train dir
-        train_dir.openDir(io, "0", .{});
-        for (BATCHSIZE) |i| {
-            const file = try train_dir.openFile(io, "0.jpg", .{});
-            images[28 * 28 * i .. 28 * 28 * (i + 1)] = file.read
-        }
-    }
-
-    // 4. Setup CPU data
-    const len: usize = 16;
-    const data_a = try allocator.alloc(f16, len);
-    defer allocator.free(data_a);
-    const data_b = try allocator.alloc(f16, len);
-    defer allocator.free(data_b);
-
-    for (0..len) |i| {
-        data_a[i] = @floatFromInt(i);
-        data_b[i] = @floatFromInt(len - 1 - i);
-    }
-
-    // 5. Initialize raw GPU Buffers
-    // We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported
-    const byte_size = len * @sizeOf(f16);
-    const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
-    const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
-    const buf_out = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
-
-    // Note: The buffers are safely tied to the GpuArena which will automatically
-    // release them at the end. You can also manually call buf_x.deinit() if desired.
-
-    // 6. Transfer data from CPU slices to GPU Buffers
-    try buf_a.load(f16, data_a);
-    try buf_b.load(f16, data_b);
-
-    // 7. Dispatch the Compute Process
-    // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
-    try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
-
-    // 8. Map and copy the resulting buffer back to the CPU
-    const out = try buf_out.read(allocator, f16);
-    defer allocator.free(out);
-
-    std.debug.print("Result: {any}\n", .{out});
-}
diff --git a/src/shaders/circle.wgsl b/examples/shaders/circle.wgsl
similarity index 100%
rename from src/shaders/circle.wgsl
rename to examples/shaders/circle.wgsl
diff --git a/src/GpuTexture.zig b/src/GpuTexture.zig
index 7d75cb0..b819ff2 100644
--- a/src/GpuTexture.zig
+++ b/src/GpuTexture.zig
@@ -1,6 +1,7 @@
 const std = @import("std");
 const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
+const GpuBuffer = @import("GpuBuffer.zig");
 const GpuTextureFormat = @import("lib.zig").GpuTextureFormat;
 const GpuTextureUsage = @import("lib.zig").GpuTextureUsage;
 
@@ -40,6 +41,45 @@ pub fn getConstMappedRange(self: @This(), offset: u64, size: u64) ?*const anyopa
     return c.wgpuBufferGetConstMappedRange(self.raw, offset, size);
 }
 
+pub fn bytesSize(self: @This()) u32 {
+    return self.bytesSizeRow() * self.def.size.height;
+}
+
+pub fn bytesSizeRow(self: @This()) u32 {
+    return self.def.size.width * self.def.format.bytesPerPixel();
+}
+
+/// Return a GpuBuffer containing a copy of the texture.
+pub fn buffCopy(self: @This(), gloc: GpuAllocator) !GpuBuffer {
+    const buf = try GpuBuffer.init(gloc, self.bytesSize(), .initMany(&.{ .CopyDst, .CopySrc }));
+
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
+    defer c.wgpuCommandEncoderRelease(enc);
+
+    const src_copy = c.WGPUTexelCopyTextureInfo{
+        .texture = self.raw,
+        .mipLevel = 0,
+        .origin = .{ .x = 0, .y = 0, .z = 0 },
+        .aspect = c.WGPUTextureAspect_All,
+    };
+    const dst_copy = c.WGPUTexelCopyBufferInfo{
+        .buffer = buf.raw,
+        .layout = .{
+            .offset = 0,
+            .bytesPerRow = self.bytesSizeRow(),
+            .rowsPerImage = self.def.size.height,
+        },
+    };
+
+    c.wgpuCommandEncoderCopyTextureToBuffer(enc, &src_copy, &dst_copy, &self.def.size);
+
+    const cmd = c.wgpuCommandEncoderFinish(enc, null);
+    defer c.wgpuCommandBufferRelease(cmd);
+    c.wgpuQueueSubmit(gloc.device.queue, 1, &cmd);
+
+    return buf;
+}
+
 pub fn mapAsync(
     self: @This(),
     mode: c.WGPUMapMode,
diff --git a/src/circle.zig b/src/circle.zig
deleted file mode 100644
index ef9ce7c..0000000
--- a/src/circle.zig
+++ /dev/null
@@ -1,112 +0,0 @@
-const std = @import("std");
-const gpu = @import("lib.zig");
-const c = @import("utils.zig").c;
-const sv = @import("utils.zig").sv;
-const GpuDevice = gpu.GpuDevice;
-const GpuArena = gpu.GpuArena;
-const GpuBuffer = gpu.GpuBuffer;
-const GpuRender = gpu.GpuRender;
-const GpuTexture = gpu.GpuTexture;
-const GpuTextureView = gpu.GpuTextureView;
-
-pub fn main(init: std.process.Init) !void {
-    const allocator = init.gpa;
-
-    // 1. Open the raw headless GPU Device you shared
-    const device = try GpuDevice.init(.{});
-    defer device.deinit();
-
-    var grena = GpuArena.init(allocator, device);
-    defer grena.deinit();
-    const gloc = grena.gpuAllocator();
-
-    const width: u32 = 512;
-    const height: u32 = 512;
-
-    // 2. Load our Render Pipeline (Procedural Triangle Strip)
-    const circle_rp = try GpuRender.init(
-        device,
-        @embedFile("shaders/circle.wgsl"),
-        .{
-            .bindings = &.{},
-            .texture_format = .RGBA8Unorm,
-            .topology = .TriangleStrip,
-        },
-    );
-    defer circle_rp.deinit();
-
-    // 3. Create the offscreen VRAM texture to render into
-    const texture = try GpuTexture.init(gloc, .{
-        .format = .RGBA8Unorm,
-        .size = .{ .width = width, .height = height, .depthOrArrayLayers = 1 },
-        .usage = .initMany(&.{ .RenderAttachment, .CopySrc }),
-    });
-    defer texture.deinit();
-
-    const view = try GpuTextureView.init(gloc, texture, .{});
-    defer view.deinit();
-
-    // 4. Create a staging buffer to pull pixels from VRAM to CPU
-    // 4 bytes per pixel (RGBA8)
-    const row_bytes = width * 4;
-    const buffer_bytes = row_bytes * height;
-
-    // Create a regular GpuBuffer set up to receive texture copy transfers
-    const cpu_staging_buf = try GpuBuffer.init(gloc, buffer_bytes, .initMany(&.{ .CopyDst, .CopySrc }));
-
-    // 5. Draw the Circle Frame into the texture view!
-    try circle_rp.draw(gloc, view, 4, .{});
-
-    // 6. Copy the texture data into our CPU staging buffer
-    const enc = c.wgpuDeviceCreateCommandEncoder(device.device, null) orelse return error.Encoder;
-    defer c.wgpuCommandEncoderRelease(enc);
-
-    const src_copy = c.WGPUTexelCopyTextureInfo{
-        .texture = texture.raw,
-        .mipLevel = 0,
-        .origin = .{ .x = 0, .y = 0, .z = 0 },
-        .aspect = c.WGPUTextureAspect_All,
-    };
-    const dst_copy = c.WGPUTexelCopyBufferInfo{
-        .buffer = cpu_staging_buf.raw,
-        .layout = .{
-            .offset = 0,
-            .bytesPerRow = row_bytes,
-            .rowsPerImage = height,
-        },
-    };
-    const copy_size = c.WGPUExtent3D{ .width = width, .height = height, .depthOrArrayLayers = 1 };
-
-    c.wgpuCommandEncoderCopyTextureToBuffer(enc, &src_copy, &dst_copy, &copy_size);
-
-    const cmd = c.wgpuCommandEncoderFinish(enc, null);
-    defer c.wgpuCommandBufferRelease(cmd);
-    c.wgpuQueueSubmit(device.queue, 1, &cmd);
-
-    // 7. Map and read the raw image bytes back to CPU
-    // (This uses whatever slice-reading helpers your `GpuBuffer` wrapper provides)
-    const pixels = try cpu_staging_buf.read(allocator, u8);
-    defer allocator.free(pixels);
-
-    // Now you have the raw binary image data! Let's output a simple Netpbm PPM image file
-    // so you can actually open and look at your rendered circle.
-    try savePpm(init.io, "circle.ppm", width, height, pixels);
-    std.debug.print("Successfully rendered circle to circle.ppm!\n", .{});
-}
-
-fn savePpm(io: std.Io, filename: []const u8, w: u32, h: u32, rgba_pixels: []const u8) !void {
-    const file = try std.Io.Dir.cwd().createFile(io, filename, .{});
-    defer file.close(io);
-
-    var buf: [255]u8 = undefined;
-    var writer = file.writer(io, &buf);
-
-    // PPM Header: P6 format means raw RGB bytes
-    try writer.interface.print("P6\n{d} {d}\n255\n", .{ w, h });
-
-    // Strip Alpha channel when writing out to standard RGB PPM format
-    var i: usize = 0;
-    while (i < rgba_pixels.len) : (i += 4) {
-        try writer.interface.writeAll(rgba_pixels[i .. i + 3]);
-    }
-}
diff --git a/src/shaders/add.wgsl b/src/shaders/add.wgsl
deleted file mode 100644
index 4288742..0000000
--- a/src/shaders/add.wgsl
+++ /dev/null
@@ -1,24 +0,0 @@
-enable f16;
-
-@group(0) @binding(0) var<storage, read> A: array<f16>;
-@group(0) @binding(1) var<storage, read> B: array<f16>;
-@group(0) @binding(2) var<storage, read_write> C: array<f16>;
-@group(0) @binding(3) var<uniform> size: u32; 
-
-@compute @workgroup_size(256)
-fn main(
-    @builtin(global_invocation_id) global_id : vec3<u32>,
-    @builtin(num_workgroups) num_workgroups: vec3<u32>
-) {
-    // 1. Calculate the total number of threads across the entire grid
-    let total_threads = num_workgroups.x * 256u; 
-    
-    // 2. Start at this thread's unique global ID
-    var index = global_id.x;
-    
-    // 3. Stride through the tensor elements
-    while (index < size) {
-        C[index] = A[index] + B[index];
-        index += total_threads; // Jump forward by the total thread count
-    }
-}