3 changed files with 57 additions and 123 deletions
--- a/README.md
+++ b/README.md
@ -17,6 +17,7 @@ The library exports five primary components:
 Below is a complete, self-contained example demonstrating how to initialize the GPU, load data, run a compute shader, and read the results back to the CPU:
 ```zig
 const std = @import("std");
 const gpu = @import("gpu");
 const GpuDevice = gpu.GpuDevice;
@ -37,15 +38,7 @@ pub fn main(init: std.process.Init) !void {
    const gloc = grena.gpuAllocator();
    // 3. Load the WGSL compute pipeline
-    const add_process = try GpuProcess.init(
+    const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
        device,
        @embedFile("shaders/add.wgsl"),
        .{ .bindings = &.{
            .{ .element_size = @sizeOf(f16) },
            .{ .element_size = @sizeOf(f16) },
            .{ .element_size = @sizeOf(f16) },
        } },
    );
    defer add_process.deinit();
    // 4. Setup CPU data
@ -61,6 +54,7 @@ pub fn main(init: std.process.Init) !void {
    }
    // 5. Initialize raw GPU Buffers
    // We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported
    const byte_size = len * @sizeOf(f16);
    const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
    const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
@ -74,7 +68,8 @@ pub fn main(init: std.process.Init) !void {
    try buf_b.load(f16, data_b);
    // 7. Dispatch the Compute Process
-    try add_process.run(gloc, .{ buf_a, buf_b, buf_out });
+    // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
    try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
    // 8. Map and copy the resulting buffer back to the CPU
    const out = try buf_out.read(allocator, f16);
@ -82,7 +77,6 @@ pub fn main(init: std.process.Init) !void {
    std.debug.print("Result: {any}\n", .{out});
 }
 ```
 ## Dependencies
--- a/examples/add.zig
+++ b/examples/add.zig
@ -18,15 +18,7 @@ pub fn main(init: std.process.Init) !void {
    const gloc = grena.gpuAllocator();
    // 3. Load the WGSL compute pipeline
-    const add_process = try GpuProcess.init(
+    const add_process = try GpuProcess.init(device, @embedFile("shaders/add.wgsl"));
        device,
        @embedFile("shaders/add.wgsl"),
        .{ .bindings = &.{
            .{ .element_size = @sizeOf(f16) },
            .{ .element_size = @sizeOf(f16) },
            .{ .element_size = @sizeOf(f16) },
        } },
    );
    defer add_process.deinit();
    // 4. Setup CPU data
@ -42,6 +34,7 @@ pub fn main(init: std.process.Init) !void {
    }
    // 5. Initialize raw GPU Buffers
    // We pass the EnumSet inline using `.initMany` since the Enum itself isn't exported
    const byte_size = len * @sizeOf(f16);
    const buf_a = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
    const buf_b = try GpuBuffer.init(gloc, byte_size, .initMany(&.{ .Storage, .CopyDst, .CopySrc }));
@ -55,7 +48,8 @@ pub fn main(init: std.process.Init) !void {
    try buf_b.load(f16, data_b);
    // 7. Dispatch the Compute Process
-    try add_process.run(gloc, .{ buf_a, buf_b, buf_out });
+    // We pass the data type (f16) to allow GpuProcess to calculate chunks correctly
    try add_process.run(gloc, f16, buf_a, buf_b, buf_out);
    // 8. Map and copy the resulting buffer back to the CPU
    const out = try buf_out.read(allocator, f16);
--- a/src/GpuProcess.zig
+++ b/src/GpuProcess.zig
@ -1,3 +1,6 @@
 /// GpuProcess is just a pipeline with 2 inpout and 1 output
 /// for now, to see if I make it a bit more generic
 ///
 const std = @import("std");
 const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
@ -5,25 +8,9 @@ const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuDevice = @import("GpuDevice.zig");
 pub const Binding = struct {
    /// Element size in bytes for this binding. E.g. @sizeOf(f32).
    /// If 0, no element-based size validation is performed for this buffer.
    element_size: u32 = 0,
 };
 pub const ProcessDef = struct {
    bindings: []const Binding,
    workgroup_size: u32 = 256,
    max_workgroups: u32 = 65535,
    /// If true, automatically adds a Uniform Buffer containing `elements_count` as a `u32`
    /// to the next available binding slot.
    append_info_buffer: bool = true,
 };
 pip: c.WGPUComputePipeline,
 def: ProcessDef,
-pub fn init(device: GpuDevice, wgsl: []const u8, def: ProcessDef) !@This() {
+pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
    var wgsl_src = c.WGPUShaderSourceWGSL{
        .chain = .{ .sType = c.WGPUSType_ShaderSourceWGSL },
        .code = sv(wgsl),
@ -33,116 +20,71 @@ pub fn init(device: GpuDevice, wgsl: []const u8, def: ProcessDef) !@This() {
    }) orelse return error.Shader;
    defer c.wgpuShaderModuleRelease(shader);
-    const pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{
+    return .{ .pip = c.wgpuDeviceCreateComputePipeline(device.device, &.{
        .compute = .{ .module = shader, .entryPoint = sv("main") },
-    }) orelse return error.Pipeline;
+    }) orelse return error.Pipeline };
    return .{
        .pip = pip,
        .def = def,
    };
 }
 pub fn deinit(self: @This()) void {
    c.wgpuComputePipelineRelease(self.pip);
 }
-/// Execute the compute pass with arbitrary buffer bindings via a tuple.
+fn onMapped(
-/// Example: `try proc.run(gloc, .{ buf_a, buf_b, buf_out });`
+    status: c.WGPUMapAsyncStatus,
    _: c.WGPUStringView,
    userdata1: ?*anyopaque,
    _: ?*anyopaque,
 ) callconv(.c) void {
    const flag: *bool = @ptrCast(@alignCast(userdata1.?));
    flag.* = (status == c.WGPUMapAsyncStatus_Success);
 }
 // Changed: gloc is passed by value instead of *GpuAllocator
 pub fn run(
    self: @This(),
    gloc: GpuAllocator,
-    args: anytype,
+    T: type,
    buf_a: GpuBuffer,
    buf_b: GpuBuffer,
    buf_out: GpuBuffer,
 ) !void {
-    const type_info = @typeInfo(@TypeOf(args));
+    const max_chunk_bytes: u64 = 1024 * 1024 * 1024; // 1 GB
    if (type_info != .@"struct" or !type_info.@"struct".is_tuple)
        @compileError("Expected a tuple of GpuBuffers for args. E.g. .{ buf_a, buf_b }");
-    const fields = type_info.@"struct".fields;
+    const bytes = buf_a.size;
-    if (fields.len != self.def.bindings.len) {
+    var offset: u64 = 0;
-        std.log.err("Process expected {d} arguments, got {d}", .{ self.def.bindings.len, fields.len });
+    while (offset < bytes) {
-        return error.InvalidArgumentCount;
+        const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
-    }
+        const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(T));
-    var elements_count: u32 = 0;
+        const info_buf = try GpuBuffer.init(
    // Infer elements_count from the first arg with a defined element_size
    inline for (fields, 0..) |field, i| {
        if (elements_count == 0) {
            const buf = @field(args, field.name);
            const el_size = self.def.bindings[i].element_size;
            if (el_size > 0) {
                elements_count = @intCast(buf.size / el_size);
            }
        }
    }
    // Validate runtime buffer sizes before dispatching
    inline for (fields, 0..) |field, i| {
        const buf = @field(args, field.name);
        const el_size = self.def.bindings[i].element_size;
        if (el_size > 0) {
            const expected_min_bytes = @as(u64, elements_count) * el_size;
            if (buf.size < expected_min_bytes) {
                std.log.err("Argument {d} size mismatch: expected at least {d} bytes, got {d}", .{ i, expected_min_bytes, buf.size });
                return error.BufferTooSmall;
            }
        }
    }
    var entries_buf: [32]c.WGPUBindGroupEntry = undefined;
    var entry_count: usize = 0;
    // Unpack tuple into WebGPU BindGroupEntries
    inline for (fields, 0..) |field, i| {
        const buf = @field(args, field.name);
        if (@TypeOf(buf) != GpuBuffer) {
            @compileError("All arguments in the tuple must be of type GpuBuffer");
        }
        entries_buf[entry_count] = .{
            .binding = @intCast(i),
            .buffer = buf.raw,
            .offset = 0,
            .size = buf.size, // Size exposes the fully allocated length
        };
        entry_count += 1;
    }
    // Optional uniform dispatch buffer appended at the end
    var info_buf: ?GpuBuffer = null;
    defer if (info_buf) |b| b.deinit();
    if (self.def.append_info_buffer) {
        info_buf = try GpuBuffer.init(
            gloc,
            @sizeOf(u32),
            .initMany(&.{ .Uniform, .CopyDst }),
        );
-        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.?.raw, 0, &elements_count, @sizeOf(u32));
+        defer info_buf.deinit();
-        entries_buf[entry_count] = .{
+        c.wgpuQueueWriteBuffer(gloc.device.queue, info_buf.raw, 0, &current_chunk_elements, @sizeOf(u32));
-            .binding = @intCast(entry_count),
+
-            .buffer = info_buf.?.raw,
+        const entries = [_]c.WGPUBindGroupEntry{
-            .offset = 0,
+            .{ .binding = 0, .buffer = buf_a.raw, .offset = offset, .size = current_chunk_bytes },
-            .size = @sizeOf(u32),
+            .{ .binding = 1, .buffer = buf_b.raw, .offset = offset, .size = current_chunk_bytes },
            .{ .binding = 2, .buffer = buf_out.raw, .offset = offset, .size = current_chunk_bytes },
            .{ .binding = 3, .buffer = info_buf.raw, .offset = 0, .size = @sizeOf(u32) },
        };
        entry_count += 1;
    }
-    const entries = entries_buf[0..entry_count];
+        try submitPass(gloc, self.pip, &entries, current_chunk_elements);
-    try submitPass(gloc, self.pip, entries, elements_count, self.def.workgroup_size, self.def.max_workgroups);
+
        offset += current_chunk_bytes;
    }
 }
 // Changed: gloc is passed by value instead of *GpuAllocator
 fn submitPass(
    gloc: GpuAllocator,
    pipeline: c.WGPUComputePipeline,
    entries: []const c.WGPUBindGroupEntry,
    n: usize,
    workgroup_size: u32,
    max_workgroups: u32,
 ) !void {
    if (n == 0) return;
    const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
    defer c.wgpuBindGroupLayoutRelease(bgl);
@ -153,13 +95,17 @@ fn submitPass(
    }) orelse return error.BindGroup;
    defer c.wgpuBindGroupRelease(bg);
-    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse return error.Encoder;
+    const enc = c.wgpuDeviceCreateCommandEncoder(gloc.device.device, null) orelse
        return error.Encoder;
    const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
    c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
    c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
-    const desired_workgroups = ceilDiv(n, workgroup_size);
+    const WORKGROUP_SIZE = 256;
-    const dispatch_count = @min(desired_workgroups, max_workgroups);
+    const MAX_WORKGROUPS = 65535;
    const desired_workgroups = ceilDiv(n, WORKGROUP_SIZE);
    const dispatch_count = @min(desired_workgroups, MAX_WORKGROUPS);
    c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(dispatch_count), 1, 1);
    c.wgpuComputePassEncoderEnd(pass);