From c7afe28cc67ea92267a99edd141266741f3d2928 Mon Sep 17 00:00:00 2001
From: adrien <adrien@bouvais.lu>
Date: Mon, 18 May 2026 16:24:43 +0200
Subject: [PATCH] GpuBuffer align with size 4 as required by WebGPU

Otherwise a GpuBuffer holding a single f16 is too small (only 2 bytes),
and a buffer with an odd number of f16 values wouldn't work either.
---
 src/GpuBuffer.zig | 28 ++++++++++++++++++++++++----
 src/bench.zig     | 18 ------------------
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/GpuBuffer.zig b/src/GpuBuffer.zig
index 906eda9..72b4f46 100644
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@@ -3,7 +3,7 @@ const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
 
 raw: c.WGPUBuffer,
-size: u64,
+size: u64, // Now tracks the 4-byte aligned size directly
 usage: c.WGPUBufferUsage,
 gloc: GpuAllocator,
 
@@ -27,11 +27,13 @@ pub fn init(gloc: GpuAllocator, size: u64, usage: std.EnumSet(BufferUsage)) !@Th
     var iter = usage.iterator();
     while (iter.next()) |flag| use |= @intFromEnum(flag);
 
-    const raw_handle = try gloc.allocBuffer(size, use);
+    // Automatically align the buffer size forward to a multiple of 4 bytes under the hood
+    const aligned_size = std.mem.alignForward(u64, size, 4);
 
+    const raw_handle = try gloc.allocBuffer(aligned_size, use);
     return .{
         .raw = raw_handle,
-        .size = size,
+        .size = aligned_size, // Expose the aligned size to the rest of the application
         .usage = use,
         .gloc = gloc,
     };
@@ -69,7 +71,25 @@ pub fn load(
     T: type,
     data: []const T,
 ) !void {
-    c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size);
+    const bytes = data.len * @sizeOf(T);
+
+    if (bytes == self.size) {
+        // Aligned path: direct upload
+        c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size);
+    } else {
+        // Unaligned path: Split the write into an aligned chunk and a padded remainder
+        // to support arbitrary lengths without any allocations or large stack arrays.
+        const aligned_part = (bytes / 4) * 4;
+        if (aligned_part > 0) {
+            c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, aligned_part);
+        }
+
+        var remainder_buf: [4]u8 = .{ 0, 0, 0, 0 };
+        const data_bytes = std.mem.sliceAsBytes(data);
+        @memcpy(remainder_buf[0 .. bytes - aligned_part], data_bytes[aligned_part..bytes]);
+
+        c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, aligned_part, &remainder_buf, 4);
+    }
 }
 
 pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T {
diff --git a/src/bench.zig b/src/bench.zig
index 153b965..762eb60 100644
--- a/src/bench.zig
+++ b/src/bench.zig
@@ -40,10 +40,6 @@ const Vec = struct {
         try self.buf.load(f16, data);
     }
 
-    pub fn byteSize(self: Vec) u64 {
-        return @as(u64, self.len) * @sizeOf(f16);
-    }
-
     // Changed: gloc is passed by value instead of *GpuAllocator
     pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, process: GpuProcess) !Vec {
         std.debug.assert(self.len == other.len);
@@ -75,20 +71,6 @@ pub fn main(init: std.process.Init) !void {
 
     const allocator = init.gpa;
 
-    // --- WARM-UP PHASE ---
-    {
-        var warmup_a = [_]f16{1.0};
-        var warmup_b = [_]f16{1.0};
-        const wa = try Vec.initLoad(gloc, &warmup_a);
-        defer wa.deinit();
-        const wb = try Vec.initLoad(gloc, &warmup_b);
-        defer wb.deinit();
-        const wsum = try wa.run(gloc, wb, add_pip);
-        defer wsum.deinit();
-        const wout = try wsum.read(allocator);
-        defer allocator.free(wout);
-    }
-
     const sizes = [_]usize{
         1,
         256,