From c7afe28cc67ea92267a99edd141266741f3d2928 Mon Sep 17 00:00:00 2001 From: adrien Date: Mon, 18 May 2026 16:24:43 +0200 Subject: [PATCH] GpuBuffer align with size 4 as required by WebGPU Otherwise a GpuBuffer of a single f16 is too small (only 2 bytes) and an odd number of f16 values wouldn't work either --- src/GpuBuffer.zig | 28 ++++++++++++++++++++++++---- src/bench.zig | 18 ------------------ 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/GpuBuffer.zig b/src/GpuBuffer.zig index 906eda9..72b4f46 100644 --- a/src/GpuBuffer.zig +++ b/src/GpuBuffer.zig @@ -3,7 +3,7 @@ const c = @import("utils.zig").c; const GpuAllocator = @import("GpuAllocator.zig"); raw: c.WGPUBuffer, -size: u64, +size: u64, // Now tracks the 4-byte aligned size directly usage: c.WGPUBufferUsage, gloc: GpuAllocator, @@ -27,11 +27,13 @@ pub fn init(gloc: GpuAllocator, size: u64, usage: std.EnumSet(BufferUsage)) !@Th var iter = usage.iterator(); while (iter.next()) |flag| use |= @intFromEnum(flag); - const raw_handle = try gloc.allocBuffer(size, use); + // Automatically align the buffer size forward to a multiple of 4 bytes under the hood + const aligned_size = std.mem.alignForward(u64, size, 4); + const raw_handle = try gloc.allocBuffer(aligned_size, use); return .{ .raw = raw_handle, - .size = size, + .size = aligned_size, // Expose the aligned size to the rest of the application .usage = use, .gloc = gloc, }; @@ -69,7 +71,25 @@ pub fn load( T: type, data: []const T, ) !void { - c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size); + const bytes = data.len * @sizeOf(T); + + if (bytes == self.size) { + // Aligned path: direct download + c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size); + } else { + // Unaligned path: Split the write into an aligned chunk and a padded remainder + // to support arbitrary lengths without any allocations or large stack arrays. 
+ const aligned_part = (bytes / 4) * 4; + if (aligned_part > 0) { + c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, aligned_part); + } + + var remainder_buf: [4]u8 = .{ 0, 0, 0, 0 }; + const data_bytes = std.mem.sliceAsBytes(data); + @memcpy(remainder_buf[0 .. bytes - aligned_part], data_bytes[aligned_part..bytes]); + + c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, aligned_part, &remainder_buf, 4); + } } pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T { diff --git a/src/bench.zig b/src/bench.zig index 153b965..762eb60 100644 --- a/src/bench.zig +++ b/src/bench.zig @@ -40,10 +40,6 @@ const Vec = struct { try self.buf.load(f16, data); } - pub fn byteSize(self: Vec) u64 { - return @as(u64, self.len) * @sizeOf(f16); - } - // Changed: gloc is passed by value instead of *GpuAllocator pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, process: GpuProcess) !Vec { std.debug.assert(self.len == other.len); @@ -75,20 +71,6 @@ pub fn main(init: std.process.Init) !void { const allocator = init.gpa; - // --- WARM-UP PHASE --- - { - var warmup_a = [_]f16{1.0}; - var warmup_b = [_]f16{1.0}; - const wa = try Vec.initLoad(gloc, &warmup_a); - defer wa.deinit(); - const wb = try Vec.initLoad(gloc, &warmup_b); - defer wb.deinit(); - const wsum = try wa.run(gloc, wb, add_pip); - defer wsum.deinit(); - const wout = try wsum.read(allocator); - defer allocator.free(wout); - } - const sizes = [_]usize{ 1, 256,