Align GpuBuffer size to a multiple of 4 bytes, as required by WebGPU
Otherwise a GpuBuffer holding a single f16 is too small (only 2 bytes), and an uneven number of f16 values wouldn't work either
This commit is contained in:
parent
d9cb4c4672
commit
c7afe28cc6
@ -3,7 +3,7 @@ const c = @import("utils.zig").c;
|
|||||||
const GpuAllocator = @import("GpuAllocator.zig");
|
const GpuAllocator = @import("GpuAllocator.zig");
|
||||||
|
|
||||||
raw: c.WGPUBuffer,
|
raw: c.WGPUBuffer,
|
||||||
size: u64,
|
size: u64, // Now tracks the 4-byte aligned size directly
|
||||||
usage: c.WGPUBufferUsage,
|
usage: c.WGPUBufferUsage,
|
||||||
gloc: GpuAllocator,
|
gloc: GpuAllocator,
|
||||||
|
|
||||||
@ -27,11 +27,13 @@ pub fn init(gloc: GpuAllocator, size: u64, usage: std.EnumSet(BufferUsage)) !@Th
|
|||||||
var iter = usage.iterator();
|
var iter = usage.iterator();
|
||||||
while (iter.next()) |flag| use |= @intFromEnum(flag);
|
while (iter.next()) |flag| use |= @intFromEnum(flag);
|
||||||
|
|
||||||
const raw_handle = try gloc.allocBuffer(size, use);
|
// Automatically align the buffer size forward to a multiple of 4 bytes under the hood
|
||||||
|
const aligned_size = std.mem.alignForward(u64, size, 4);
|
||||||
|
|
||||||
|
const raw_handle = try gloc.allocBuffer(aligned_size, use);
|
||||||
return .{
|
return .{
|
||||||
.raw = raw_handle,
|
.raw = raw_handle,
|
||||||
.size = size,
|
.size = aligned_size, // Expose the aligned size to the rest of the application
|
||||||
.usage = use,
|
.usage = use,
|
||||||
.gloc = gloc,
|
.gloc = gloc,
|
||||||
};
|
};
|
||||||
@ -69,7 +71,25 @@ pub fn load(
|
|||||||
T: type,
|
T: type,
|
||||||
data: []const T,
|
data: []const T,
|
||||||
) !void {
|
) !void {
|
||||||
c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size);
|
const bytes = data.len * @sizeOf(T);
|
||||||
|
|
||||||
|
if (bytes == self.size) {
|
||||||
|
// Aligned path: data fills the whole buffer, so upload it directly
|
||||||
|
c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, self.size);
|
||||||
|
} else {
|
||||||
|
// Unaligned path: Split the write into an aligned chunk and a padded remainder
|
||||||
|
// to support arbitrary lengths without any allocations or large stack arrays.
|
||||||
|
const aligned_part = (bytes / 4) * 4;
|
||||||
|
if (aligned_part > 0) {
|
||||||
|
c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, 0, data.ptr, aligned_part);
|
||||||
|
}
|
||||||
|
|
||||||
|
var remainder_buf: [4]u8 = .{ 0, 0, 0, 0 };
|
||||||
|
const data_bytes = std.mem.sliceAsBytes(data);
|
||||||
|
@memcpy(remainder_buf[0 .. bytes - aligned_part], data_bytes[aligned_part..bytes]);
|
||||||
|
|
||||||
|
c.wgpuQueueWriteBuffer(self.gloc.device.queue, self.raw, aligned_part, &remainder_buf, 4);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T {
|
pub fn read(self: @This(), alloc: std.mem.Allocator, T: type) ![]T {
|
||||||
|
|||||||
@ -40,10 +40,6 @@ const Vec = struct {
|
|||||||
try self.buf.load(f16, data);
|
try self.buf.load(f16, data);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn byteSize(self: Vec) u64 {
|
|
||||||
return @as(u64, self.len) * @sizeOf(f16);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Changed: gloc is passed by value instead of *GpuAllocator
|
// Changed: gloc is passed by value instead of *GpuAllocator
|
||||||
pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, process: GpuProcess) !Vec {
|
pub fn run(self: Vec, gloc: GpuAllocator, other: Vec, process: GpuProcess) !Vec {
|
||||||
std.debug.assert(self.len == other.len);
|
std.debug.assert(self.len == other.len);
|
||||||
@ -75,20 +71,6 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
|
|
||||||
const allocator = init.gpa;
|
const allocator = init.gpa;
|
||||||
|
|
||||||
// --- WARM-UP PHASE ---
|
|
||||||
{
|
|
||||||
var warmup_a = [_]f16{1.0};
|
|
||||||
var warmup_b = [_]f16{1.0};
|
|
||||||
const wa = try Vec.initLoad(gloc, &warmup_a);
|
|
||||||
defer wa.deinit();
|
|
||||||
const wb = try Vec.initLoad(gloc, &warmup_b);
|
|
||||||
defer wb.deinit();
|
|
||||||
const wsum = try wa.run(gloc, wb, add_pip);
|
|
||||||
defer wsum.deinit();
|
|
||||||
const wout = try wsum.read(allocator);
|
|
||||||
defer allocator.free(wout);
|
|
||||||
}
|
|
||||||
|
|
||||||
const sizes = [_]usize{
|
const sizes = [_]usize{
|
||||||
1,
|
1,
|
||||||
256,
|
256,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user