Added f16 capability

GpuBuffer.init now takes an element type (any int or float) and a length
Added an example.zig
2026-05-18 10:12:36 +02:00 · 2026-05-18 09:31:58 +02:00 · 2026-05-18 00:19:37 +02:00 · 2026-05-18 00:10:09 +02:00
11 changed files with 158 additions and 57 deletions
--- a/build.zig
+++ b/build.zig
@ -29,7 +29,7 @@ pub fn build(b: *std.Build) void {
            .target = target,
            .optimize = optimize,
        }),
-        .name = "gpu_matrix_add",
+        .name = "bench",
    });
    // wgpu-native headers + pre-built static library
@ -57,5 +57,38 @@ pub fn build(b: *std.Build) void {
    const run = b.addRunArtifact(exe);
    run.step.dependOn(b.getInstallStep());
-    b.step("bench", "Benchmark a simple add vector").dependOn(&run.step);
+    b.step("bench", "Benchmark a simple add vector.").dependOn(&run.step);
    // Second executable: the minimal usage example in src/example.zig.
    // It gets its own artifact name so it does not collide with the "bench"
    // executable declared earlier in this function.
    const exe_examp = b.addExecutable(.{
        .root_module = b.createModule(.{
            .root_source_file = b.path("src/example.zig"),
            .link_libc = true,
            .target = target,
            .optimize = optimize,
        }),
        .name = "example",
    });
    // wgpu-native headers + pre-built static library
    exe_examp.root_module.addIncludePath(b.path("libs/wgpu-native/include"));
    exe_examp.root_module.addLibraryPath(b.path("libs/wgpu-native/lib"));
    exe_examp.root_module.addObjectFile(b.path("libs/wgpu-native/lib/libwgpu_native.a"));
    // Platform libraries linked alongside wgpu-native, chosen by target OS.
    if (t.os.tag == .macos) {
        exe_examp.root_module.linkFramework("Metal", .{});
        exe_examp.root_module.linkFramework("QuartzCore", .{});
        exe_examp.root_module.linkFramework("Foundation", .{});
        exe_examp.root_module.linkFramework("CoreGraphics", .{});
    } else if (t.os.tag == .windows) {
        exe_examp.root_module.linkSystemLibrary("d3d12", .{});
        exe_examp.root_module.linkSystemLibrary("dxgi", .{});
        exe_examp.root_module.linkSystemLibrary("user32", .{});
    } else {
        exe_examp.root_module.linkSystemLibrary("vulkan", .{});
        exe_examp.root_module.linkSystemLibrary("gcc_s", .{});
    }
    const examp = b.addRunArtifact(exe_examp);
    // The example's own run step (not the bench `run` step) waits for install,
    // mirroring how the bench run step is wired.
    examp.step.dependOn(b.getInstallStep());
    b.step("example", "Run basic example.").dependOn(&examp.step);
 }
--- a/src/GpuAllocator.zig
+++ b/src/GpuAllocator.zig
@ -1,7 +1,7 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
-const c = @import("c.zig").c;
+const c = @import("utils.zig").c;
 device: GpuDevice,
 tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
@ -39,6 +39,10 @@ pub fn registerBuffer(
        .usage = usage,
        .size = bytes,
    }) orelse return error.BufferAlloc;
    errdefer {
        c.wgpuBufferDestroy(buf);
        c.wgpuBufferRelease(buf);
    }
    try self.tracked_buffers.put(buf, {});
    self.allocated_vram_bytes += bytes;
@ -50,6 +54,5 @@ pub fn unregisterAndDestroyBuffer(self: *@This(), buf: GpuBuffer) void {
        c.wgpuBufferDestroy(buf.raw);
        c.wgpuBufferRelease(buf.raw);
        self.allocated_vram_bytes -= buf.size;
        self.device.poll();
    }
 }
--- a/src/GpuBuffer.zig
+++ b/src/GpuBuffer.zig
@ -1,5 +1,5 @@
 const std = @import("std");
-const c = @import("c.zig").c;
+const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
 raw: c.WGPUBuffer,
@ -8,7 +8,12 @@ usage: c.WGPUBufferUsage,
 gloc: *GpuAllocator,
 /// Allocates the underlying WebGPU handle and registers it to the parent GpuAllocator
-pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !@This() {
+pub fn init(gloc: *GpuAllocator, T: type, len: usize, usage: c.WGPUBufferUsage) !@This() {
    switch (@typeInfo(T)) {
        .int, .float => {},
        else => @compileError("GpuBuffer can only use int and float type"),
    }
    const bytes = @sizeOf(T) * len;
    const raw_handle = try gloc.registerBuffer(bytes, usage);
    return .{
        .raw = raw_handle,
--- a/src/GpuDevice.zig
+++ b/src/GpuDevice.zig
@ -1,22 +1,26 @@
 const std = @import("std");
-const c = @import("c.zig").c;
+const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 /// Scratch state handed to the wgpu request callbacks as userdata.
 /// Both handles start out null; `init` reports `error.NoAdapter` when
 /// `adapter` is still null after the adapter request.
 const Ctx = struct {
    /// Adapter handle; presumably stored by the adapter-request callback (not shown here).
    adapter: c.WGPUAdapter = null,
    /// Device handle; written by `onDevice` through the userdata pointer.
    device: c.WGPUDevice = null,
 };
 /// Options passed to `init` and stored in the `config` field.
 const GpuDeviceConfig = struct {
    /// VRAM limit in bytes. Defaults to 2 GiB (2 * 1024 * 1024 * 1024).
    vram_bytes_limit: u64 = 2 * 1024 * 1024 * 1024,
 };
 instance: c.WGPUInstance,
 adapter: c.WGPUAdapter,
 device: c.WGPUDevice,
 queue: c.WGPUQueue,
 limits: c.WGPULimits,
-config: struct {
+config: GpuDeviceConfig,
    vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GB
 } = .{},
-pub fn init() !@This() {
+pub fn init(config: GpuDeviceConfig) !@This() {
    const instance = c.wgpuCreateInstance(
        &std.mem.zeroes(c.WGPUInstanceDescriptor),
    ) orelse return error.NoInstance;
@ -32,21 +36,32 @@ pub fn init() !@This() {
    const adapter = ctx.adapter orelse return error.NoAdapter;
    errdefer c.wgpuAdapterRelease(adapter);
-    // --- QUERY HARDWARE LIMITS ---
+    var supported_features = std.mem.zeroes(c.WGPUSupportedFeatures);
    c.wgpuAdapterGetFeatures(adapter, &supported_features);
    var supported_limits = std.mem.zeroes(c.WGPULimits);
    supported_limits.nextInChain = null;
    // Fetch what your physical graphic card can actually handle
    if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
    var has_f16 = false;
    for (0..supported_features.featureCount) |i| {
        if (supported_features.features[i] == c.WGPUFeatureName_ShaderF16) {
            has_f16 = true;
            break;
        }
    }
    var feature_buf = [_]c.WGPUFeatureName{c.WGPUFeatureName_ShaderF16};
    const required_features: []const c.WGPUFeatureName =
        if (has_f16) feature_buf[0..1] else &.{};
    const device_descriptor = c.WGPUDeviceDescriptor{
        .nextInChain = null,
        .label = sv("TensorCompilerDevice"),
-        .requiredFeatureCount = 0,
+        .requiredFeatureCount = required_features.len,
-        .requiredFeatures = null,
+        .requiredFeatures = if (required_features.len > 0) required_features.ptr else null,
        .requiredLimits = &supported_limits,
    };
    _ = c.wgpuAdapterRequestDevice(
        adapter,
        &device_descriptor,
@ -61,6 +76,7 @@ pub fn init() !@This() {
        .device = device,
        .queue = c.wgpuDeviceGetQueue(device),
        .limits = supported_limits,
        .config = config,
    };
 }
@ -104,7 +120,3 @@ fn onDevice(
    const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
    ctx.device = device;
 }
 fn sv(s: []const u8) c.WGPUStringView {
    return .{ .data = s.ptr, .length = s.len };
 }
--- a/src/GpuPipeline.zig
+++ b/src/GpuPipeline.zig
@ -1,6 +1,7 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
-const c = @import("c.zig").c;
+const c = @import("utils.zig").c;
 const sv = @import("utils.zig").sv;
 raw: c.WGPUComputePipeline,
@ -22,7 +23,3 @@ pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
 pub fn deinit(self: @This()) void {
    c.wgpuComputePipelineRelease(self.raw);
 }
 fn sv(s: []const u8) c.WGPUStringView {
    return .{ .data = s.ptr, .length = s.len };
 }
--- a/src/Vec.zig
+++ b/src/Vec.zig
@ -1,6 +1,6 @@
 /// Dummy
 const std = @import("std");
-const c = @import("c.zig").c;
+const c = @import("utils.zig").c;
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuBuffer = @import("GpuBuffer.zig");
 const GpuDevice = @import("GpuDevice.zig");
@ -15,14 +15,15 @@ pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
    return .{
        .buf = try GpuBuffer.init(
            gloc,
-            len * @sizeOf(f32),
+            f16,
            len,
            c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
        ),
        .len = len,
    };
 }
-pub fn initLoad(gloc: *GpuAllocator, data: []const f32) !Vec {
+pub fn initLoad(gloc: *GpuAllocator, data: []const f16) !Vec {
    var self = try initZero(gloc, data.len);
    try self.load(gloc.device, data);
    return self;
@ -36,15 +37,15 @@ pub fn deinit(self: Vec) void {
 pub fn load(
    self: Vec,
    device: GpuDevice,
-    data: []const f32,
+    data: []const f16,
 ) !void {
    std.debug.assert(data.len == self.len);
-    const bytes = data.len * @sizeOf(f32);
+    const bytes = self.byteSize();
    c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, bytes);
 }
 pub fn byteSize(self: Vec) u64 {
-    return @as(u64, self.len) * @sizeOf(f32);
+    return @as(u64, self.len) * @sizeOf(f16);
 }
 pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
@ -59,13 +60,14 @@ pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
 }
 /// GPU to CPU.
-pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
+pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f16 {
-    const out = try alloc.alloc(f32, self.len);
+    const out = try alloc.alloc(f16, self.len);
    const bytes = self.byteSize();
    const staging = try GpuBuffer.init(
        gloc,
-        bytes,
+        f16,
        self.len,
        c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
    );
    defer staging.deinit();
@ -86,7 +88,7 @@ pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
    );
    while (!mapped) gloc.device.poll();
-    const ptr: [*]const f32 = @ptrCast(@alignCast(
+    const ptr: [*]const f16 = @ptrCast(@alignCast(
        staging.getConstMappedRange(0, bytes),
    ));
    @memcpy(out[0..self.len], ptr[0..self.len]);
@ -120,12 +122,13 @@ fn dispatch2in1out(
    while (offset < bytes) {
        // Calculate bounds for the current chunk
        const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
-        const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f32));
+        const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f16));
        // Create uniform buffer for this specific chunk's size
        const info_buf = try GpuBuffer.init(
            gloc,
-            @sizeOf(u32),
+            u32,
            1,
            c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst,
        );
        defer info_buf.deinit();
--- a/src/bench.zig
+++ b/src/bench.zig
@ -4,10 +4,10 @@ const GpuAllocator = @import("GpuAllocator.zig");
 const GpuPipeline = @import("GpuPipeline.zig");
 const Vec = @import("Vec.zig");
-const c = @import("c.zig").c;
+const c = @import("utils.zig").c;
 pub fn main(init: std.process.Init) !void {
-    const device = try GpuDevice.init();
+    const device = try GpuDevice.init(.{ .vram_bytes_limit = 4 * 1024 * 1024 * 1024 });
    defer device.deinit();
    var gloc = try GpuAllocator.init(init.gpa, device);
@ -20,8 +20,8 @@ pub fn main(init: std.process.Init) !void {
    // --- WARM-UP PHASE ---
    {
-        var warmup_a = [_]f32{1.0};
+        var warmup_a = [_]f16{1.0};
-        var warmup_b = [_]f32{1.0};
+        var warmup_b = [_]f16{1.0};
        const wa = try Vec.initLoad(&gloc, &warmup_a);
        defer wa.deinit();
        const wb = try Vec.initLoad(&gloc, &warmup_b);
@ -41,11 +41,11 @@ pub fn main(init: std.process.Init) !void {
        4 * 4 * 4 * 1024,
        4 * 4 * 4 * 4 * 1024,
        1024 * 1024,
-        4 * 1024 * 1024,
+        // 4 * 1024 * 1024,
-        4 * 4 * 1024 * 1024,
+        // 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 4 * 1024 * 1024,
-        4 * 4 * 4 * 4 * 4 * 1024 * 1024,
+        // 4 * 4 * 4 * 4 * 4 * 1024 * 1024,
    };
    const iterations = 10;
@ -56,9 +56,9 @@ pub fn main(init: std.process.Init) !void {
    for (sizes) |size| {
        // --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
-        var data_a = try allocator.alloc(f32, size);
+        const data_a = try allocator.alloc(f16, size);
        defer allocator.free(data_a);
-        var data_b = try allocator.alloc(f32, size);
+        const data_b = try allocator.alloc(f16, size);
        defer allocator.free(data_b);
        for (0..size) |i| {
@ -72,7 +72,7 @@ pub fn main(init: std.process.Init) !void {
        var min_compute_ns: u64 = std.math.maxInt(u64);
        // Track peak VRAM usage observed during the iterations
-        var peak_vram_bytes: usize = 0;
+        var peak_vram_bytes: u64 = 0;
        for (0..iterations) |_| {
            // --- 1. GPU ALLOCATION PHASE ---
@ -95,9 +95,8 @@ pub fn main(init: std.process.Init) !void {
            // All 3 buffers (a, b, sum) are currently resident in VRAM here.
            // Querying now catches the true peak allocation step.
-            if (gloc.allocated_vram_bytes > peak_vram_bytes) {
+            if (gloc.allocated_vram_bytes > peak_vram_bytes)
                peak_vram_bytes = gloc.allocated_vram_bytes;
            }
            _ = c.wgpuDevicePoll(device.device, 1, null);
@ -118,7 +117,7 @@ pub fn main(init: std.process.Init) !void {
        // --- Metrics Calculations ---
        const f_size = @as(f64, @floatFromInt(size));
-        const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f32)));
+        const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f16)));
        const mb = element_bytes / (1024.0 * 1024.0);
        // Individual Phase Timings (ms)
--- a/src/c.zig
+++ b/src/c.zig
@ -1 +0,0 @@
 pub const c = @cImport(@cInclude("wgpu.h"));
--- a/src/example.zig
+++ b/src/example.zig
@ -0,0 +1,43 @@
 const std = @import("std");
 const GpuDevice = @import("GpuDevice.zig");
 const GpuAllocator = @import("GpuAllocator.zig");
 const GpuPipeline = @import("GpuPipeline.zig");
 const Vec = @import("Vec.zig");
 const c = @import("utils.zig").c;
 /// Minimal end-to-end example: uploads two 1024-element f16 vectors, runs the
 /// `shaders/add.wgsl` pipeline on them, reads the result back and prints it.
 /// Element `i` of the inputs is `i` and `1023 - i` respectively.
 pub fn main(init: std.process.Init) !void {
    const gpa = init.gpa;
    const element_count = 1024;

    // GPU context: device first, then the allocator that tracks its buffers.
    const device = try GpuDevice.init(.{ .vram_bytes_limit = 4 * 1024 * 1024 * 1024 });
    defer device.deinit();
    var gloc = try GpuAllocator.init(gpa, device);
    defer gloc.deinit();
    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
    defer add_pip.deinit();

    // Host-side input data.
    const lhs_host = try gpa.alloc(f16, element_count);
    defer gpa.free(lhs_host);
    const rhs_host = try gpa.alloc(f16, element_count);
    defer gpa.free(rhs_host);
    for (lhs_host, rhs_host, 0..) |*lhs, *rhs, idx| {
        lhs.* = @floatFromInt(idx);
        rhs.* = @floatFromInt(element_count - 1 - idx);
    }

    // Upload both operands, run the pipeline, and fetch the output vector.
    const a = try Vec.initLoad(&gloc, lhs_host);
    defer a.deinit();
    const b = try Vec.initLoad(&gloc, rhs_host);
    defer b.deinit();
    const sum = try a.run(&gloc, b, add_pip);
    defer sum.deinit();

    // `read` returns a slice allocated with `gpa`; we own and free it.
    const result = try sum.read(&gloc, gpa);
    defer gpa.free(result);
    std.debug.print("{any}\n", .{result});
 }
--- a/src/shaders/add.wgsl
+++ b/src/shaders/add.wgsl
@ -1,6 +1,8 @@
-@group(0) @binding(0) var<storage, read> A: array<f32>;
+enable f16;
-@group(0) @binding(1) var<storage, read> B: array<f32>;
+
-@group(0) @binding(2) var<storage, read_write> C: array<f32>;
+@group(0) @binding(0) var<storage, read> A: array<f16>;
@group(0) @binding(1) var<storage, read> B: array<f16>;
@group(0) @binding(2) var<storage, read_write> C: array<f16>;
 struct TensorInfo {
    size: u32,
--- a/src/utils.zig
+++ b/src/utils.zig
@ -0,0 +1,5 @@
 pub const c = @cImport(@cInclude("wgpu.h"));
 /// Builds a `WGPUStringView` from a Zig byte slice by passing along its
 /// pointer and length. No bytes are copied, so the view refers to the same
 /// memory as `s`.
 pub fn sv(s: []const u8) c.WGPUStringView {
    const view: c.WGPUStringView = .{
        .data = s.ptr,
        .length = s.len,
    };
    return view;
 }
Author	SHA1	Message	Date
adrien	d42c521a96	Added f16 capability	2026-05-18 10:12:36 +02:00
adrien	0fc5f5dbb8	GpuBuffer.init now takes an element type (any int or float) and a length	2026-05-18 09:31:58 +02:00
adrien	545e67d72f	Added an example.zig	2026-05-18 00:19:37 +02:00
adrien	0fcb9ee351	Syntax improvements + GpuDeviceConfig	2026-05-18 00:10:09 +02:00
		`@ -1 +0,0 @@`
			`pub const c = @cImport(@cInclude("wgpu.h"));`