Added f16 capability

This commit is contained in:
adrien 2026-05-18 10:12:36 +02:00
parent 0fc5f5dbb8
commit d42c521a96
5 changed files with 39 additions and 26 deletions

View File

@ -36,21 +36,32 @@ pub fn init(config: GpuDeviceConfig) !@This() {
const adapter = ctx.adapter orelse return error.NoAdapter; const adapter = ctx.adapter orelse return error.NoAdapter;
errdefer c.wgpuAdapterRelease(adapter); errdefer c.wgpuAdapterRelease(adapter);
// --- QUERY HARDWARE LIMITS --- var supported_features = std.mem.zeroes(c.WGPUSupportedFeatures);
c.wgpuAdapterGetFeatures(adapter, &supported_features);
var supported_limits = std.mem.zeroes(c.WGPULimits); var supported_limits = std.mem.zeroes(c.WGPULimits);
supported_limits.nextInChain = null; supported_limits.nextInChain = null;
// Fetch what your physical graphic card can actually handle
if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits; if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
var has_f16 = false;
for (0..supported_features.featureCount) |i| {
if (supported_features.features[i] == c.WGPUFeatureName_ShaderF16) {
has_f16 = true;
break;
}
}
var feature_buf = [_]c.WGPUFeatureName{c.WGPUFeatureName_ShaderF16};
const required_features: []const c.WGPUFeatureName =
if (has_f16) feature_buf[0..1] else &.{};
const device_descriptor = c.WGPUDeviceDescriptor{ const device_descriptor = c.WGPUDeviceDescriptor{
.nextInChain = null, .nextInChain = null,
.label = sv("TensorCompilerDevice"), .label = sv("TensorCompilerDevice"),
.requiredFeatureCount = 0, .requiredFeatureCount = required_features.len,
.requiredFeatures = null, .requiredFeatures = if (required_features.len > 0) required_features.ptr else null,
.requiredLimits = &supported_limits, .requiredLimits = &supported_limits,
}; };
_ = c.wgpuAdapterRequestDevice( _ = c.wgpuAdapterRequestDevice(
adapter, adapter,
&device_descriptor, &device_descriptor,

View File

@ -15,7 +15,7 @@ pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
return .{ return .{
.buf = try GpuBuffer.init( .buf = try GpuBuffer.init(
gloc, gloc,
f32, f16,
len, len,
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc, c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
), ),
@ -23,7 +23,7 @@ pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
}; };
} }
pub fn initLoad(gloc: *GpuAllocator, data: []const f32) !Vec { pub fn initLoad(gloc: *GpuAllocator, data: []const f16) !Vec {
var self = try initZero(gloc, data.len); var self = try initZero(gloc, data.len);
try self.load(gloc.device, data); try self.load(gloc.device, data);
return self; return self;
@ -37,15 +37,15 @@ pub fn deinit(self: Vec) void {
pub fn load( pub fn load(
self: Vec, self: Vec,
device: GpuDevice, device: GpuDevice,
data: []const f32, data: []const f16,
) !void { ) !void {
std.debug.assert(data.len == self.len); std.debug.assert(data.len == self.len);
const bytes = data.len * @sizeOf(f32); const bytes = self.byteSize();
c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, bytes); c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, bytes);
} }
pub fn byteSize(self: Vec) u64 { pub fn byteSize(self: Vec) u64 {
return @as(u64, self.len) * @sizeOf(f32); return @as(u64, self.len) * @sizeOf(f16);
} }
pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec { pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
@ -60,13 +60,13 @@ pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
} }
/// GPU to CPU. /// GPU to CPU.
pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 { pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f16 {
const out = try alloc.alloc(f32, self.len); const out = try alloc.alloc(f16, self.len);
const bytes = self.byteSize(); const bytes = self.byteSize();
const staging = try GpuBuffer.init( const staging = try GpuBuffer.init(
gloc, gloc,
f32, f16,
self.len, self.len,
c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst, c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
); );
@ -88,7 +88,7 @@ pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
); );
while (!mapped) gloc.device.poll(); while (!mapped) gloc.device.poll();
const ptr: [*]const f32 = @ptrCast(@alignCast( const ptr: [*]const f16 = @ptrCast(@alignCast(
staging.getConstMappedRange(0, bytes), staging.getConstMappedRange(0, bytes),
)); ));
@memcpy(out[0..self.len], ptr[0..self.len]); @memcpy(out[0..self.len], ptr[0..self.len]);
@ -122,7 +122,7 @@ fn dispatch2in1out(
while (offset < bytes) { while (offset < bytes) {
// Calculate bounds for the current chunk // Calculate bounds for the current chunk
const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset); const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f32)); const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f16));
// Create uniform buffer for this specific chunk's size // Create uniform buffer for this specific chunk's size
const info_buf = try GpuBuffer.init( const info_buf = try GpuBuffer.init(

View File

@ -20,8 +20,8 @@ pub fn main(init: std.process.Init) !void {
// --- WARM-UP PHASE --- // --- WARM-UP PHASE ---
{ {
var warmup_a = [_]f32{1.0}; var warmup_a = [_]f16{1.0};
var warmup_b = [_]f32{1.0}; var warmup_b = [_]f16{1.0};
const wa = try Vec.initLoad(&gloc, &warmup_a); const wa = try Vec.initLoad(&gloc, &warmup_a);
defer wa.deinit(); defer wa.deinit();
const wb = try Vec.initLoad(&gloc, &warmup_b); const wb = try Vec.initLoad(&gloc, &warmup_b);
@ -56,9 +56,9 @@ pub fn main(init: std.process.Init) !void {
for (sizes) |size| { for (sizes) |size| {
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) --- // --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
const data_a = try allocator.alloc(f32, size); const data_a = try allocator.alloc(f16, size);
defer allocator.free(data_a); defer allocator.free(data_a);
const data_b = try allocator.alloc(f32, size); const data_b = try allocator.alloc(f16, size);
defer allocator.free(data_b); defer allocator.free(data_b);
for (0..size) |i| { for (0..size) |i| {
@ -117,7 +117,7 @@ pub fn main(init: std.process.Init) !void {
// --- Metrics Calculations --- // --- Metrics Calculations ---
const f_size = @as(f64, @floatFromInt(size)); const f_size = @as(f64, @floatFromInt(size));
const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f32))); const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f16)));
const mb = element_bytes / (1024.0 * 1024.0); const mb = element_bytes / (1024.0 * 1024.0);
// Individual Phase Timings (ms) // Individual Phase Timings (ms)

View File

@ -18,9 +18,9 @@ pub fn main(init: std.process.Init) !void {
const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl")); const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
defer add_pip.deinit(); defer add_pip.deinit();
const data_a = try allocator.alloc(f32, 1024); const data_a = try allocator.alloc(f16, 1024);
defer allocator.free(data_a); defer allocator.free(data_a);
const data_b = try allocator.alloc(f32, 1024); const data_b = try allocator.alloc(f16, 1024);
defer allocator.free(data_b); defer allocator.free(data_b);
for (0..1024) |i| { for (0..1024) |i| {

View File

@ -1,6 +1,8 @@
@group(0) @binding(0) var<storage, read> A: array<f32>; enable f16;
@group(0) @binding(1) var<storage, read> B: array<f32>;
@group(0) @binding(2) var<storage, read_write> C: array<f32>; @group(0) @binding(0) var<storage, read> A: array<f16>;
@group(0) @binding(1) var<storage, read> B: array<f16>;
@group(0) @binding(2) var<storage, read_write> C: array<f16>;
struct TensorInfo { struct TensorInfo {
size: u32, size: u32,