Added f16 capability
This commit is contained in:
parent
0fc5f5dbb8
commit
d42c521a96
@ -36,21 +36,32 @@ pub fn init(config: GpuDeviceConfig) !@This() {
|
|||||||
const adapter = ctx.adapter orelse return error.NoAdapter;
|
const adapter = ctx.adapter orelse return error.NoAdapter;
|
||||||
errdefer c.wgpuAdapterRelease(adapter);
|
errdefer c.wgpuAdapterRelease(adapter);
|
||||||
|
|
||||||
// --- QUERY HARDWARE LIMITS ---
|
var supported_features = std.mem.zeroes(c.WGPUSupportedFeatures);
|
||||||
|
c.wgpuAdapterGetFeatures(adapter, &supported_features);
|
||||||
|
|
||||||
var supported_limits = std.mem.zeroes(c.WGPULimits);
|
var supported_limits = std.mem.zeroes(c.WGPULimits);
|
||||||
supported_limits.nextInChain = null;
|
supported_limits.nextInChain = null;
|
||||||
|
|
||||||
// Fetch what your physical graphic card can actually handle
|
|
||||||
if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
|
if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
|
||||||
|
|
||||||
|
var has_f16 = false;
|
||||||
|
for (0..supported_features.featureCount) |i| {
|
||||||
|
if (supported_features.features[i] == c.WGPUFeatureName_ShaderF16) {
|
||||||
|
has_f16 = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var feature_buf = [_]c.WGPUFeatureName{c.WGPUFeatureName_ShaderF16};
|
||||||
|
const required_features: []const c.WGPUFeatureName =
|
||||||
|
if (has_f16) feature_buf[0..1] else &.{};
|
||||||
|
|
||||||
const device_descriptor = c.WGPUDeviceDescriptor{
|
const device_descriptor = c.WGPUDeviceDescriptor{
|
||||||
.nextInChain = null,
|
.nextInChain = null,
|
||||||
.label = sv("TensorCompilerDevice"),
|
.label = sv("TensorCompilerDevice"),
|
||||||
.requiredFeatureCount = 0,
|
.requiredFeatureCount = required_features.len,
|
||||||
.requiredFeatures = null,
|
.requiredFeatures = if (required_features.len > 0) required_features.ptr else null,
|
||||||
.requiredLimits = &supported_limits,
|
.requiredLimits = &supported_limits,
|
||||||
};
|
};
|
||||||
|
|
||||||
_ = c.wgpuAdapterRequestDevice(
|
_ = c.wgpuAdapterRequestDevice(
|
||||||
adapter,
|
adapter,
|
||||||
&device_descriptor,
|
&device_descriptor,
|
||||||
|
|||||||
20
src/Vec.zig
20
src/Vec.zig
@ -15,7 +15,7 @@ pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
|
|||||||
return .{
|
return .{
|
||||||
.buf = try GpuBuffer.init(
|
.buf = try GpuBuffer.init(
|
||||||
gloc,
|
gloc,
|
||||||
f32,
|
f16,
|
||||||
len,
|
len,
|
||||||
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
||||||
),
|
),
|
||||||
@ -23,7 +23,7 @@ pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn initLoad(gloc: *GpuAllocator, data: []const f32) !Vec {
|
pub fn initLoad(gloc: *GpuAllocator, data: []const f16) !Vec {
|
||||||
var self = try initZero(gloc, data.len);
|
var self = try initZero(gloc, data.len);
|
||||||
try self.load(gloc.device, data);
|
try self.load(gloc.device, data);
|
||||||
return self;
|
return self;
|
||||||
@ -37,15 +37,15 @@ pub fn deinit(self: Vec) void {
|
|||||||
pub fn load(
|
pub fn load(
|
||||||
self: Vec,
|
self: Vec,
|
||||||
device: GpuDevice,
|
device: GpuDevice,
|
||||||
data: []const f32,
|
data: []const f16,
|
||||||
) !void {
|
) !void {
|
||||||
std.debug.assert(data.len == self.len);
|
std.debug.assert(data.len == self.len);
|
||||||
const bytes = data.len * @sizeOf(f32);
|
const bytes = self.byteSize();
|
||||||
c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, bytes);
|
c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, bytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn byteSize(self: Vec) u64 {
|
pub fn byteSize(self: Vec) u64 {
|
||||||
return @as(u64, self.len) * @sizeOf(f32);
|
return @as(u64, self.len) * @sizeOf(f16);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
|
pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
|
||||||
@ -60,13 +60,13 @@ pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// GPU to CPU.
|
/// GPU to CPU.
|
||||||
pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
|
pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f16 {
|
||||||
const out = try alloc.alloc(f32, self.len);
|
const out = try alloc.alloc(f16, self.len);
|
||||||
const bytes = self.byteSize();
|
const bytes = self.byteSize();
|
||||||
|
|
||||||
const staging = try GpuBuffer.init(
|
const staging = try GpuBuffer.init(
|
||||||
gloc,
|
gloc,
|
||||||
f32,
|
f16,
|
||||||
self.len,
|
self.len,
|
||||||
c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
|
c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
|
||||||
);
|
);
|
||||||
@ -88,7 +88,7 @@ pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
|
|||||||
);
|
);
|
||||||
while (!mapped) gloc.device.poll();
|
while (!mapped) gloc.device.poll();
|
||||||
|
|
||||||
const ptr: [*]const f32 = @ptrCast(@alignCast(
|
const ptr: [*]const f16 = @ptrCast(@alignCast(
|
||||||
staging.getConstMappedRange(0, bytes),
|
staging.getConstMappedRange(0, bytes),
|
||||||
));
|
));
|
||||||
@memcpy(out[0..self.len], ptr[0..self.len]);
|
@memcpy(out[0..self.len], ptr[0..self.len]);
|
||||||
@ -122,7 +122,7 @@ fn dispatch2in1out(
|
|||||||
while (offset < bytes) {
|
while (offset < bytes) {
|
||||||
// Calculate bounds for the current chunk
|
// Calculate bounds for the current chunk
|
||||||
const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
|
const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
|
||||||
const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f32));
|
const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f16));
|
||||||
|
|
||||||
// Create uniform buffer for this specific chunk's size
|
// Create uniform buffer for this specific chunk's size
|
||||||
const info_buf = try GpuBuffer.init(
|
const info_buf = try GpuBuffer.init(
|
||||||
|
|||||||
@ -20,8 +20,8 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
|
|
||||||
// --- WARM-UP PHASE ---
|
// --- WARM-UP PHASE ---
|
||||||
{
|
{
|
||||||
var warmup_a = [_]f32{1.0};
|
var warmup_a = [_]f16{1.0};
|
||||||
var warmup_b = [_]f32{1.0};
|
var warmup_b = [_]f16{1.0};
|
||||||
const wa = try Vec.initLoad(&gloc, &warmup_a);
|
const wa = try Vec.initLoad(&gloc, &warmup_a);
|
||||||
defer wa.deinit();
|
defer wa.deinit();
|
||||||
const wb = try Vec.initLoad(&gloc, &warmup_b);
|
const wb = try Vec.initLoad(&gloc, &warmup_b);
|
||||||
@ -56,9 +56,9 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
|
|
||||||
for (sizes) |size| {
|
for (sizes) |size| {
|
||||||
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
|
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
|
||||||
const data_a = try allocator.alloc(f32, size);
|
const data_a = try allocator.alloc(f16, size);
|
||||||
defer allocator.free(data_a);
|
defer allocator.free(data_a);
|
||||||
const data_b = try allocator.alloc(f32, size);
|
const data_b = try allocator.alloc(f16, size);
|
||||||
defer allocator.free(data_b);
|
defer allocator.free(data_b);
|
||||||
|
|
||||||
for (0..size) |i| {
|
for (0..size) |i| {
|
||||||
@ -117,7 +117,7 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
|
|
||||||
// --- Metrics Calculations ---
|
// --- Metrics Calculations ---
|
||||||
const f_size = @as(f64, @floatFromInt(size));
|
const f_size = @as(f64, @floatFromInt(size));
|
||||||
const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f32)));
|
const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f16)));
|
||||||
const mb = element_bytes / (1024.0 * 1024.0);
|
const mb = element_bytes / (1024.0 * 1024.0);
|
||||||
|
|
||||||
// Individual Phase Timings (ms)
|
// Individual Phase Timings (ms)
|
||||||
|
|||||||
@ -18,9 +18,9 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
|
const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
|
||||||
defer add_pip.deinit();
|
defer add_pip.deinit();
|
||||||
|
|
||||||
const data_a = try allocator.alloc(f32, 1024);
|
const data_a = try allocator.alloc(f16, 1024);
|
||||||
defer allocator.free(data_a);
|
defer allocator.free(data_a);
|
||||||
const data_b = try allocator.alloc(f32, 1024);
|
const data_b = try allocator.alloc(f16, 1024);
|
||||||
defer allocator.free(data_b);
|
defer allocator.free(data_b);
|
||||||
|
|
||||||
for (0..1024) |i| {
|
for (0..1024) |i| {
|
||||||
|
|||||||
@ -1,6 +1,8 @@
|
|||||||
@group(0) @binding(0) var<storage, read> A: array<f32>;
|
enable f16;
|
||||||
@group(0) @binding(1) var<storage, read> B: array<f32>;
|
|
||||||
@group(0) @binding(2) var<storage, read_write> C: array<f32>;
|
@group(0) @binding(0) var<storage, read> A: array<f16>;
|
||||||
|
@group(0) @binding(1) var<storage, read> B: array<f16>;
|
||||||
|
@group(0) @binding(2) var<storage, read_write> C: array<f16>;
|
||||||
|
|
||||||
struct TensorInfo {
|
struct TensorInfo {
|
||||||
size: u32,
|
size: u32,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user