Compare commits
4 Commits
6a2cbe2734
...
d42c521a96
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d42c521a96 | ||
|
|
0fc5f5dbb8 | ||
|
|
545e67d72f | ||
|
|
0fcb9ee351 |
37
build.zig
37
build.zig
@ -29,7 +29,7 @@ pub fn build(b: *std.Build) void {
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
}),
|
||||
.name = "gpu_matrix_add",
|
||||
.name = "bench",
|
||||
});
|
||||
|
||||
// wgpu-native headers + pre-built static library
|
||||
@ -57,5 +57,38 @@ pub fn build(b: *std.Build) void {
|
||||
|
||||
const run = b.addRunArtifact(exe);
|
||||
run.step.dependOn(b.getInstallStep());
|
||||
b.step("bench", "Benchmark a simple add vector").dependOn(&run.step);
|
||||
b.step("bench", "Benchmark a simple add vector.").dependOn(&run.step);
|
||||
|
||||
// Example executable: built from src/example.zig with the same libc and
// wgpu-native link setup as the benchmark executable.
const exe_examp = b.addExecutable(.{
    .root_module = b.createModule(.{
        .root_source_file = b.path("src/example.zig"),
        .link_libc = true,
        .target = target,
        .optimize = optimize,
    }),
    // Own artifact name: the benchmark executable is already called "bench",
    // so reusing that name here made the two artifacts indistinguishable.
    .name = "example",
});

// wgpu-native headers + pre-built static library
exe_examp.root_module.addIncludePath(b.path("libs/wgpu-native/include"));
exe_examp.root_module.addLibraryPath(b.path("libs/wgpu-native/lib"));
exe_examp.root_module.addObjectFile(b.path("libs/wgpu-native/lib/libwgpu_native.a"));

// Per-OS system libraries / frameworks linked next to the wgpu-native static library.
switch (t.os.tag) {
    .macos => {
        exe_examp.root_module.linkFramework("Metal", .{});
        exe_examp.root_module.linkFramework("QuartzCore", .{});
        exe_examp.root_module.linkFramework("Foundation", .{});
        exe_examp.root_module.linkFramework("CoreGraphics", .{});
    },
    .windows => {
        exe_examp.root_module.linkSystemLibrary("d3d12", .{});
        exe_examp.root_module.linkSystemLibrary("dxgi", .{});
        exe_examp.root_module.linkSystemLibrary("user32", .{});
    },
    // Every other target takes the Vulkan path.
    else => {
        exe_examp.root_module.linkSystemLibrary("vulkan", .{});
        exe_examp.root_module.linkSystemLibrary("gcc_s", .{});
    },
}

const examp = b.addRunArtifact(exe_examp);
// Attach the install dependency to the example's own run step. The previous
// line added it to the benchmark's `run` step a second time, leaving `examp`
// without it.
examp.step.dependOn(b.getInstallStep());
b.step("example", "Run basic example.").dependOn(&examp.step);
|
||||
}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
const std = @import("std");
|
||||
const GpuDevice = @import("GpuDevice.zig");
|
||||
const GpuBuffer = @import("GpuBuffer.zig");
|
||||
const c = @import("c.zig").c;
|
||||
const c = @import("utils.zig").c;
|
||||
|
||||
device: GpuDevice,
|
||||
tracked_buffers: std.AutoHashMap(c.WGPUBuffer, void),
|
||||
@ -39,6 +39,10 @@ pub fn registerBuffer(
|
||||
.usage = usage,
|
||||
.size = bytes,
|
||||
}) orelse return error.BufferAlloc;
|
||||
errdefer {
|
||||
c.wgpuBufferDestroy(buf);
|
||||
c.wgpuBufferRelease(buf);
|
||||
}
|
||||
|
||||
try self.tracked_buffers.put(buf, {});
|
||||
self.allocated_vram_bytes += bytes;
|
||||
@ -50,6 +54,5 @@ pub fn unregisterAndDestroyBuffer(self: *@This(), buf: GpuBuffer) void {
|
||||
c.wgpuBufferDestroy(buf.raw);
|
||||
c.wgpuBufferRelease(buf.raw);
|
||||
self.allocated_vram_bytes -= buf.size;
|
||||
self.device.poll();
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
const std = @import("std");
|
||||
const c = @import("c.zig").c;
|
||||
const c = @import("utils.zig").c;
|
||||
const GpuAllocator = @import("GpuAllocator.zig");
|
||||
|
||||
raw: c.WGPUBuffer,
|
||||
@ -8,7 +8,12 @@ usage: c.WGPUBufferUsage,
|
||||
gloc: *GpuAllocator,
|
||||
|
||||
/// Allocates the underlying WebGPU handle and registers it to the parent GpuAllocator
|
||||
pub fn init(gloc: *GpuAllocator, bytes: u64, usage: c.WGPUBufferUsage) !@This() {
|
||||
pub fn init(gloc: *GpuAllocator, T: type, len: usize, usage: c.WGPUBufferUsage) !@This() {
|
||||
switch (@typeInfo(T)) {
|
||||
.int, .float => {},
|
||||
else => @compileError("GpuBuffer can only use int and float type"),
|
||||
}
|
||||
const bytes = @sizeOf(T) * len;
|
||||
const raw_handle = try gloc.registerBuffer(bytes, usage);
|
||||
return .{
|
||||
.raw = raw_handle,
|
||||
|
||||
@ -1,22 +1,26 @@
|
||||
const std = @import("std");
|
||||
const c = @import("c.zig").c;
|
||||
const c = @import("utils.zig").c;
|
||||
const sv = @import("utils.zig").sv;
|
||||
|
||||
/// Scratch state handed to the wgpu request callbacks as user data; the
/// callbacks store the handles they receive here so `init` can read them back.
const Ctx = struct {
|
||||
/// Adapter handle; `init` returns `error.NoAdapter` if this is still null.
adapter: c.WGPUAdapter = null,
|
||||
/// Device handle; assigned by the `onDevice` callback.
device: c.WGPUDevice = null,
|
||||
};
|
||||
|
||||
/// Options accepted by `init`.
const GpuDeviceConfig = struct {
    /// VRAM limit in bytes. Defaults to 2 GiB (2 * 2^30 bytes).
    vram_bytes_limit: u64 = 2 * (1 << 30),
};
|
||||
|
||||
instance: c.WGPUInstance,
|
||||
adapter: c.WGPUAdapter,
|
||||
device: c.WGPUDevice,
|
||||
queue: c.WGPUQueue,
|
||||
limits: c.WGPULimits,
|
||||
|
||||
config: struct {
|
||||
vram_bytes_limit: u64 = 10 * 1024 * 1024 * 1024, // 10 GB
|
||||
} = .{},
|
||||
config: GpuDeviceConfig,
|
||||
|
||||
pub fn init() !@This() {
|
||||
pub fn init(config: GpuDeviceConfig) !@This() {
|
||||
const instance = c.wgpuCreateInstance(
|
||||
&std.mem.zeroes(c.WGPUInstanceDescriptor),
|
||||
) orelse return error.NoInstance;
|
||||
@ -32,21 +36,32 @@ pub fn init() !@This() {
|
||||
const adapter = ctx.adapter orelse return error.NoAdapter;
|
||||
errdefer c.wgpuAdapterRelease(adapter);
|
||||
|
||||
// --- QUERY HARDWARE LIMITS ---
|
||||
var supported_features = std.mem.zeroes(c.WGPUSupportedFeatures);
|
||||
c.wgpuAdapterGetFeatures(adapter, &supported_features);
|
||||
|
||||
var supported_limits = std.mem.zeroes(c.WGPULimits);
|
||||
supported_limits.nextInChain = null;
|
||||
|
||||
// Fetch what the physical graphics card can actually handle
|
||||
if (c.wgpuAdapterGetLimits(adapter, &supported_limits) != 1) return error.FailedToGetAdapterLimits;
|
||||
|
||||
var has_f16 = false;
|
||||
for (0..supported_features.featureCount) |i| {
|
||||
if (supported_features.features[i] == c.WGPUFeatureName_ShaderF16) {
|
||||
has_f16 = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
var feature_buf = [_]c.WGPUFeatureName{c.WGPUFeatureName_ShaderF16};
|
||||
const required_features: []const c.WGPUFeatureName =
|
||||
if (has_f16) feature_buf[0..1] else &.{};
|
||||
|
||||
const device_descriptor = c.WGPUDeviceDescriptor{
|
||||
.nextInChain = null,
|
||||
.label = sv("TensorCompilerDevice"),
|
||||
.requiredFeatureCount = 0,
|
||||
.requiredFeatures = null,
|
||||
.requiredFeatureCount = required_features.len,
|
||||
.requiredFeatures = if (required_features.len > 0) required_features.ptr else null,
|
||||
.requiredLimits = &supported_limits,
|
||||
};
|
||||
|
||||
_ = c.wgpuAdapterRequestDevice(
|
||||
adapter,
|
||||
&device_descriptor,
|
||||
@ -61,6 +76,7 @@ pub fn init() !@This() {
|
||||
.device = device,
|
||||
.queue = c.wgpuDeviceGetQueue(device),
|
||||
.limits = supported_limits,
|
||||
.config = config,
|
||||
};
|
||||
}
|
||||
|
||||
@ -104,7 +120,3 @@ fn onDevice(
|
||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||
ctx.device = device;
|
||||
}
|
||||
|
||||
fn sv(s: []const u8) c.WGPUStringView {
|
||||
return .{ .data = s.ptr, .length = s.len };
|
||||
}
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
const std = @import("std");
|
||||
const GpuDevice = @import("GpuDevice.zig");
|
||||
const c = @import("c.zig").c;
|
||||
const c = @import("utils.zig").c;
|
||||
const sv = @import("utils.zig").sv;
|
||||
|
||||
raw: c.WGPUComputePipeline,
|
||||
|
||||
@ -22,7 +23,3 @@ pub fn init(device: GpuDevice, wgsl: []const u8) !@This() {
|
||||
/// Releases the wrapped `WGPUComputePipeline` handle.
pub fn deinit(self: @This()) void {
    const pipeline = self.raw;
    c.wgpuComputePipelineRelease(pipeline);
}
|
||||
|
||||
fn sv(s: []const u8) c.WGPUStringView {
|
||||
return .{ .data = s.ptr, .length = s.len };
|
||||
}
|
||||
|
||||
27
src/Vec.zig
27
src/Vec.zig
@ -1,6 +1,6 @@
|
||||
/// Dummy
|
||||
const std = @import("std");
|
||||
const c = @import("c.zig").c;
|
||||
const c = @import("utils.zig").c;
|
||||
const GpuAllocator = @import("GpuAllocator.zig");
|
||||
const GpuBuffer = @import("GpuBuffer.zig");
|
||||
const GpuDevice = @import("GpuDevice.zig");
|
||||
@ -15,14 +15,15 @@ pub fn initZero(gloc: *GpuAllocator, len: usize) !Vec {
|
||||
return .{
|
||||
.buf = try GpuBuffer.init(
|
||||
gloc,
|
||||
len * @sizeOf(f32),
|
||||
f16,
|
||||
len,
|
||||
c.WGPUBufferUsage_Storage | c.WGPUBufferUsage_CopyDst | c.WGPUBufferUsage_CopySrc,
|
||||
),
|
||||
.len = len,
|
||||
};
|
||||
}
|
||||
|
||||
pub fn initLoad(gloc: *GpuAllocator, data: []const f32) !Vec {
|
||||
pub fn initLoad(gloc: *GpuAllocator, data: []const f16) !Vec {
|
||||
var self = try initZero(gloc, data.len);
|
||||
try self.load(gloc.device, data);
|
||||
return self;
|
||||
@ -36,15 +37,15 @@ pub fn deinit(self: Vec) void {
|
||||
pub fn load(
|
||||
self: Vec,
|
||||
device: GpuDevice,
|
||||
data: []const f32,
|
||||
data: []const f16,
|
||||
) !void {
|
||||
std.debug.assert(data.len == self.len);
|
||||
const bytes = data.len * @sizeOf(f32);
|
||||
const bytes = self.byteSize();
|
||||
c.wgpuQueueWriteBuffer(device.queue, self.buf.raw, 0, data.ptr, bytes);
|
||||
}
|
||||
|
||||
pub fn byteSize(self: Vec) u64 {
|
||||
return @as(u64, self.len) * @sizeOf(f32);
|
||||
return @as(u64, self.len) * @sizeOf(f16);
|
||||
}
|
||||
|
||||
pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
|
||||
@ -59,13 +60,14 @@ pub fn run(self: Vec, gloc: *GpuAllocator, other: Vec, pip: GpuPipeline) !Vec {
|
||||
}
|
||||
|
||||
/// GPU to CPU.
|
||||
pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
|
||||
const out = try alloc.alloc(f32, self.len);
|
||||
pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f16 {
|
||||
const out = try alloc.alloc(f16, self.len);
|
||||
const bytes = self.byteSize();
|
||||
|
||||
const staging = try GpuBuffer.init(
|
||||
gloc,
|
||||
bytes,
|
||||
f16,
|
||||
self.len,
|
||||
c.WGPUBufferUsage_MapRead | c.WGPUBufferUsage_CopyDst,
|
||||
);
|
||||
defer staging.deinit();
|
||||
@ -86,7 +88,7 @@ pub fn read(self: Vec, gloc: *GpuAllocator, alloc: std.mem.Allocator) ![]f32 {
|
||||
);
|
||||
while (!mapped) gloc.device.poll();
|
||||
|
||||
const ptr: [*]const f32 = @ptrCast(@alignCast(
|
||||
const ptr: [*]const f16 = @ptrCast(@alignCast(
|
||||
staging.getConstMappedRange(0, bytes),
|
||||
));
|
||||
@memcpy(out[0..self.len], ptr[0..self.len]);
|
||||
@ -120,12 +122,13 @@ fn dispatch2in1out(
|
||||
while (offset < bytes) {
|
||||
// Calculate bounds for the current chunk
|
||||
const current_chunk_bytes = @min(max_chunk_bytes, bytes - offset);
|
||||
const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f32));
|
||||
const current_chunk_elements: u32 = @intCast(current_chunk_bytes / @sizeOf(f16));
|
||||
|
||||
// Create uniform buffer for this specific chunk's size
|
||||
const info_buf = try GpuBuffer.init(
|
||||
gloc,
|
||||
@sizeOf(u32),
|
||||
u32,
|
||||
1,
|
||||
c.WGPUBufferUsage_Uniform | c.WGPUBufferUsage_CopyDst,
|
||||
);
|
||||
defer info_buf.deinit();
|
||||
|
||||
@ -4,10 +4,10 @@ const GpuAllocator = @import("GpuAllocator.zig");
|
||||
const GpuPipeline = @import("GpuPipeline.zig");
|
||||
const Vec = @import("Vec.zig");
|
||||
|
||||
const c = @import("c.zig").c;
|
||||
const c = @import("utils.zig").c;
|
||||
|
||||
pub fn main(init: std.process.Init) !void {
|
||||
const device = try GpuDevice.init();
|
||||
const device = try GpuDevice.init(.{ .vram_bytes_limit = 4 * 1024 * 1024 * 1024 });
|
||||
defer device.deinit();
|
||||
|
||||
var gloc = try GpuAllocator.init(init.gpa, device);
|
||||
@ -20,8 +20,8 @@ pub fn main(init: std.process.Init) !void {
|
||||
|
||||
// --- WARM-UP PHASE ---
|
||||
{
|
||||
var warmup_a = [_]f32{1.0};
|
||||
var warmup_b = [_]f32{1.0};
|
||||
var warmup_a = [_]f16{1.0};
|
||||
var warmup_b = [_]f16{1.0};
|
||||
const wa = try Vec.initLoad(&gloc, &warmup_a);
|
||||
defer wa.deinit();
|
||||
const wb = try Vec.initLoad(&gloc, &warmup_b);
|
||||
@ -41,11 +41,11 @@ pub fn main(init: std.process.Init) !void {
|
||||
4 * 4 * 4 * 1024,
|
||||
4 * 4 * 4 * 4 * 1024,
|
||||
1024 * 1024,
|
||||
4 * 1024 * 1024,
|
||||
4 * 4 * 1024 * 1024,
|
||||
4 * 4 * 4 * 1024 * 1024,
|
||||
4 * 4 * 4 * 4 * 1024 * 1024,
|
||||
4 * 4 * 4 * 4 * 4 * 1024 * 1024,
|
||||
// 4 * 1024 * 1024,
|
||||
// 4 * 4 * 1024 * 1024,
|
||||
// 4 * 4 * 4 * 1024 * 1024,
|
||||
// 4 * 4 * 4 * 4 * 1024 * 1024,
|
||||
// 4 * 4 * 4 * 4 * 4 * 1024 * 1024,
|
||||
};
|
||||
|
||||
const iterations = 10;
|
||||
@ -56,9 +56,9 @@ pub fn main(init: std.process.Init) !void {
|
||||
|
||||
for (sizes) |size| {
|
||||
// --- Phase 1: Host Init/Alloc (Outside the iteration loop for pure host prep) ---
|
||||
var data_a = try allocator.alloc(f32, size);
|
||||
const data_a = try allocator.alloc(f16, size);
|
||||
defer allocator.free(data_a);
|
||||
var data_b = try allocator.alloc(f32, size);
|
||||
const data_b = try allocator.alloc(f16, size);
|
||||
defer allocator.free(data_b);
|
||||
|
||||
for (0..size) |i| {
|
||||
@ -72,7 +72,7 @@ pub fn main(init: std.process.Init) !void {
|
||||
var min_compute_ns: u64 = std.math.maxInt(u64);
|
||||
|
||||
// Track peak VRAM usage observed during the iterations
|
||||
var peak_vram_bytes: usize = 0;
|
||||
var peak_vram_bytes: u64 = 0;
|
||||
|
||||
for (0..iterations) |_| {
|
||||
// --- 1. GPU ALLOCATION PHASE ---
|
||||
@ -95,9 +95,8 @@ pub fn main(init: std.process.Init) !void {
|
||||
|
||||
// All 3 buffers (a, b, sum) are currently resident in VRAM here.
|
||||
// Querying now catches the true peak allocation step.
|
||||
if (gloc.allocated_vram_bytes > peak_vram_bytes) {
|
||||
if (gloc.allocated_vram_bytes > peak_vram_bytes)
|
||||
peak_vram_bytes = gloc.allocated_vram_bytes;
|
||||
}
|
||||
|
||||
_ = c.wgpuDevicePoll(device.device, 1, null);
|
||||
|
||||
@ -118,7 +117,7 @@ pub fn main(init: std.process.Init) !void {
|
||||
|
||||
// --- Metrics Calculations ---
|
||||
const f_size = @as(f64, @floatFromInt(size));
|
||||
const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f32)));
|
||||
const element_bytes = f_size * @as(f64, @floatFromInt(@sizeOf(f16)));
|
||||
const mb = element_bytes / (1024.0 * 1024.0);
|
||||
|
||||
// Individual Phase Timings (ms)
|
||||
|
||||
43
src/example.zig
Normal file
43
src/example.zig
Normal file
@ -0,0 +1,43 @@
|
||||
const std = @import("std");
|
||||
const GpuDevice = @import("GpuDevice.zig");
|
||||
const GpuAllocator = @import("GpuAllocator.zig");
|
||||
const GpuPipeline = @import("GpuPipeline.zig");
|
||||
const Vec = @import("Vec.zig");
|
||||
|
||||
const c = @import("utils.zig").c;
|
||||
|
||||
/// Basic usage example: uploads two 1024-element f16 vectors, runs the
/// `shaders/add.wgsl` pipeline on them, reads the result back to the host and
/// prints it to stderr.
pub fn main(init: std.process.Init) !void {
    const gpa = init.gpa;
    const len = 1024;

    // GPU setup. Resources are torn down in reverse order by the defers.
    const device = try GpuDevice.init(.{ .vram_bytes_limit = 4 * 1024 * 1024 * 1024 });
    defer device.deinit();

    var gloc = try GpuAllocator.init(gpa, device);
    defer gloc.deinit();

    const add_pip = try GpuPipeline.init(device, @embedFile("shaders/add.wgsl"));
    defer add_pip.deinit();

    // Host-side inputs.
    const data_a = try gpa.alloc(f16, len);
    defer gpa.free(data_a);
    const data_b = try gpa.alloc(f16, len);
    defer gpa.free(data_b);

    // data_a counts up from 0, data_b counts down from len - 1.
    for (data_a, data_b, 0..) |*up, *down, i| {
        up.* = @floatFromInt(i);
        down.* = @floatFromInt(len - 1 - i);
    }

    // Upload both operands to the GPU.
    const lhs = try Vec.initLoad(&gloc, data_a);
    defer lhs.deinit();
    const rhs = try Vec.initLoad(&gloc, data_b);
    defer rhs.deinit();

    // Run the pipeline, then copy the result vector back to host memory.
    const sum = try lhs.run(&gloc, rhs, add_pip);
    defer sum.deinit();

    const host_result = try sum.read(&gloc, gpa);
    defer gpa.free(host_result);

    std.debug.print("{any}\n", .{host_result});
}
|
||||
@ -1,6 +1,8 @@
|
||||
@group(0) @binding(0) var<storage, read> A: array<f32>;
|
||||
@group(0) @binding(1) var<storage, read> B: array<f32>;
|
||||
@group(0) @binding(2) var<storage, read_write> C: array<f32>;
|
||||
enable f16;
|
||||
|
||||
@group(0) @binding(0) var<storage, read> A: array<f16>;
|
||||
@group(0) @binding(1) var<storage, read> B: array<f16>;
|
||||
@group(0) @binding(2) var<storage, read_write> C: array<f16>;
|
||||
|
||||
struct TensorInfo {
|
||||
size: u32,
|
||||
|
||||
5
src/utils.zig
Normal file
5
src/utils.zig
Normal file
@ -0,0 +1,5 @@
|
||||
pub const c = @cImport(@cInclude("wgpu.h"));
|
||||
|
||||
/// Builds a `WGPUStringView` from a Zig byte slice.
///
/// Nothing is copied: the view stores `s.ptr` and `s.len`, so it is presumably
/// only valid while the memory behind `s` stays alive.
pub fn sv(s: []const u8) c.WGPUStringView {
    const view: c.WGPUStringView = .{
        .data = s.ptr,
        .length = s.len,
    };
    return view;
}
|
||||
Loading…
x
Reference in New Issue
Block a user