Changed main to be a benchmark over different input sizes
This commit is contained in:
parent
0da02f60c4
commit
90a7cf946f
@ -152,7 +152,6 @@ fn onDevice(
|
|||||||
}
|
}
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
ctx.device = device;
|
ctx.device = device;
|
||||||
std.debug.print("{?}", .{device});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
|
fn buildPipeline(device: c.WGPUDevice, wgsl: []const u8) !c.WGPUComputePipeline {
|
||||||
|
|||||||
20
src/Mat.zig
20
src/Mat.zig
@ -6,14 +6,14 @@ const GpuBuffer = @import("GpuBuffer.zig");
|
|||||||
const Mat = @This();
|
const Mat = @This();
|
||||||
|
|
||||||
buf: GpuBuffer,
|
buf: GpuBuffer,
|
||||||
rows: u32,
|
rows: usize,
|
||||||
cols: u32,
|
cols: usize,
|
||||||
|
|
||||||
pub fn load(
|
pub fn load(
|
||||||
gloc: *GpuAllocator,
|
gloc: *GpuAllocator,
|
||||||
data: []const f32,
|
data: []const f32,
|
||||||
rows: u32,
|
rows: usize,
|
||||||
cols: u32,
|
cols: usize,
|
||||||
) !Mat {
|
) !Mat {
|
||||||
std.debug.assert(data.len == @as(usize, rows) * cols);
|
std.debug.assert(data.len == @as(usize, rows) * cols);
|
||||||
const bytes = data.len * @sizeOf(f32);
|
const bytes = data.len * @sizeOf(f32);
|
||||||
@ -29,7 +29,7 @@ pub fn load(
|
|||||||
return .{ .buf = buf, .rows = rows, .cols = cols };
|
return .{ .buf = buf, .rows = rows, .cols = cols };
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn zeros(gloc: *GpuAllocator, rows: u32, cols: u32) !Mat {
|
pub fn zeros(gloc: *GpuAllocator, rows: usize, cols: usize) !Mat {
|
||||||
const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
|
const bytes: u64 = @as(u64, rows) * cols * @sizeOf(f32);
|
||||||
const buf = try GpuBuffer.init(
|
const buf = try GpuBuffer.init(
|
||||||
gloc,
|
gloc,
|
||||||
@ -43,7 +43,7 @@ pub fn deinit(self: Mat) void {
|
|||||||
self.buf.deinit(); // Automatically cleans tracking map & releases GPU memory
|
self.buf.deinit(); // Automatically cleans tracking map & releases GPU memory
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(self: Mat) u32 {
|
pub fn len(self: Mat) usize {
|
||||||
return self.rows * self.cols;
|
return self.rows * self.cols;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -144,7 +144,7 @@ fn dispatch2in1out(
|
|||||||
buf_b: GpuBuffer,
|
buf_b: GpuBuffer,
|
||||||
buf_out: GpuBuffer,
|
buf_out: GpuBuffer,
|
||||||
bytes: u64,
|
bytes: u64,
|
||||||
n: u32,
|
n: usize,
|
||||||
) !void {
|
) !void {
|
||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
@ -162,7 +162,7 @@ fn submitPass(
|
|||||||
gloc: *GpuAllocator,
|
gloc: *GpuAllocator,
|
||||||
pipeline: c.WGPUComputePipeline,
|
pipeline: c.WGPUComputePipeline,
|
||||||
entries: []const c.WGPUBindGroupEntry,
|
entries: []const c.WGPUBindGroupEntry,
|
||||||
n: u32,
|
n: usize,
|
||||||
) !void {
|
) !void {
|
||||||
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
const bgl = c.wgpuComputePipelineGetBindGroupLayout(pipeline, 0);
|
||||||
defer c.wgpuBindGroupLayoutRelease(bgl);
|
defer c.wgpuBindGroupLayoutRelease(bgl);
|
||||||
@ -179,7 +179,7 @@ fn submitPass(
|
|||||||
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
const pass = c.wgpuCommandEncoderBeginComputePass(enc, null);
|
||||||
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
c.wgpuComputePassEncoderSetPipeline(pass, pipeline);
|
||||||
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
|
c.wgpuComputePassEncoderSetBindGroup(pass, 0, bg, 0, null);
|
||||||
c.wgpuComputePassEncoderDispatchWorkgroups(pass, ceilDiv(n, 64), 1, 1);
|
c.wgpuComputePassEncoderDispatchWorkgroups(pass, @intCast(ceilDiv(n, 256)), 1, 1);
|
||||||
c.wgpuComputePassEncoderEnd(pass);
|
c.wgpuComputePassEncoderEnd(pass);
|
||||||
c.wgpuComputePassEncoderRelease(pass);
|
c.wgpuComputePassEncoderRelease(pass);
|
||||||
|
|
||||||
@ -189,6 +189,6 @@ fn submitPass(
|
|||||||
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
c.wgpuQueueSubmit(gloc.queue, 1, &cmd);
|
||||||
}
|
}
|
||||||
|
|
||||||
fn ceilDiv(n: u32, d: u32) u32 {
|
fn ceilDiv(n: usize, d: usize) usize {
|
||||||
return (n + d - 1) / d;
|
return (n + d - 1) / d;
|
||||||
}
|
}
|
||||||
|
|||||||
86
src/main.zig
86
src/main.zig
@ -6,39 +6,61 @@ pub fn main(init: std.process.Init) !void {
|
|||||||
var gloc = try GpuAllocator.init(init.gpa);
|
var gloc = try GpuAllocator.init(init.gpa);
|
||||||
defer gloc.deinit();
|
defer gloc.deinit();
|
||||||
|
|
||||||
// Input data: a[i] = i, b[i] = 15 - i → add should give all 15s
|
// Define the sizes you want to benchmark
|
||||||
var data_a: [16]f32 = undefined;
|
const sizes = [_]usize{ 1, 1024, 4096, 16384, 65536, 262144, 1024 * 1024, 4 * 1024 * 1024 };
|
||||||
var data_b: [16]f32 = undefined;
|
|
||||||
for (0..16) |i| {
|
// Print table header
|
||||||
data_a[i] = @floatFromInt(i);
|
std.debug.print("\n| Element Count | Size (MB) | Time (ms) | Time (ns) |\n", .{});
|
||||||
data_b[i] = @floatFromInt(15 - i);
|
std.debug.print("|--------------:|----------:|----------:|----------:|\n", .{});
|
||||||
|
|
||||||
|
const allocator = init.gpa;
|
||||||
|
|
||||||
|
for (sizes) |size| {
|
||||||
|
// Dynamically allocate buffers for the current size
|
||||||
|
var data_a = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(data_a);
|
||||||
|
var data_b = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(data_b);
|
||||||
|
|
||||||
|
// Populate data
|
||||||
|
for (0..size) |i| {
|
||||||
|
data_a[i] = @floatFromInt(i);
|
||||||
|
data_b[i] = @floatFromInt(size - 1 - i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start timing the GPU operations
|
||||||
|
const start = std.Io.Clock.awake.now(init.io);
|
||||||
|
|
||||||
|
const a = try Mat.load(&gloc, data_a, size, 1);
|
||||||
|
defer a.deinit();
|
||||||
|
const b = try Mat.load(&gloc, data_b, size, 1);
|
||||||
|
defer b.deinit();
|
||||||
|
|
||||||
|
// a + b
|
||||||
|
const sum = try a.add(&gloc, b);
|
||||||
|
defer sum.deinit();
|
||||||
|
|
||||||
|
// sum * 2
|
||||||
|
const scaled = try sum.scale(&gloc, 2.0);
|
||||||
|
defer scaled.deinit();
|
||||||
|
|
||||||
|
// Read back (allocating dynamically for read-back buffers too)
|
||||||
|
const out_sum = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(out_sum);
|
||||||
|
const out_scaled = try allocator.alloc(f32, size);
|
||||||
|
defer allocator.free(out_scaled);
|
||||||
|
|
||||||
|
try sum.read(&gloc, out_sum);
|
||||||
|
try scaled.read(&gloc, out_scaled);
|
||||||
|
|
||||||
|
const duration = start.durationTo(std.Io.Clock.awake.now(init.io));
|
||||||
|
const ns = duration.toNanoseconds();
|
||||||
|
const ms = @as(f64, @floatFromInt(ns)) / 1_000_000.0;
|
||||||
|
const mb = @as(f64, @floatFromInt(size * @sizeOf(f32))) / (1024.0 * 1024.0);
|
||||||
|
|
||||||
|
// Print table row
|
||||||
|
std.debug.print("| {d:12} | {d:8.2} | {d:9.3} | {d:9} |\n", .{ size, mb, ms, ns });
|
||||||
}
|
}
|
||||||
|
|
||||||
const a = try Mat.load(&gloc, &data_a, 4, 4);
|
|
||||||
defer a.deinit();
|
|
||||||
const b = try Mat.load(&gloc, &data_b, 4, 4);
|
|
||||||
defer b.deinit();
|
|
||||||
|
|
||||||
// a + b
|
|
||||||
const sum = try a.add(&gloc, b);
|
|
||||||
defer sum.deinit();
|
|
||||||
|
|
||||||
// sum * 2
|
|
||||||
const scaled = try sum.scale(&gloc, 2.0);
|
|
||||||
defer scaled.deinit();
|
|
||||||
|
|
||||||
// Read back
|
|
||||||
var out_sum: [16]f32 = undefined;
|
|
||||||
var out_scaled: [16]f32 = undefined;
|
|
||||||
try sum.read(&gloc, &out_sum);
|
|
||||||
try scaled.read(&gloc, &out_scaled);
|
|
||||||
|
|
||||||
// Print
|
|
||||||
std.debug.print("\na + b (expect all 15):\n", .{});
|
|
||||||
printMat(&out_sum, 4, 4);
|
|
||||||
|
|
||||||
std.debug.print("\n(a + b) * 2 (expect all 30):\n", .{});
|
|
||||||
printMat(&out_scaled, 4, 4);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn printMat(data: []const f32, rows: u32, cols: u32) void {
|
fn printMat(data: []const f32, rows: u32, cols: u32) void {
|
||||||
|
|||||||
@ -65,7 +65,6 @@ fn onDevice(
|
|||||||
}
|
}
|
||||||
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
const ctx: *Ctx = @ptrCast(@alignCast(userdata1.?));
|
||||||
ctx.device = device;
|
ctx.device = device;
|
||||||
std.debug.print("{?}", .{device});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn onMapped(
|
fn onMapped(
|
||||||
|
|||||||
@ -3,7 +3,7 @@ pub const SHADER_ADD =
|
|||||||
\\@group(0) @binding(1) var<storage, read> b : array<f32>;
|
\\@group(0) @binding(1) var<storage, read> b : array<f32>;
|
||||||
\\@group(0) @binding(2) var<storage, read_write> out : array<f32>;
|
\\@group(0) @binding(2) var<storage, read_write> out : array<f32>;
|
||||||
\\
|
\\
|
||||||
\\@compute @workgroup_size(64)
|
\\@compute @workgroup_size(256)
|
||||||
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
||||||
\\ let i = gid.x;
|
\\ let i = gid.x;
|
||||||
\\ if (i < arrayLength(&out)) {
|
\\ if (i < arrayLength(&out)) {
|
||||||
@ -18,7 +18,7 @@ pub const SHADER_SCALE =
|
|||||||
\\@group(0) @binding(1) var<storage, read_write> out : array<f32>;
|
\\@group(0) @binding(1) var<storage, read_write> out : array<f32>;
|
||||||
\\@group(0) @binding(2) var<uniform> u : Uniforms;
|
\\@group(0) @binding(2) var<uniform> u : Uniforms;
|
||||||
\\
|
\\
|
||||||
\\@compute @workgroup_size(64)
|
\\@compute @workgroup_size(256)
|
||||||
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
\\fn main(@builtin(global_invocation_id) gid : vec3<u32>) {
|
||||||
\\ let i = gid.x;
|
\\ let i = gid.x;
|
||||||
\\ if (i < arrayLength(&out)) {
|
\\ if (i < arrayLength(&out)) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user