Removed vs native benchmarks

This commit is contained in:
adrien 2026-05-20 22:58:36 +02:00
parent 957f75243f
commit 63e9b6b63d
2 changed files with 5 additions and 324 deletions

View File

@ -11,12 +11,10 @@ pub fn Tensor(
comptime s_opt: Scales.ArgOpts, comptime s_opt: Scales.ArgOpts,
comptime shape_: []const comptime_int, comptime shape_: []const comptime_int,
) type { ) type {
comptime {
if (shape_.len == 0) if (shape_.len == 0)
@compileError("Tensor shape must have at least 1 dimension (rank >= 1)."); @compileError("Tensor shape must have at least 1 dimension (rank >= 1).");
for (shape_) |s| for (shape_) |s|
if (s < 1) @compileError("Tensor shape dimensions must be strictly >= 1."); if (s < 1) @compileError("Tensor shape dimensions must be strictly >= 1.");
}
@setEvalBranchQuota(100_000_000); @setEvalBranchQuota(100_000_000);
const _total: usize = comptime sh.shapeTotal(shape_); const _total: usize = comptime sh.shapeTotal(shape_);

View File

@ -10,23 +10,8 @@ pub fn main(init: std.process.Init) !void {
io = init.io; io = init.io;
try vectorSIMDvsNative(f64, &stdout_writer.interface);
try stdout_writer.flush();
try vectorSIMDvsNative(f32, &stdout_writer.interface);
try stdout_writer.flush();
try vectorSIMDvsNative(i32, &stdout_writer.interface);
try stdout_writer.flush();
try vectorSIMDvsNative(i64, &stdout_writer.interface);
try stdout_writer.flush();
try vectorSIMDvsNative(i128, &stdout_writer.interface);
try stdout_writer.flush();
try bench_Scalar(&stdout_writer.interface); try bench_Scalar(&stdout_writer.interface);
try stdout_writer.flush(); try stdout_writer.flush();
try bench_vsNative(&stdout_writer.interface);
try stdout_writer.flush();
try bench_crossTypeVsNative(&stdout_writer.interface);
try stdout_writer.flush();
try bench_Vector(&stdout_writer.interface); try bench_Vector(&stdout_writer.interface);
try stdout_writer.flush(); try stdout_writer.flush();
try bench_HighDimTensor(&stdout_writer.interface); try bench_HighDimTensor(&stdout_writer.interface);
@ -169,245 +154,6 @@ fn bench_Scalar(writer: *std.Io.Writer) !void {
try writer.print("└──────────────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘\n", .{}); try writer.print("└──────────────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘\n", .{});
} }
/// Prints a table comparing single-element `Tensor` arithmetic against plain
/// scalar arithmetic and against `@Vector(1, T)` arithmetic.
///
/// For each operation in `Ops` and each element type in `Types`, the three
/// variants are timed over `SAMPLES` runs of `ITERS` iterations each. The table
/// reports the mean time per iteration for every variant and the Tensor
/// slowdown relative to the native and `@Vector` variants.
///
/// Each computed value is passed to `std.mem.doNotOptimizeAway` so the timed
/// loops cannot be removed as dead code in optimized builds. The operands are
/// loop-invariant, so an optimizer may still hoist the arithmetic itself; the
/// sink only guarantees that a result is materialized on every iteration.
///
/// Errors from `writer.print` are propagated to the caller.
fn bench_vsNative(writer: *std.Io.Writer) !void {
    const ITERS: usize = 100_000;
    const SAMPLES: usize = 100;

    // Maps an index to a value in 1..=100 of the requested numeric type.
    const getValT = struct {
        fn f(comptime TT: type, i: usize) TT {
            const v = (i % 100) + 1;
            return if (comptime @typeInfo(TT) == .float) @floatFromInt(v) else @intCast(v);
        }
    }.f;

    const Types = .{ i32, i64, i128, f32, f64 };
    const TNames = .{ "i32", "i64", "i128", "f32", "f64" };
    // Expanded Ops to match bench_Scalar
    const Ops = .{ "add", "sub", "mul", "div", "abs", "eq", "gt" };

    try writer.print(
        \\
        \\ Scalar vs Native Overhead Analysis
        \\
        \\┌───────────┬──────┬───────────┬───────────┬───────────┬───────────────────────┐
        \\│ Operation │ Type │ Native │ @Vector │ Tensor{{1}} │ Slowdown Nat | Vec │
        \\├───────────┼──────┼───────────┼───────────┼───────────┼───────────────────────┤
        \\
    , .{});

    inline for (Ops, 0..) |op_name, j| {
        inline for (Types, 0..) |T, tidx| {
            var native_total_ns: f64 = 0;
            var vector_total_ns: f64 = 0;
            var tensor_total_ns: f64 = 0;

            // Tensor type with a single dimension of extent 1.
            const M = Tensor(T, .{}, .{}, &.{1});

            for (0..SAMPLES) |_| {
                // --- 1. Benchmark native scalar ---
                // Integer add/sub/mul use the saturating operators.
                const n_start = getTime();
                const a = getValT(T, 10);
                const b = getValT(T, 2);
                for (0..ITERS) |_| {
                    const native_result = if (comptime std.mem.eql(u8, op_name, "add"))
                        if (comptime @typeInfo(T) == .int) a +| b else a + b
                    else if (comptime std.mem.eql(u8, op_name, "sub"))
                        if (comptime @typeInfo(T) == .int) a -| b else a - b
                    else if (comptime std.mem.eql(u8, op_name, "mul"))
                        if (comptime @typeInfo(T) == .int) a *| b else a * b
                    else if (comptime std.mem.eql(u8, op_name, "div"))
                        if (comptime @typeInfo(T) == .int) @divTrunc(a, b) else a / b
                    else if (comptime std.mem.eql(u8, op_name, "abs"))
                        if (comptime @typeInfo(T) == .int) @abs(a) else @as(T, @abs(a))
                    else if (comptime std.mem.eql(u8, op_name, "eq"))
                        a == b
                    else if (comptime std.mem.eql(u8, op_name, "gt"))
                        a > b
                    else
                        unreachable;
                    // Keep the result observable so the loop is not eliminated.
                    std.mem.doNotOptimizeAway(native_result);
                }
                const n_end = getTime();
                native_total_ns += @as(f64, @floatFromInt(n_start.durationTo(n_end).toNanoseconds()));

                // --- 2. Benchmark @Vector(1, T) ---
                const v_start = getTime();
                const va = @Vector(1, T){getValT(T, 10)};
                const vb = @Vector(1, T){getValT(T, 2)};
                for (0..ITERS) |_| {
                    const vector_result = if (comptime std.mem.eql(u8, op_name, "add"))
                        if (comptime @typeInfo(T) == .int) va +| vb else va + vb
                    else if (comptime std.mem.eql(u8, op_name, "sub"))
                        if (comptime @typeInfo(T) == .int) va -| vb else va - vb
                    else if (comptime std.mem.eql(u8, op_name, "mul"))
                        if (comptime @typeInfo(T) == .int) va *| vb else va * vb
                    else if (comptime std.mem.eql(u8, op_name, "div"))
                        if (comptime @typeInfo(T) == .int) @divTrunc(va, vb) else va / vb
                    else if (comptime std.mem.eql(u8, op_name, "abs"))
                        // The integer case works on the single lane and casts back to T.
                        if (comptime @typeInfo(T) == .int) @as(T, @intCast(@abs(va[0]))) else @abs(va)
                    else if (comptime std.mem.eql(u8, op_name, "eq"))
                        va == vb
                    else if (comptime std.mem.eql(u8, op_name, "gt"))
                        va > vb
                    else
                        unreachable;
                    std.mem.doNotOptimizeAway(vector_result);
                }
                const v_end = getTime();
                vector_total_ns += @as(f64, @floatFromInt(v_start.durationTo(v_end).toNanoseconds()));

                // --- 3. Benchmark Tensor ---
                const q_start = getTime();
                const qa = M.splat(getValT(T, 10));
                const qb = M.splat(getValT(T, 2));
                for (0..ITERS) |_| {
                    const tensor_result = if (comptime std.mem.eql(u8, op_name, "add"))
                        qa.add(qb)
                    else if (comptime std.mem.eql(u8, op_name, "sub"))
                        qa.sub(qb)
                    else if (comptime std.mem.eql(u8, op_name, "mul"))
                        qa.mul(qb)
                    else if (comptime std.mem.eql(u8, op_name, "div"))
                        qa.div(qb)
                    else if (comptime std.mem.eql(u8, op_name, "abs"))
                        qa.abs()
                    else if (comptime std.mem.eql(u8, op_name, "eq"))
                        qa.eq(qb)
                    else if (comptime std.mem.eql(u8, op_name, "gt"))
                        qa.gt(qb)
                    else
                        unreachable;
                    std.mem.doNotOptimizeAway(tensor_result);
                }
                const q_end = getTime();
                tensor_total_ns += @as(f64, @floatFromInt(q_start.durationTo(q_end).toNanoseconds()));
            }

            // Mean nanoseconds per iteration for each variant.
            const avg_n = (native_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
            const avg_v = (vector_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
            const avg_t = (tensor_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
            const slowdown_nt = avg_t / avg_n;
            const slowdown_vt = avg_t / avg_v;

            try writer.print("│ {s:<9} │ {s:<4} │ {d:>7.2}ns │ {d:>7.2}ns │ {d:>7.2}ns │ {d:>8.2}x {d:>8.2}x │\n", .{
                op_name, TNames[tidx], avg_n, avg_v, avg_t, slowdown_nt, slowdown_vt,
            });
        }
        // Separator between operation groups, omitted after the last one.
        if (j != Ops.len - 1) try writer.print("├───────────┼──────┼───────────┼───────────┼───────────┼───────────────────────┤\n", .{});
    }
    try writer.print("└───────────┴──────┴───────────┴───────────┴───────────┴───────────────────────┘\n", .{});
}
/// Prints a table comparing mixed-element-type `Tensor` arithmetic against the
/// equivalent native arithmetic in which the right-hand operand is explicitly
/// cast to the left-hand type first.
///
/// For each operation in `Ops` and each ordered pair `(T1, T2)` from `Types`,
/// both variants are timed over `SAMPLES` runs of `ITERS` iterations. The
/// table reports the mean time per iteration and the Tensor slowdown.
///
/// Each computed value is passed to `std.mem.doNotOptimizeAway` so the timed
/// loops cannot be removed as dead code in optimized builds.
///
/// Errors from `writer.print` are propagated to the caller.
fn bench_crossTypeVsNative(writer: *std.Io.Writer) !void {
    const ITERS: usize = 100_000;
    const SAMPLES: usize = 5;

    const getValT = struct {
        fn f(comptime TT: type, i: usize) TT {
            // Keep values safe and non-zero to avoid division by zero or overflows during cross-casting
            const v = (i % 50) + 1;
            return if (comptime @typeInfo(TT) == .float) @floatFromInt(v) else @intCast(v);
        }
    }.f;

    // Helper for the Native baseline: explicitly casting T2 to T1 before the operation
    const castTo = struct {
        fn f(comptime DestT: type, comptime SrcT: type, val: SrcT) DestT {
            if (comptime DestT == SrcT) return val;
            const src_info = @typeInfo(SrcT);
            const dest_info = @typeInfo(DestT);
            if (dest_info == .int and src_info == .int) return @intCast(val);
            if (dest_info == .float and src_info == .int) return @floatFromInt(val);
            if (dest_info == .int and src_info == .float) return @intFromFloat(val);
            if (dest_info == .float and src_info == .float) return @floatCast(val);
            unreachable;
        }
    }.f;

    const Types = .{ i16, i64, i128, f32, f64 };
    const TNames = .{ "i16", "i64", "i128", "f32", "f64" };
    const Ops = .{ "add", "mul", "div" };

    try writer.print(
        \\
        \\ Cross-Type Overhead Analysis: Scalar vs Native
        \\
        \\┌─────────┬──────┬──────┬───────────┬───────────┬───────────┐
        \\│ Op │ T1 │ T2 │ Native │ Scalar │ Slowdown │
        \\├─────────┼──────┼──────┼───────────┼───────────┼───────────┤
        \\
    , .{});

    inline for (Ops, 0..) |op_name, j| {
        inline for (Types, 0..) |T1, t1_idx| {
            inline for (Types, 0..) |T2, t2_idx| {
                var native_total_ns: f64 = 0;
                var quantity_total_ns: f64 = 0;

                // M1/M2 share the `.L = 1` option; S2 uses `.T = 1` and is only
                // the right-hand operand of "div".
                // NOTE(review): presumably L/T are dimension exponents (length/time) - confirm against Tensor.
                const M1 = Tensor(T1, .{ .L = 1 }, .{}, &.{1});
                const M2 = Tensor(T2, .{ .L = 1 }, .{}, &.{1});
                const S2 = Tensor(T2, .{ .T = 1 }, .{}, &.{1});

                for (0..SAMPLES) |_| {
                    // --- 1. Benchmark Native (Cast T2 to T1, then math) ---
                    const n_start = getTime();
                    for (0..ITERS) |i| {
                        const a = getValT(T1, i);
                        const b_raw = getValT(T2, 2);
                        const b = castTo(T1, T2, b_raw);
                        const native_result = if (comptime std.mem.eql(u8, op_name, "add"))
                            a + b
                        else if (comptime std.mem.eql(u8, op_name, "mul"))
                            a * b
                        else if (comptime @typeInfo(T1) == .int)
                            @divTrunc(a, b)
                        else
                            a / b;
                        // Keep the result observable so the loop is not eliminated.
                        std.mem.doNotOptimizeAway(native_result);
                    }
                    const n_end = getTime();
                    native_total_ns += @as(f64, @floatFromInt(n_start.durationTo(n_end).toNanoseconds()));

                    // --- 2. Benchmark Tensor ---
                    const q_start = getTime();
                    for (0..ITERS) |i| {
                        const qa = M1.splat(getValT(T1, i));
                        const qb = if (comptime std.mem.eql(u8, op_name, "div"))
                            S2.splat(getValT(T2, 2))
                        else
                            M2.splat(getValT(T2, 2));
                        const tensor_result = if (comptime std.mem.eql(u8, op_name, "add"))
                            qa.add(qb)
                        else if (comptime std.mem.eql(u8, op_name, "mul"))
                            qa.mul(qb)
                        else
                            qa.div(qb);
                        std.mem.doNotOptimizeAway(tensor_result);
                    }
                    const q_end = getTime();
                    quantity_total_ns += @as(f64, @floatFromInt(q_start.durationTo(q_end).toNanoseconds()));
                }

                // Mean nanoseconds per iteration for each variant.
                const avg_n = (native_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
                const avg_q = (quantity_total_ns / SAMPLES) / @as(f64, @floatFromInt(ITERS));
                const slowdown = avg_q / avg_n;

                try writer.print("│ {s:<7} │ {s:<4} │ {s:<4} │ {d:>7.2}ns │ {d:>7.2}ns │ {d:>8.2}x │\n", .{
                    op_name, TNames[t1_idx], TNames[t2_idx], avg_n, avg_q, slowdown,
                });
            }
        }
        // Separator between operation groups, omitted after the last one.
        if (j != Ops.len - 1) {
            try writer.print("├─────────┼──────┼──────┼───────────┼───────────┼───────────┤\n", .{});
        }
    }
    try writer.print("└─────────┴──────┴──────┴───────────┴───────────┴───────────┘\n", .{});
}
fn bench_Vector(writer: *std.Io.Writer) !void { fn bench_Vector(writer: *std.Io.Writer) !void {
const ITERS: usize = 10_000; const ITERS: usize = 10_000;
const SAMPLES: usize = 10; const SAMPLES: usize = 10;
@ -446,7 +192,7 @@ fn bench_Vector(writer: *std.Io.Writer) !void {
const TNames = .{ "i32", "i64", "i128", "f32", "f64" }; const TNames = .{ "i32", "i64", "i128", "f32", "f64" };
const Lengths = .{ 1, 3, 4, 16, 100 }; const Lengths = .{ 1, 3, 4, 16, 100 };
// "cross" is only valid for len=3; other cells will show " --- " // "cross" is only valid for len=3; other cells will show " --- "
const Ops = .{ "add", "div", "mulScalar", "dot", "cross", "product", "pow", "length" }; const Ops = .{ "add", "div", "mulScalar", "dot", "product", "pow", "length" };
inline for (Ops, 0..) |op_name, o_idx| { inline for (Ops, 0..) |op_name, o_idx| {
inline for (Types, TNames) |T, tname| { inline for (Types, TNames) |T, tname| {
@ -482,10 +228,6 @@ fn bench_Vector(writer: *std.Io.Writer) !void {
} else if (comptime std.mem.eql(u8, op_name, "dot")) { } else if (comptime std.mem.eql(u8, op_name, "dot")) {
const v2 = V.splat(getVal(T, i +% 5, 63)); const v2 = V.splat(getVal(T, i +% 5, 63));
_ = v1.contract(v2, 0, 0); _ = v1.contract(v2, 0, 0);
} else if (comptime std.mem.eql(u8, op_name, "cross")) {
// len == 3 guaranteed by the guard above
const v2 = V.splat(getVal(T, i +% 5, 63));
_ = v1.cross(v2);
} else if (comptime std.mem.eql(u8, op_name, "product")) { } else if (comptime std.mem.eql(u8, op_name, "product")) {
_ = v1.product(); _ = v1.product();
} else if (comptime std.mem.eql(u8, op_name, "pow")) { } else if (comptime std.mem.eql(u8, op_name, "pow")) {
@ -608,62 +350,3 @@ fn bench_HighDimTensor(writer: *std.Io.Writer) !void {
} }
try writer.print("└─────────────────┴──────┴──────────────┴──────────────┴──────────────┴──────────────┘\n", .{}); try writer.print("└─────────────────┴──────┴──────────────┴──────────────┴──────────────┴──────────────┘\n", .{});
} }
/// Prints a table comparing repeated scalar increments against repeated
/// `@Vector` increments for element type `T`.
///
/// For each length in `lens`, the scalar loop performs
/// `iterations * vector_len` single increments and the vector loop performs
/// `iterations` whole-vector increments. Integer types use wrapping addition.
/// Both durations are reported in microseconds together with their ratio
/// (scalar / vector), which is printed as 0 when the vector time is 0.
///
/// The writer is flushed after every row; writer errors are propagated.
fn vectorSIMDvsNative(comptime T: type, writer: *std.Io.Writer) !void {
    const iterations: u64 = 10_000;
    const lens = [_]u32{ 1, 2, 3, 4, 5, 10, 100, 1_000, 10_000 };
    const is_int = comptime (@typeInfo(T) == .int);

    try writer.print("\nSIMD Speedup Analysis: {s}\n", .{@typeName(T)});
    try writer.print("┌────────────┬────────────┬────────────┬────────────┐\n", .{});
    try writer.print("│ Vector Len │ Scalar (us)│ Vector (us)│ Speedup │\n", .{});
    try writer.print("├────────────┼────────────┼────────────┼────────────┤\n", .{});

    inline for (lens) |vector_len| {
        const Vec = @Vector(vector_len, T);

        // Scalar pass: one increment per element per iteration.
        var scalar_acc: T = 10;
        const scalar_begin = getTime();
        var step: u64 = 0;
        while (step < iterations * vector_len) : (step += 1) {
            if (comptime is_int) {
                scalar_acc +%= 1;
            } else {
                scalar_acc += 1;
            }
        }
        const scalar_us = scalar_begin.durationTo(getTime()).toMicroseconds();

        // Vector pass: one whole-vector increment per iteration.
        var vector_acc: Vec = @splat(20);
        const vector_begin = getTime();
        step = 0;
        const ones: Vec = @splat(1);
        while (step < iterations) : (step += 1) {
            if (comptime is_int) {
                vector_acc +%= ones;
            } else {
                vector_acc += ones;
            }
        }
        const vector_us = vector_begin.durationTo(getTime()).toMicroseconds();

        // Speedup = scalar time / vector time; above 1.0 means the vector loop was faster.
        const scalar_f = @as(f64, @floatFromInt(scalar_us));
        const vector_f = @as(f64, @floatFromInt(vector_us));
        const speedup: f64 = if (vector_us > 0) scalar_f / vector_f else 0;

        try writer.print("│ {d:<10} │ {d:>10} │ {d:>10} │ {d:>9.2}x │\n", .{
            vector_len,
            scalar_us,
            vector_us,
            speedup,
        });
        try writer.flush();

        // Keep both accumulators live so the loops above are not discarded.
        std.mem.doNotOptimizeAway(scalar_acc);
        std.mem.doNotOptimizeAway(vector_acc);
    }

    try writer.print("└────────────┴────────────┴────────────┴────────────┘\n", .{});
}