mirror of
https://github.com/ziglang/zig.git
synced 2025-12-10 08:13:07 +00:00
This change:
- Adds generic implementation of the float -> integer conversion
functions floatXiYf, including support for f80
- Updates the existing implementation of integer -> float conversion
fixXiYf to support f16 and f80
- Fixes the handling of the explicit integer bit in `__trunctfxf2`
- Combines the test cases for fixXfYi/floatXiYf into a single file
- Renames `fmodl` to `fmodq`, since it operates on 128-bit floats
The new implementation for floatXiYf has been benchmarked, and generally
provides equal or better performance versus the current implementations:
Throughput (MiB/s) - Before
| u32 | i32 | u64 | i64 | u128 | i128 |
-----|----------|----------|----------|----------|----------|----------|
f16 | none | none | none | none | none | none |
f32 | 2231.67 | 2001.19 | 1745.66 | 1405.77 | 2173.99 | 1874.63 |
f64 | 1407.17 | 1055.83 | 2911.68 | 2437.21 | 1676.05 | 1476.67 |
f80 | none | none | none | none | none | none |
f128 | 327.56 | 321.25 | 645.92 | 654.52 | 1153.56 | 1096.27 |
Throughput (MiB/s) - After
| u32 | i32 | u64 | i64 | u128 | i128 |
-----|----------|----------|----------|----------|----------|----------|
f16 | 1407.61 | 1637.25 | 3555.03 | 2594.56 | 3680.60 | 3063.34 |
f32 | 2101.36 | 2122.62 | 3225.46 | 3123.86 | 2860.05 | 1985.21 |
f64 | 1395.57 | 1314.87 | 2409.24 | 2196.30 | 2384.95 | 1908.15 |
f80 | 475.53 | 457.92 | 884.50 | 812.12 | 1475.27 | 1382.16 |
f128 | 359.60 | 350.91 | 723.08 | 706.80 | 1296.42 | 1198.87 |
127 lines
3.7 KiB
Zig
127 lines
3.7 KiB
Zig
const builtin = @import("builtin");
|
|
const std = @import("std");
|
|
|
|
// fmodq - floating modulo large, returns the remainder of division for f128 types
|
|
// Logic and flow heavily inspired by MUSL fmodl for 113 mantissa digits
|
|
pub fn fmodq(a: f128, b: f128) callconv(.C) f128 {
|
|
@setRuntimeSafety(builtin.is_test);
|
|
var amod = a;
|
|
var bmod = b;
|
|
const aPtr_u64 = @ptrCast([*]u64, &amod);
|
|
const bPtr_u64 = @ptrCast([*]u64, &bmod);
|
|
const aPtr_u16 = @ptrCast([*]u16, &amod);
|
|
const bPtr_u16 = @ptrCast([*]u16, &bmod);
|
|
|
|
const exp_and_sign_index = comptime switch (builtin.target.cpu.arch.endian()) {
|
|
.Little => 7,
|
|
.Big => 0,
|
|
};
|
|
const low_index = comptime switch (builtin.target.cpu.arch.endian()) {
|
|
.Little => 0,
|
|
.Big => 1,
|
|
};
|
|
const high_index = comptime switch (builtin.target.cpu.arch.endian()) {
|
|
.Little => 1,
|
|
.Big => 0,
|
|
};
|
|
|
|
const signA = aPtr_u16[exp_and_sign_index] & 0x8000;
|
|
var expA = @intCast(i32, (aPtr_u16[exp_and_sign_index] & 0x7fff));
|
|
var expB = bPtr_u16[exp_and_sign_index] & 0x7fff;
|
|
|
|
// There are 3 cases where the answer is undefined, check for:
|
|
// - fmodq(val, 0)
|
|
// - fmodq(val, NaN)
|
|
// - fmodq(inf, val)
|
|
// The sign on checked values does not matter.
|
|
// Doing (a * b) / (a * b) procudes undefined results
|
|
// because the three cases always produce undefined calculations:
|
|
// - 0 / 0
|
|
// - val * NaN
|
|
// - inf / inf
|
|
if (b == 0 or std.math.isNan(b) or expA == 0x7fff) {
|
|
return (a * b) / (a * b);
|
|
}
|
|
|
|
// Remove the sign from both
|
|
aPtr_u16[exp_and_sign_index] = @bitCast(u16, @intCast(i16, expA));
|
|
bPtr_u16[exp_and_sign_index] = @bitCast(u16, @intCast(i16, expB));
|
|
if (amod <= bmod) {
|
|
if (amod == bmod) {
|
|
return 0 * a;
|
|
}
|
|
return a;
|
|
}
|
|
|
|
if (expA == 0) {
|
|
amod *= 0x1p120;
|
|
expA = aPtr_u16[exp_and_sign_index] -% 120;
|
|
}
|
|
|
|
if (expB == 0) {
|
|
bmod *= 0x1p120;
|
|
expB = bPtr_u16[exp_and_sign_index] -% 120;
|
|
}
|
|
|
|
// OR in extra non-stored mantissa digit
|
|
var highA: u64 = (aPtr_u64[high_index] & (std.math.maxInt(u64) >> 16)) | 1 << 48;
|
|
var highB: u64 = (bPtr_u64[high_index] & (std.math.maxInt(u64) >> 16)) | 1 << 48;
|
|
var lowA: u64 = aPtr_u64[low_index];
|
|
var lowB: u64 = bPtr_u64[low_index];
|
|
|
|
while (expA > expB) : (expA -= 1) {
|
|
var high = highA -% highB;
|
|
var low = lowA -% lowB;
|
|
if (lowA < lowB) {
|
|
high = highA -% 1;
|
|
}
|
|
if (high >> 63 == 0) {
|
|
if ((high | low) == 0) {
|
|
return 0 * a;
|
|
}
|
|
highA = 2 *% high + (low >> 63);
|
|
lowA = 2 *% low;
|
|
} else {
|
|
highA = 2 *% highA + (lowA >> 63);
|
|
lowA = 2 *% lowA;
|
|
}
|
|
}
|
|
|
|
var high = highA -% highB;
|
|
var low = lowA -% lowB;
|
|
if (lowA < lowB) {
|
|
high -= 1;
|
|
}
|
|
if (high >> 63 == 0) {
|
|
if ((high | low) == 0) {
|
|
return 0 * a;
|
|
}
|
|
highA = high;
|
|
lowA = low;
|
|
}
|
|
|
|
while (highA >> 48 == 0) {
|
|
highA = 2 *% highA + (lowA >> 63);
|
|
lowA = 2 *% lowA;
|
|
expA = expA - 1;
|
|
}
|
|
|
|
// Overwrite the current amod with the values in highA and lowA
|
|
aPtr_u64[high_index] = highA;
|
|
aPtr_u64[low_index] = lowA;
|
|
|
|
// Combine the exponent with the sign, normalize if happend to be denormalized
|
|
if (expA <= 0) {
|
|
aPtr_u16[exp_and_sign_index] = @truncate(u16, @bitCast(u32, (expA +% 120))) | signA;
|
|
amod *= 0x1p-120;
|
|
} else {
|
|
aPtr_u16[exp_and_sign_index] = @truncate(u16, @bitCast(u32, expA)) | signA;
|
|
}
|
|
|
|
return amod;
|
|
}
|
|
|
|
test {
|
|
_ = @import("floatfmodq_test.zig");
|
|
}
|