derive float constants in a generic way (#10133)

This commit is contained in:
viri 2022-04-01 15:17:24 -06:00 committed by Isaac Freund
parent 62f54aa39c
commit 5b8ac9821d
3 changed files with 142 additions and 76 deletions

View File

@ -36,38 +36,44 @@ pub const sqrt2 = 1.414213562373095048801688724209698079;
/// 1/sqrt(2)
pub const sqrt1_2 = 0.707106781186547524400844362104849039;
// f128 limits, spelled out as raw 128-bit patterns and reinterpreted as f128.
/// Smallest positive subnormal f128: only the lowest mantissa bit is set.
pub const f128_true_min = @bitCast(f128, @as(u128, 0x00000000000000000000000000000001));
/// Smallest positive normal f128: biased exponent field 0x0001, mantissa all zero.
pub const f128_min = @bitCast(f128, @as(u128, 0x00010000000000000000000000000000));
/// Largest normal f128: biased exponent field 0x7FFE, every mantissa bit set.
pub const f128_max = @bitCast(f128, @as(u128, 0x7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF));
/// f128 machine epsilon: biased exponent field 0x3F8F, mantissa all zero.
pub const f128_epsilon = @bitCast(f128, @as(u128, 0x3F8F0000000000000000000000000000));
/// Reciprocal of the f128 machine epsilon.
pub const f128_toint = 1.0 / f128_epsilon;
// Generic float-format helpers live in math/float.zig; import the module once
// and re-export each helper under its original name.
const float = @import("math/float.zig");
pub const floatExponentBits = float.floatExponentBits;
pub const floatMantissaBits = float.floatMantissaBits;
pub const floatMantissaDigits = float.floatMantissaDigits;
pub const floatExponentMin = float.floatExponentMin;
pub const floatExponentMax = float.floatExponentMax;
pub const floatTrueMin = float.floatTrueMin;
pub const floatMin = float.floatMin;
pub const floatMax = float.floatMax;
pub const floatEps = float.floatEps;
// float.h details
// Hard-coded per-type limits. Each group lists, in order: true_min, min, max,
// epsilon and toint (the reciprocal of epsilon).

// f80 values are assembled by make_f80 from an explicit fraction and exponent field.
// NOTE(review): make_f80 is defined outside this view; confirm its field semantics there.
pub const f80_true_min = make_f80(.{ .fraction = 1, .exp = 0 });
pub const f80_min = make_f80(.{ .fraction = 0x8000000000000000, .exp = 1 });
pub const f80_max = make_f80(.{ .fraction = 0xFFFFFFFFFFFFFFFF, .exp = 0x7FFE });
pub const f80_epsilon = make_f80(.{ .fraction = 0x8000000000000000, .exp = 0x3FC0 });
pub const f80_toint = 1.0 / f80_epsilon;
// f64 values, written as untyped decimal literals.
pub const f64_true_min = 4.94065645841246544177e-324;
pub const f64_min = 2.2250738585072014e-308;
pub const f64_max = 1.79769313486231570815e+308;
pub const f64_epsilon = 2.22044604925031308085e-16;
pub const f64_toint = 1.0 / f64_epsilon;
// f32 values, written as untyped decimal literals.
pub const f32_true_min = 1.40129846432481707092e-45;
pub const f32_min = 1.17549435082228750797e-38;
pub const f32_max = 3.40282346638528859812e+38;
pub const f32_epsilon = 1.1920928955078125e-07;
pub const f32_toint = 1.0 / f32_epsilon;
// f16 values, written as untyped decimal literals (powers of two noted inline).
pub const f16_true_min = 0.000000059604644775390625; // 2**-24
pub const f16_min = 0.00006103515625; // 2**-14
pub const f16_max = 65504;
pub const f16_epsilon = 0.0009765625; // 2**-10
pub const f16_toint = 1.0 / f16_epsilon;
// Type-generic lookup of the *_epsilon constants above, implemented in math/epsilon.zig.
pub const epsilon = @import("math/epsilon.zig").epsilon;
// "Soft deprecated" section: the per-type constant names are kept, but every
// value is now derived from the generic float* helpers instead of being
// hard-coded. Each trailing "prev:" comment records the literal it replaces.
// The f16/f32/f64 constants are annotated as comptime_float; their previous
// definitions were untyped float literals. The f80/f128 constants carry no
// annotation and take the type returned by the helper.
// TODO Replace with @compileError("deprecated for foobar") after 0.10.0 is released.

// Smallest positive subnormal value of each type.
pub const f16_true_min: comptime_float = floatTrueMin(f16); // prev: 0.000000059604644775390625
pub const f32_true_min: comptime_float = floatTrueMin(f32); // prev: 1.40129846432481707092e-45
pub const f64_true_min: comptime_float = floatTrueMin(f64); // prev: 4.94065645841246544177e-324
pub const f80_true_min = floatTrueMin(f80); // prev: make_f80(.{ .fraction = 1, .exp = 0 })
pub const f128_true_min = floatTrueMin(f128); // prev: @bitCast(f128, @as(u128, 0x00000000000000000000000000000001))
// Smallest positive normal value of each type.
pub const f16_min: comptime_float = floatMin(f16); // prev: 0.00006103515625
pub const f32_min: comptime_float = floatMin(f32); // prev: 1.17549435082228750797e-38
pub const f64_min: comptime_float = floatMin(f64); // prev: 2.2250738585072014e-308
pub const f80_min = floatMin(f80); // prev: make_f80(.{ .fraction = 0x8000000000000000, .exp = 1 })
pub const f128_min = floatMin(f128); // prev: @bitCast(f128, @as(u128, 0x00010000000000000000000000000000))
// Largest normal value of each type.
pub const f16_max: comptime_float = floatMax(f16); // prev: 65504
pub const f32_max: comptime_float = floatMax(f32); // prev: 3.40282346638528859812e+38
pub const f64_max: comptime_float = floatMax(f64); // prev: 1.79769313486231570815e+308
pub const f80_max = floatMax(f80); // prev: make_f80(.{ .fraction = 0xFFFFFFFFFFFFFFFF, .exp = 0x7FFE })
pub const f128_max = floatMax(f128); // prev: @bitCast(f128, @as(u128, 0x7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF))
// Machine epsilon of each type.
pub const f16_epsilon: comptime_float = floatEps(f16); // prev: 0.0009765625
pub const f32_epsilon: comptime_float = floatEps(f32); // prev: 1.1920928955078125e-07
pub const f64_epsilon: comptime_float = floatEps(f64); // prev: 2.22044604925031308085e-16
pub const f80_epsilon = floatEps(f80); // prev: make_f80(.{ .fraction = 0x8000000000000000, .exp = 0x3FC0 })
pub const f128_epsilon = floatEps(f128); // prev: @bitCast(f128, @as(u128, 0x3F8F0000000000000000000000000000))
// Reciprocal of each machine epsilon.
pub const f16_toint: comptime_float = 1.0 / f16_epsilon; // same as before
pub const f32_toint: comptime_float = 1.0 / f32_epsilon; // same as before
pub const f64_toint: comptime_float = 1.0 / f64_epsilon; // same as before
pub const f80_toint = 1.0 / f80_epsilon; // same as before
pub const f128_toint = 1.0 / f128_epsilon; // same as before
// The old generic `epsilon(T)` entry point is now an alias of floatEps.
pub const epsilon = floatEps;
// End of "soft deprecated" section
// Raw 16-bit pattern for an f16 NaN: all five exponent bits set (0x7C00) plus
// a non-zero mantissa (lowest bit).
pub const nan_u16 = @as(u16, 0x7C01);
// The same bit pattern reinterpreted as an f16.
pub const nan_f16 = @bitCast(f16, nan_u16);
@ -294,36 +300,6 @@ test {
std.testing.refAllDecls(@This());
}
/// Returns how many bits floating point type T stores for its mantissa.
/// T must be a float type; unsupported widths are rejected at compile time.
pub fn floatMantissaBits(comptime T: type) comptime_int {
    const info = @typeInfo(T);
    assert(info == .Float);

    // Lookup keyed on the total bit width of the float type.
    return switch (info.Float.bits) {
        16 => 10,
        32 => 23,
        64 => 52,
        80 => 64,
        128 => 112,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}
/// Returns how many bits floating point type T stores for its exponent.
/// T must be a float type; unsupported widths are rejected at compile time.
pub fn floatExponentBits(comptime T: type) comptime_int {
    const info = @typeInfo(T);
    assert(info == .Float);

    // Lookup keyed on the total bit width of the float type.
    return switch (info.Float.bits) {
        16 => 5,
        32 => 8,
        64 => 11,
        80 => 15,
        128 => 15,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}
/// Given two types, returns the smallest one which is capable of holding the
/// full range of the minimum value.
pub fn Min(comptime A: type, comptime B: type) type {

View File

@ -1,15 +0,0 @@
const math = @import("../math.zig");
/// Returns the machine epsilon for floating point type T by selecting the
/// matching `math.f*_epsilon` constant.
/// Supported types are f16, f32, f64, f80 and f128; any other type is a
/// compile error.
pub fn epsilon(comptime T: type) T {
    const value: T = switch (T) {
        f16 => math.f16_epsilon,
        f32 => math.f32_epsilon,
        f64 => math.f64_epsilon,
        f80 => math.f80_epsilon,
        f128 => math.f128_epsilon,
        else => @compileError("epsilon not implemented for " ++ @typeName(T)),
    };
    return value;
}

105
lib/std/math/float.zig Normal file
View File

@ -0,0 +1,105 @@
const std = @import("../std.zig");
const assert = std.debug.assert;
const expect = std.testing.expect;
/// Returns the raw mantissa bits that encode "1.0" for floating point type T.
/// Only the 64-digit format (f80) stores its integer bit explicitly, so it
/// needs bit 63 set; every other format yields 0.
fn mantissaOne(comptime T: type) comptime_int {
    const has_explicit_integer_bit = floatMantissaDigits(T) == 64;
    return if (has_explicit_integer_bit) 1 << 63 else 0;
}
/// Builds a value of floating point type T from an unbiased exponent and the
/// raw mantissa bits. The sign bit is left clear.
fn reconstructFloat(comptime T: type, exponent: comptime_int, mantissa: comptime_int) T {
    const Bits = std.meta.Int(.unsigned, @bitSizeOf(T));

    // Re-bias the exponent, then move it above the mantissa field.
    const exponent_field = @as(Bits, exponent + floatExponentMax(T)) << floatMantissaBits(T);
    const mantissa_field = @as(Bits, mantissa);

    return @bitCast(T, exponent_field | mantissa_field);
}
/// Returns the width, in bits, of the exponent field of floating point type T.
/// T must be a float type; unsupported widths are rejected at compile time.
pub fn floatExponentBits(comptime T: type) comptime_int {
    const info = @typeInfo(T);
    assert(info == .Float);

    // Lookup keyed on the total bit width of the float type.
    return switch (info.Float.bits) {
        16 => 5,
        32 => 8,
        64 => 11,
        80 => 15,
        128 => 15,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}
/// Returns the width, in bits, of the stored mantissa field of floating point
/// type T.
/// T must be a float type; unsupported widths are rejected at compile time.
pub fn floatMantissaBits(comptime T: type) comptime_int {
    const info = @typeInfo(T);
    assert(info == .Float);

    // Lookup keyed on the total bit width of the float type.
    return switch (info.Float.bits) {
        16 => 10,
        32 => 23,
        64 => 52,
        80 => 64,
        128 => 112,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}
/// Returns how many binary digits of precision the mantissa of floating point
/// type T carries. This corresponds to the `MANT_DIG' constants from C.
pub fn floatMantissaDigits(comptime T: type) comptime_int {
    const info = @typeInfo(T);
    assert(info == .Float);

    // Standard IEEE formats have an implicit integer part (0.m or 1.m) on top
    // of the stored mantissa bits, so they gain one digit. f80 is the special
    // case: its integer bit is stored explicitly in the mantissa's MSB.
    return switch (info.Float.bits) {
        16 => 11,
        32 => 24,
        64 => 53,
        80 => 64,
        128 => 113,
        else => @compileError("unknown floating point type " ++ @typeName(T)),
    };
}
/// Returns the smallest unbiased exponent of a normalised value in floating
/// point type T, i.e. one more than the negated maximum exponent.
pub fn floatExponentMin(comptime T: type) comptime_int {
    const exponent_max = floatExponentMax(T);
    return 1 - exponent_max;
}
/// Returns the largest unbiased exponent of a normalised value in floating
/// point type T: 2^(exponent bits - 1) - 1.
pub fn floatExponentMax(comptime T: type) comptime_int {
    const half_range_shift = floatExponentBits(T) - 1;
    return (1 << half_range_shift) - 1;
}
/// Returns the smallest subnormal number representable in floating point type T.
pub fn floatTrueMin(comptime T: type) T {
    // One step below the minimum normal exponent, with only the lowest
    // mantissa bit set.
    const subnormal_exponent = floatExponentMin(T) - 1;
    return reconstructFloat(T, subnormal_exponent, 1);
}
/// Returns the smallest normal number representable in floating point type T.
pub fn floatMin(comptime T: type) T {
    // Minimum normal exponent combined with a mantissa encoding exactly 1.0.
    const min_exponent = floatExponentMin(T);
    const unit_mantissa = mantissaOne(T);
    return reconstructFloat(T, min_exponent, unit_mantissa);
}
/// Returns the largest normal number representable in floating point type T.
pub fn floatMax(comptime T: type) T {
    // Maximum exponent combined with every stored mantissa bit set.
    const mantissa_mask = (1 << floatMantissaBits(T)) - 1;
    const max_exponent = floatExponentMax(T);
    return reconstructFloat(T, max_exponent, mantissa_mask);
}
/// Returns the machine epsilon of floating point type T.
pub fn floatEps(comptime T: type) T {
    // Exponent is -(digits - 1); the mantissa encodes exactly 1.0.
    const eps_exponent = 1 - floatMantissaDigits(T);
    const unit_mantissa = mantissaOne(T);
    return reconstructFloat(T, eps_exponent, unit_mantissa);
}
test "std.math.float" {
    inline for ([_]type{ f16, f32, f64, f80, f128, c_longdouble }) |T| {
        // The sign bit is separate from the other fields, so sign + exponent
        // + mantissa must account for every bit of T.
        const total_bits = 1 + floatExponentBits(T) + floatMantissaBits(T);
        try expect(@bitSizeOf(T) == total_bits);

        // The machine epsilon exponent must fall within [expmin, expmax].
        const eps_exponent = -(floatMantissaDigits(T) - 1);
        try expect(floatExponentMin(T) <= eps_exponent);
        try expect(eps_exponent <= floatExponentMax(T));
    }
}