derive float constants in a generic way (#10133)

2026-02-20 08:14:48 +00:00 · 2022-04-01 15:17:24 -06:00 · 2022-04-01 15:17:24 -06:00 · 5b8ac9821d
commit 5b8ac9821d
parent 62f54aa39c
3 changed files with 142 additions and 76 deletions
--- a/lib/std/math.zig
+++ b/lib/std/math.zig
@ -36,38 +36,44 @@ pub const sqrt2 = 1.414213562373095048801688724209698079;
 /// 1/sqrt(2)
 pub const sqrt1_2 = 0.707106781186547524400844362104849039;

-pub const f128_true_min = @bitCast(f128, @as(u128, 0x00000000000000000000000000000001));
-pub const f128_min = @bitCast(f128, @as(u128, 0x00010000000000000000000000000000));
-pub const f128_max = @bitCast(f128, @as(u128, 0x7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF));
-pub const f128_epsilon = @bitCast(f128, @as(u128, 0x3F8F0000000000000000000000000000));
-pub const f128_toint = 1.0 / f128_epsilon;
+pub const floatExponentBits = @import("math/float.zig").floatExponentBits;
+pub const floatMantissaBits = @import("math/float.zig").floatMantissaBits;
+pub const floatMantissaDigits = @import("math/float.zig").floatMantissaDigits;
+pub const floatExponentMin = @import("math/float.zig").floatExponentMin;
+pub const floatExponentMax = @import("math/float.zig").floatExponentMax;
+pub const floatTrueMin = @import("math/float.zig").floatTrueMin;
+pub const floatMin = @import("math/float.zig").floatMin;
+pub const floatMax = @import("math/float.zig").floatMax;
+pub const floatEps = @import("math/float.zig").floatEps;

-// float.h details
-pub const f80_true_min = make_f80(.{ .fraction = 1, .exp = 0 });
-pub const f80_min = make_f80(.{ .fraction = 0x8000000000000000, .exp = 1 });
-pub const f80_max = make_f80(.{ .fraction = 0xFFFFFFFFFFFFFFFF, .exp = 0x7FFE });
-pub const f80_epsilon = make_f80(.{ .fraction = 0x8000000000000000, .exp = 0x3FC0 });
-pub const f80_toint = 1.0 / f80_epsilon;
-
-pub const f64_true_min = 4.94065645841246544177e-324;
-pub const f64_min = 2.2250738585072014e-308;
-pub const f64_max = 1.79769313486231570815e+308;
-pub const f64_epsilon = 2.22044604925031308085e-16;
-pub const f64_toint = 1.0 / f64_epsilon;
-
-pub const f32_true_min = 1.40129846432481707092e-45;
-pub const f32_min = 1.17549435082228750797e-38;
-pub const f32_max = 3.40282346638528859812e+38;
-pub const f32_epsilon = 1.1920928955078125e-07;
-pub const f32_toint = 1.0 / f32_epsilon;
-
-pub const f16_true_min = 0.000000059604644775390625; // 2**-24
-pub const f16_min = 0.00006103515625; // 2**-14
-pub const f16_max = 65504;
-pub const f16_epsilon = 0.0009765625; // 2**-10
-pub const f16_toint = 1.0 / f16_epsilon;
-
-pub const epsilon = @import("math/epsilon.zig").epsilon;
+// TODO Replace each constant in this "soft deprecated" section with a @compileError naming its replacement after 0.10.0 is released.
+pub const f16_true_min: comptime_float = floatTrueMin(f16); // prev: 0.000000059604644775390625
+pub const f32_true_min: comptime_float = floatTrueMin(f32); // prev: 1.40129846432481707092e-45
+pub const f64_true_min: comptime_float = floatTrueMin(f64); // prev: 4.94065645841246544177e-324
+pub const f80_true_min = floatTrueMin(f80); // prev: make_f80(.{ .fraction = 1, .exp = 0 })
+pub const f128_true_min = floatTrueMin(f128); // prev: @bitCast(f128, @as(u128, 0x00000000000000000000000000000001))
+pub const f16_min: comptime_float = floatMin(f16); // prev: 0.00006103515625
+pub const f32_min: comptime_float = floatMin(f32); // prev: 1.17549435082228750797e-38
+pub const f64_min: comptime_float = floatMin(f64); // prev: 2.2250738585072014e-308
+pub const f80_min = floatMin(f80); // prev: make_f80(.{ .fraction = 0x8000000000000000, .exp = 1 })
+pub const f128_min = floatMin(f128); // prev: @bitCast(f128, @as(u128, 0x00010000000000000000000000000000))
+pub const f16_max: comptime_float = floatMax(f16); // prev: 65504
+pub const f32_max: comptime_float = floatMax(f32); // prev: 3.40282346638528859812e+38
+pub const f64_max: comptime_float = floatMax(f64); // prev: 1.79769313486231570815e+308
+pub const f80_max = floatMax(f80); // prev: make_f80(.{ .fraction = 0xFFFFFFFFFFFFFFFF, .exp = 0x7FFE })
+pub const f128_max = floatMax(f128); // prev: @bitCast(f128, @as(u128, 0x7FFEFFFFFFFFFFFFFFFFFFFFFFFFFFFF))
+pub const f16_epsilon: comptime_float = floatEps(f16); // prev: 0.0009765625
+pub const f32_epsilon: comptime_float = floatEps(f32); // prev: 1.1920928955078125e-07
+pub const f64_epsilon: comptime_float = floatEps(f64); // prev: 2.22044604925031308085e-16
+pub const f80_epsilon = floatEps(f80); // prev: make_f80(.{ .fraction = 0x8000000000000000, .exp = 0x3FC0 })
+pub const f128_epsilon = floatEps(f128); // prev: @bitCast(f128, @as(u128, 0x3F8F0000000000000000000000000000))
+pub const f16_toint: comptime_float = 1.0 / f16_epsilon; // same as before
+pub const f32_toint: comptime_float = 1.0 / f32_epsilon; // same as before
+pub const f64_toint: comptime_float = 1.0 / f64_epsilon; // same as before
+pub const f80_toint = 1.0 / f80_epsilon; // same as before
+pub const f128_toint = 1.0 / f128_epsilon; // same as before
+pub const epsilon = floatEps; // prev: @import("math/epsilon.zig").epsilon
+// End of "soft deprecated" section

 pub const nan_u16 = @as(u16, 0x7C01);
 pub const nan_f16 = @bitCast(f16, nan_u16);
@ -294,36 +300,6 @@ test {
    std.testing.refAllDecls(@This());
 }

-/// Returns the number of bits in the mantissa of floating point type
-/// T.
-pub fn floatMantissaBits(comptime T: type) comptime_int {
-    assert(@typeInfo(T) == .Float);
-
-    return switch (@typeInfo(T).Float.bits) {
-        16 => 10,
-        32 => 23,
-        64 => 52,
-        80 => 64,
-        128 => 112,
-        else => @compileError("unknown floating point type " ++ @typeName(T)),
-    };
-}
-
-/// Returns the number of bits in the exponent of floating point type
-/// T.
-pub fn floatExponentBits(comptime T: type) comptime_int {
-    assert(@typeInfo(T) == .Float);
-
-    return switch (@typeInfo(T).Float.bits) {
-        16 => 5,
-        32 => 8,
-        64 => 11,
-        80 => 15,
-        128 => 15,
-        else => @compileError("unknown floating point type " ++ @typeName(T)),
-    };
-}
-
 /// Given two types, returns the smallest one which is capable of holding the
 /// full range of the minimum value.
 pub fn Min(comptime A: type, comptime B: type) type {
--- a/lib/std/math/epsilon.zig
+++ b/lib/std/math/epsilon.zig
@ -1,15 +0,0 @@
-const math = @import("../math.zig");
-
-/// Returns the machine epsilon for type T.
-/// This is the smallest value of type T that satisfies the inequality 1.0 +
-/// epsilon != 1.0.
-pub fn epsilon(comptime T: type) T {
-    return switch (T) {
-        f16 => math.f16_epsilon,
-        f32 => math.f32_epsilon,
-        f64 => math.f64_epsilon,
-        f80 => math.f80_epsilon,
-        f128 => math.f128_epsilon,
-        else => @compileError("epsilon not implemented for " ++ @typeName(T)),
-    };
-}
--- a/lib/std/math/float.zig
+++ b/lib/std/math/float.zig
@ -0,0 +1,105 @@
+const std = @import("../std.zig");
+const assert = std.debug.assert;
+const expect = std.testing.expect;
+
+/// Returns the raw mantissa bits that encode a leading 1: `1 << 63` when T has 64 mantissa digits (the f80 layout, whose integer bit is stored explicitly), otherwise 0.
+fn mantissaOne(comptime T: type) comptime_int {
+    return if (floatMantissaDigits(T) == 64) 1 << 63 else 0;
+}
+
+/// Builds a T by bit-casting `((exponent + floatExponentMax(T)) << floatMantissaBits(T)) | mantissa`; `exponent` is unbiased and `mantissa` is the raw mantissa field.
+fn reconstructFloat(comptime T: type, exponent: comptime_int, mantissa: comptime_int) T {
+    const TBits = std.meta.Int(.unsigned, @bitSizeOf(T));
+    const biased_exponent = @as(TBits, exponent + floatExponentMax(T));
+    return @bitCast(T, (biased_exponent << floatMantissaBits(T)) | @as(TBits, mantissa));
+}
+
+/// Returns the width in bits of the exponent field of floating point type T (5, 8, 11, 15, 15 for 16-, 32-, 64-, 80-, 128-bit floats); any other width is a compile error.
+pub fn floatExponentBits(comptime T: type) comptime_int {
+    assert(@typeInfo(T) == .Float);
+
+    return switch (@typeInfo(T).Float.bits) {
+        16 => 5,
+        32 => 8,
+        64 => 11,
+        80 => 15,
+        128 => 15,
+        else => @compileError("unknown floating point type " ++ @typeName(T)),
+    };
+}
+
+/// Returns the number of stored mantissa bits of floating point type T (10, 23, 52, 64, 112 for 16-, 32-, 64-, 80-, 128-bit floats); any other width is a compile error.
+pub fn floatMantissaBits(comptime T: type) comptime_int {
+    assert(@typeInfo(T) == .Float);
+
+    return switch (@typeInfo(T).Float.bits) {
+        16 => 10,
+        32 => 23,
+        64 => 52,
+        80 => 64,
+        128 => 112,
+        else => @compileError("unknown floating point type " ++ @typeName(T)),
+    };
+}
+
+/// Returns the number of binary digits in the mantissa of floating point type T, counting the leading integer digit; unsupported widths are a compile error.
+pub fn floatMantissaDigits(comptime T: type) comptime_int {
+    assert(@typeInfo(T) == .Float);
+
+    // standard IEEE floats have an implicit 0.m or 1.m integer part, so digits = stored bits + 1
+    // f80 is special and has an explicitly stored bit in the MSB, so digits = stored bits
+    // this function corresponds to the `MANT_DIG` constants from C
+    return switch (@typeInfo(T).Float.bits) {
+        16 => 11,
+        32 => 24,
+        64 => 53,
+        80 => 64,
+        128 => 113,
+        else => @compileError("unknown floating point type " ++ @typeName(T)),
+    };
+}
+
+/// Returns the minimum (unbiased) exponent that can represent a normalised
+/// value in floating point type T, computed as `1 - floatExponentMax(T)`.
+pub fn floatExponentMin(comptime T: type) comptime_int {
+    return -floatExponentMax(T) + 1;
+}
+
+/// Returns the maximum (unbiased) exponent that can represent a normalised
+/// value in floating point type T: `2^(floatExponentBits(T) - 1) - 1`, the same value `reconstructFloat` adds as the bias.
+pub fn floatExponentMax(comptime T: type) comptime_int {
+    return (1 << (floatExponentBits(T) - 1)) - 1;
+}
+
+/// Returns the smallest subnormal number representable in floating point type T: exponent one below floatExponentMin(T) (a biased exponent field of 0) with a raw mantissa of 1.
+pub fn floatTrueMin(comptime T: type) T {
+    return reconstructFloat(T, floatExponentMin(T) - 1, 1);
+}
+
+/// Returns the smallest normal number representable in floating point type T: exponent floatExponentMin(T) with the mantissaOne(T) mantissa.
+pub fn floatMin(comptime T: type) T {
+    return reconstructFloat(T, floatExponentMin(T), mantissaOne(T));
+}
+
+/// Returns the largest normal number representable in floating point type T: exponent floatExponentMax(T) with every stored mantissa bit set.
+pub fn floatMax(comptime T: type) T {
+    const all1s_mantissa = (1 << floatMantissaBits(T)) - 1;
+    return reconstructFloat(T, floatExponentMax(T), all1s_mantissa);
+}
+
+/// Returns the machine epsilon of floating point type T, built from the unbiased exponent `-(floatMantissaDigits(T) - 1)` and the mantissaOne(T) mantissa.
+pub fn floatEps(comptime T: type) T {
+    return reconstructFloat(T, -(floatMantissaDigits(T) - 1), mantissaOne(T));
+}
+
+test "std.math.float" {
+    inline for ([_]type{ f16, f32, f64, f80, f128, c_longdouble }) |T| {
+        // the sign bit (the leading 1 +) plus the exponent and mantissa fields must account for every bit of T
+        const size = 1 + floatExponentBits(T) + floatMantissaBits(T);
+        try expect(@bitSizeOf(T) == size);
+
+        // the exponent floatEps uses, -(digits - 1), must lie within [floatExponentMin(T), floatExponentMax(T)]
+        try expect(floatExponentMin(T) <= -(floatMantissaDigits(T) - 1));
+        try expect(-(floatMantissaDigits(T) - 1) <= floatExponentMax(T));
+    }
+}