Merge pull request #10276 from tiehuis/musl-trig-reimpl

sin/cos/tan musl reimplementation
2026-02-21 16:54:52 +00:00 · 2021-12-05 14:49:04 -08:00 · 2021-12-05 14:49:04 -08:00 · a7828c261a
commit a7828c261a
parent 9e03cf9489 b02384e03d
7 changed files with 1369 additions and 225 deletions
--- a/lib/std/math/__rem_pio2.zig
+++ b/lib/std/math/__rem_pio2.zig
@ -0,0 +1,198 @@
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+//
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__rem_pio2.c
+
+const std = @import("../std.zig");
+const __rem_pio2_large = @import("__rem_pio2_large.zig").__rem_pio2_large;
+const math = std.math;
+
+const toint = 1.5 / math.epsilon(f64); // `v + toint - toint` is used in medium() as rint(v)
+// pio4:     pi/4, bound on the reduced remainder checked in medium()
+const pio4 = 0x1.921fb54442d18p-1;
+// invpio2:  53 bits of 2/pi
+const invpio2 = 6.36619772367581382433e-01; // 0x3FE45F30, 0x6DC9C883
+// pio2_1:   first 33 bits of pi/2
+const pio2_1 = 1.57079632673412561417e+00; // 0x3FF921FB, 0x54400000
+// pio2_1t:  pi/2 - pio2_1 (tail used by the 1st reduction round)
+const pio2_1t = 6.07710050650619224932e-11; // 0x3DD0B461, 0x1A626331
+// pio2_2:   second 33 bits of pi/2
+const pio2_2 = 6.07710050630396597660e-11; // 0x3DD0B461, 0x1A600000
+// pio2_2t:  pi/2 - (pio2_1+pio2_2) (tail used by the 2nd reduction round)
+const pio2_2t = 2.02226624879595063154e-21; // 0x3BA3198A, 0x2E037073
+// pio2_3:   third 33 bits of pi/2
+const pio2_3 = 2.02226624871116645580e-21; // 0x3BA3198A, 0x2E000000
+// pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3) (tail used by the 3rd reduction round)
+const pio2_3t = 8.47842766036889956997e-32; // 0x397B839A, 0x252049C1
+
+fn U(x: anytype) usize { // shorthand: turn a signed loop counter into a usize array index
+    return @intCast(usize, x); // x must be non-negative and fit in usize
+}
+
+fn medium(ix: u32, x: f64, y: *[2]f64) i32 { // medium-size reduction: returns n and stores x - n*pi/2 as y[0] (head) + y[1] (tail); ix is the high word of |x|
+    var w: f64 = undefined; // part of fn*pi/2 still to be subtracted from r
+    var t: f64 = undefined; // copy of r from the previous round
+    var r: f64 = undefined; // x minus the leading chunk(s) of fn*pi/2
+    var @"fn": f64 = undefined; // n as a float (`fn` is a keyword, hence the quoting)
+    var n: i32 = undefined; // multiple of pi/2 removed from x; the return value
+    var ex: i32 = undefined; // exponent field taken from ix (i.e. of x)
+    var ey: i32 = undefined; // exponent field of the current y[0]
+    var ui: u64 = undefined; // raw bits of y[0]
+
+    // rint(x/(pi/2)): adding and then subtracting toint rounds the product to an integer
+    @"fn" = x * invpio2 + toint - toint;
+    n = @floatToInt(i32, @"fn");
+    r = x - @"fn" * pio2_1;
+    w = @"fn" * pio2_1t; // 1st round, good to 85 bits
+    // Step n by one if r - w fell outside [-pi/4, pi/4]. Matters with directed rounding.
+    if (r - w < -pio4) {
+        n -= 1;
+        @"fn" -= 1;
+        r = x - @"fn" * pio2_1;
+        w = @"fn" * pio2_1t;
+    } else if (r - w > pio4) {
+        n += 1;
+        @"fn" += 1;
+        r = x - @"fn" * pio2_1;
+        w = @"fn" * pio2_1t;
+    }
+    y[0] = r - w;
+    ui = @bitCast(u64, y[0]);
+    ey = @intCast(i32, (ui >> 52) & 0x7ff); // 11-bit exponent field of y[0]
+    ex = @intCast(i32, ix >> 20); // exponent field of x (ix has the sign bit cleared)
+    if (ex - ey > 16) { // exponent dropped by more than 16: 2nd round, good to 118 bits
+        t = r;
+        w = @"fn" * pio2_2;
+        r = t - w;
+        w = @"fn" * pio2_2t - ((t - r) - w); // next tail minus the rounding error of r = t - w
+        y[0] = r - w;
+        ui = @bitCast(u64, y[0]);
+        ey = @intCast(i32, (ui >> 52) & 0x7ff);
+        if (ex - ey > 49) { // 3rd round, good to 151 bits, covers all cases
+            t = r;
+            w = @"fn" * pio2_3;
+            r = t - w;
+            w = @"fn" * pio2_3t - ((t - r) - w);
+            y[0] = r - w;
+        }
+    }
+    y[1] = (r - y[0]) - w; // tail: what y[0] = r - w lost to rounding
+    return n;
+}
+
+// Returns n and stores the remainder of x rem pi/2 in y[0]+y[1] (head + tail).
+// |x| up to ~9pi/4 is reduced inline (n = +-1..+-4), |x| below ~2^20*(pi/2) goes
+// through medium(); use __rem_pio2_large() for large x. Inf/NaN yield y = x - x, n = 0.
+//
+// Caller must handle the case when reduction is not needed: |x| ~<= pi/4.
+pub fn __rem_pio2(x: f64, y: *[2]f64) i32 {
+    var z: f64 = undefined; // x with the leading part of n*pi/2 removed; later |x| rescaled
+    var tx: [3]f64 = undefined; // |x| split into up to three 24-bit pieces for __rem_pio2_large
+    var ty: [2]f64 = undefined; // head and tail returned by __rem_pio2_large
+    var n: i32 = undefined; // result of __rem_pio2_large
+    var ix: u32 = undefined; // high 32 bits of x with the sign bit cleared
+    var sign: bool = undefined; // true when the sign bit of x is set
+    var i: i32 = undefined; // index into tx
+    var ui: u64 = undefined; // raw bits of x
+
+    ui = @bitCast(u64, x);
+    sign = ui >> 63 != 0;
+    ix = @truncate(u32, (ui >> 32) & 0x7fffffff);
+    if (ix <= 0x400f6a7a) { // |x| ~<= 5pi/4
+        if ((ix & 0xfffff) == 0x921fb) { // |x| ~= pi/2 or 2pi/2
+            return medium(ix, x, y); // hand near-multiples of pi/2 to the multi-round path
+        }
+        if (ix <= 0x4002d97c) { // |x| ~<= 3pi/4
+            if (!sign) {
+                z = x - pio2_1; // one round good to 85 bits
+                y[0] = z - pio2_1t;
+                y[1] = (z - y[0]) - pio2_1t; // tail: rounding error of y[0]
+                return 1;
+            } else {
+                z = x + pio2_1;
+                y[0] = z + pio2_1t;
+                y[1] = (z - y[0]) + pio2_1t;
+                return -1;
+            }
+        } else {
+            if (!sign) {
+                z = x - 2 * pio2_1;
+                y[0] = z - 2 * pio2_1t;
+                y[1] = (z - y[0]) - 2 * pio2_1t;
+                return 2;
+            } else {
+                z = x + 2 * pio2_1;
+                y[0] = z + 2 * pio2_1t;
+                y[1] = (z - y[0]) + 2 * pio2_1t;
+                return -2;
+            }
+        }
+    }
+    if (ix <= 0x401c463b) { // |x| ~<= 9pi/4
+        if (ix <= 0x4015fdbc) { // |x| ~<= 7pi/4
+            if (ix == 0x4012d97c) { // |x| ~= 3pi/2
+                return medium(ix, x, y);
+            }
+            if (!sign) {
+                z = x - 3 * pio2_1;
+                y[0] = z - 3 * pio2_1t;
+                y[1] = (z - y[0]) - 3 * pio2_1t;
+                return 3;
+            } else {
+                z = x + 3 * pio2_1;
+                y[0] = z + 3 * pio2_1t;
+                y[1] = (z - y[0]) + 3 * pio2_1t;
+                return -3;
+            }
+        } else {
+            if (ix == 0x401921fb) { // |x| ~= 4pi/2
+                return medium(ix, x, y);
+            }
+            if (!sign) {
+                z = x - 4 * pio2_1;
+                y[0] = z - 4 * pio2_1t;
+                y[1] = (z - y[0]) - 4 * pio2_1t;
+                return 4;
+            } else {
+                z = x + 4 * pio2_1;
+                y[0] = z + 4 * pio2_1t;
+                y[1] = (z - y[0]) + 4 * pio2_1t;
+                return -4;
+            }
+        }
+    }
+    if (ix < 0x413921fb) { // |x| ~< 2^20*(pi/2), medium size
+        return medium(ix, x, y);
+    }
+    // all other (large) arguments
+    if (ix >= 0x7ff00000) { // x is inf or NaN
+        y[0] = x - x;
+        y[1] = y[0];
+        return 0;
+    }
+    // set z = scalbn(|x|,-ilogb(x)+23)
+    ui = @bitCast(u64, x);
+    ui &= std.math.maxInt(u64) >> 12; // clear the top 12 bits (sign and exponent field)
+    ui |= @as(u64, 0x3ff + 23) << 52; // install the biased exponent 0x3ff + 23
+    z = @bitCast(f64, ui);
+
+    // peel off the integer part of z twice, rescaling by 2^24 each time; the rest goes in tx[2]
+    i = 0;
+    while (i < 2) : (i += 1) {
+        tx[U(i)] = @intToFloat(f64, @floatToInt(i32, z));
+        z = (z - tx[U(i)]) * 0x1p24;
+    }
+    tx[U(i)] = z;
+    // skip zero terms, first term is non-zero
+    while (tx[U(i)] == 0.0) {
+        i -= 1;
+    }
+    n = __rem_pio2_large(tx[0..], ty[0..], @intCast(i32, (ix >> 20)) - (0x3ff + 23), i + 1, 1); // e0 = unbiased exponent - 23, nx = i + 1 pieces, prec = 1
+    if (sign) { // tx was built from |x|, so mirror the result for negative x
+        y[0] = -ty[0];
+        y[1] = -ty[1];
+        return -n;
+    }
+    y[0] = ty[0];
+    y[1] = ty[1];
+    return n;
+}
--- a/lib/std/math/__rem_pio2_large.zig
+++ b/lib/std/math/__rem_pio2_large.zig
@ -0,0 +1,510 @@
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+//
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__rem_pio2_large.c
+
+const std = @import("../std.zig");
+const math = std.math;
+
+const init_jk = [_]i32{ 3, 4, 4, 6 }; // initial value for jk, indexed by prec (0..3)
+
+//
+// Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
+//
+//              integer array, contains the (24*i)-th to (24*i+23)-th
+//              bit of 2/pi after binary point. The corresponding
+//              floating value is
+//
+//                      ipio2[i] * 2^(-24(i+1)).
+//
+// NB: This table must have at least (e0-3)/24 + jk terms.
+//     For quad precision (e0 <= 16360, jk = 6), this is 686.
+//
+const ipio2 = [_]i32{
+    0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
+    0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
+    0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
+    0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
+    0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
+    0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
+    0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
+    0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
+    0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
+    0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
+    0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
+
+    // The entries from here on are guarded by `#if LDBL_MAX_EXP > 1024` in the C original; this table always includes them.
+    0x47C419, 0xC367CD, 0xDCE809, 0x2A8359, 0xC4768B, 0x961CA6,
+    0xDDAF44, 0xD15719, 0x053EA5, 0xFF0705, 0x3F7E33, 0xE832C2,
+    0xDE4F98, 0x327DBB, 0xC33D26, 0xEF6B1E, 0x5EF89F, 0x3A1F35,
+    0xCAF27F, 0x1D87F1, 0x21907C, 0x7C246A, 0xFA6ED5, 0x772D30,
+    0x433B15, 0xC614B5, 0x9D19C3, 0xC2C4AD, 0x414D2C, 0x5D000C,
+    0x467D86, 0x2D71E3, 0x9AC69B, 0x006233, 0x7CD2B4, 0x97A7B4,
+    0xD55537, 0xF63ED7, 0x1810A3, 0xFC764D, 0x2A9D64, 0xABD770,
+    0xF87C63, 0x57B07A, 0xE71517, 0x5649C0, 0xD9D63B, 0x3884A7,
+    0xCB2324, 0x778AD6, 0x23545A, 0xB91F00, 0x1B0AF1, 0xDFCE19,
+    0xFF319F, 0x6A1E66, 0x615799, 0x47FBAC, 0xD87F7E, 0xB76522,
+    0x89E832, 0x60BFE6, 0xCDC4EF, 0x09366C, 0xD43F5D, 0xD7DE16,
+    0xDE3B58, 0x929BDE, 0x2822D2, 0xE88628, 0x4D58E2, 0x32CAC6,
+    0x16E308, 0xCB7DE0, 0x50C017, 0xA71DF3, 0x5BE018, 0x34132E,
+    0x621283, 0x014883, 0x5B8EF5, 0x7FB0AD, 0xF2E91E, 0x434A48,
+    0xD36710, 0xD8DDAA, 0x425FAE, 0xCE616A, 0xA4280A, 0xB499D3,
+    0xF2A606, 0x7F775C, 0x83C2A3, 0x883C61, 0x78738A, 0x5A8CAF,
+    0xBDD76F, 0x63A62D, 0xCBBFF4, 0xEF818D, 0x67C126, 0x45CA55,
+    0x36D9CA, 0xD2A828, 0x8D61C2, 0x77C912, 0x142604, 0x9B4612,
+    0xC459C4, 0x44C5C8, 0x91B24D, 0xF31700, 0xAD43D4, 0xE54929,
+    0x10D5FD, 0xFCBE00, 0xCC941E, 0xEECE70, 0xF53E13, 0x80F1EC,
+    0xC3E7B3, 0x28F8C7, 0x940593, 0x3E71C1, 0xB3092E, 0xF3450B,
+    0x9C1288, 0x7B20AB, 0x9FB52E, 0xC29247, 0x2F327B, 0x6D550C,
+    0x90A772, 0x1FE76B, 0x96CB31, 0x4A1679, 0xE27941, 0x89DFF4,
+    0x9794E8, 0x84E6E2, 0x973199, 0x6BED88, 0x365F5F, 0x0EFDBB,
+    0xB49A48, 0x6CA467, 0x427271, 0x325D8D, 0xB8159F, 0x09E5BC,
+    0x25318D, 0x3974F7, 0x1C0530, 0x010C0D, 0x68084B, 0x58EE2C,
+    0x90AA47, 0x02E774, 0x24D6BD, 0xA67DF7, 0x72486E, 0xEF169F,
+    0xA6948E, 0xF691B4, 0x5153D1, 0xF20ACF, 0x339820, 0x7E4BF5,
+    0x6863B2, 0x5F3EDD, 0x035D40, 0x7F8985, 0x295255, 0xC06437,
+    0x10D86D, 0x324832, 0x754C5B, 0xD4714E, 0x6E5445, 0xC1090B,
+    0x69F52A, 0xD56614, 0x9D0727, 0x50045D, 0xDB3BB4, 0xC576EA,
+    0x17F987, 0x7D6B49, 0xBA271D, 0x296996, 0xACCCC6, 0x5414AD,
+    0x6AE290, 0x89D988, 0x50722C, 0xBEA404, 0x940777, 0x7030F3,
+    0x27FC00, 0xA871EA, 0x49C266, 0x3DE064, 0x83DD97, 0x973FA3,
+    0xFD9443, 0x8C860D, 0xDE4131, 0x9D3992, 0x8C70DD, 0xE7B717,
+    0x3BDF08, 0x2B3715, 0xA0805C, 0x93805A, 0x921110, 0xD8E80F,
+    0xAF806C, 0x4BFFDB, 0x0F9038, 0x761859, 0x15A562, 0xBBCB61,
+    0xB989C7, 0xBD4010, 0x04F2D2, 0x277549, 0xF6B6EB, 0xBB22DB,
+    0xAA140A, 0x2F2689, 0x768364, 0x333B09, 0x1A940E, 0xAA3A51,
+    0xC2A31D, 0xAEEDAF, 0x12265C, 0x4DC26D, 0x9C7A2D, 0x9756C0,
+    0x833F03, 0xF6F009, 0x8C402B, 0x99316D, 0x07B439, 0x15200C,
+    0x5BC3D8, 0xC492F5, 0x4BADC6, 0xA5CA4E, 0xCD37A7, 0x36A9E6,
+    0x9492AB, 0x6842DD, 0xDE6319, 0xEF8C76, 0x528B68, 0x37DBFC,
+    0xABA1AE, 0x3115DF, 0xA1AE00, 0xDAFB0C, 0x664D64, 0xB705ED,
+    0x306529, 0xBF5657, 0x3AFF47, 0xB9F96A, 0xF3BE75, 0xDF9328,
+    0x3080AB, 0xF68C66, 0x15CB04, 0x0622FA, 0x1DE4D9, 0xA4B33D,
+    0x8F1B57, 0x09CD36, 0xE9424E, 0xA4BE13, 0xB52333, 0x1AAAF0,
+    0xA8654F, 0xA5C1D2, 0x0F3F0B, 0xCD785B, 0x76F923, 0x048B7B,
+    0x721789, 0x53A6C6, 0xE26E6F, 0x00EBEF, 0x584A9B, 0xB7DAC4,
+    0xBA66AA, 0xCFCF76, 0x1D02D1, 0x2DF1B1, 0xC1998C, 0x77ADC3,
+    0xDA4886, 0xA05DF7, 0xF480C6, 0x2FF0AC, 0x9AECDD, 0xBC5C3F,
+    0x6DDED0, 0x1FC790, 0xB6DB2A, 0x3A25A3, 0x9AAF00, 0x9353AD,
+    0x0457B6, 0xB42D29, 0x7E804B, 0xA707DA, 0x0EAA76, 0xA1597B,
+    0x2A1216, 0x2DB7DC, 0xFDE5FA, 0xFEDB89, 0xFDBE89, 0x6C76E4,
+    0xFCA906, 0x70803E, 0x156E85, 0xFF87FD, 0x073E28, 0x336761,
+    0x86182A, 0xEABD4D, 0xAFE7B3, 0x6E6D8F, 0x396795, 0x5BBF31,
+    0x48D784, 0x16DF30, 0x432DC7, 0x356125, 0xCE70C9, 0xB8CB30,
+    0xFD6CBF, 0xA200A4, 0xE46C05, 0xA0DD5A, 0x476F21, 0xD21262,
+    0x845CB9, 0x496170, 0xE0566B, 0x015299, 0x375550, 0xB7D51E,
+    0xC4F133, 0x5F6E13, 0xE4305D, 0xA92E85, 0xC3B21D, 0x3632A1,
+    0xA4B708, 0xD4B1EA, 0x21F716, 0xE4698F, 0x77FF27, 0x80030C,
+    0x2D408D, 0xA0CD4F, 0x99A520, 0xD3A2B3, 0x0A5D2F, 0x42F9B4,
+    0xCBDA11, 0xD0BE7D, 0xC1DB9B, 0xBD17AB, 0x81A2CA, 0x5C6A08,
+    0x17552E, 0x550027, 0xF0147F, 0x8607E1, 0x640B14, 0x8D4196,
+    0xDEBE87, 0x2AFDDA, 0xB6256B, 0x34897B, 0xFEF305, 0x9EBFB9,
+    0x4F6A68, 0xA82A4A, 0x5AC44F, 0xBCF82D, 0x985AD7, 0x95C7F4,
+    0x8D4D0D, 0xA63A20, 0x5F57A4, 0xB13F14, 0x953880, 0x0120CC,
+    0x86DD71, 0xB6DEC9, 0xF560BF, 0x11654D, 0x6B0701, 0xACB08C,
+    0xD0C0B2, 0x485551, 0x0EFB1E, 0xC37295, 0x3B06A3, 0x3540C0,
+    0x7BDC06, 0xCC45E0, 0xFA294E, 0xC8CAD6, 0x41F3E8, 0xDE647C,
+    0xD8649B, 0x31BED9, 0xC397A4, 0xD45877, 0xC5E369, 0x13DAF0,
+    0x3C3ABA, 0x461846, 0x5F7555, 0xF5BDD2, 0xC6926E, 0x5D2EAC,
+    0xED440E, 0x423E1C, 0x87C461, 0xE9FD29, 0xF3D6E7, 0xCA7C22,
+    0x35916F, 0xC5E008, 0x8DD7FF, 0xE26A6E, 0xC6FDB0, 0xC10893,
+    0x745D7C, 0xB2AD6B, 0x9D6ECD, 0x7B723E, 0x6A11C6, 0xA9CFF7,
+    0xDF7329, 0xBAC9B5, 0x5100B7, 0x0DB2E2, 0x24BA74, 0x607DE5,
+    0x8AD874, 0x2C150D, 0x0C1881, 0x94667E, 0x162901, 0x767A9F,
+    0xBEFDFD, 0xEF4556, 0x367ED9, 0x13D9EC, 0xB9BA8B, 0xFC97C4,
+    0x27A831, 0xC36EF1, 0x36C594, 0x56A8D8, 0xB5A8B4, 0x0ECCCF,
+    0x2D8912, 0x34576F, 0x89562C, 0xE3CE99, 0xB920D6, 0xAA5E6B,
+    0x9C2A3E, 0xCC5F11, 0x4A0BFD, 0xFBF4E1, 0x6D3B8E, 0x2C86E2,
+    0x84D4E9, 0xA9B4FC, 0xD1EEEF, 0xC9352E, 0x61392F, 0x442138,
+    0xC8D91B, 0x0AFC81, 0x6A4AFB, 0xD81C2F, 0x84B453, 0x8C994E,
+    0xCC2254, 0xDC552A, 0xD6C6C0, 0x96190B, 0xB8701A, 0x649569,
+    0x605A26, 0xEE523F, 0x0F117F, 0x11B5F4, 0xF5CBFC, 0x2DBC34,
+    0xEEBC34, 0xCC5DE8, 0x605EDD, 0x9B8E67, 0xEF3392, 0xB817C9,
+    0x9B5861, 0xBC57E1, 0xC68351, 0x103ED8, 0x4871DD, 0xDD1C2D,
+    0xA118AF, 0x462C21, 0xD7F359, 0x987AD9, 0xC0549E, 0xFA864F,
+    0xFC0656, 0xAE79E5, 0x362289, 0x22AD38, 0xDC9367, 0xAAE855,
+    0x382682, 0x9BE7CA, 0xA40D51, 0xB13399, 0x0ED7A9, 0x480569,
+    0xF0B265, 0xA7887F, 0x974C88, 0x36D1F9, 0xB39221, 0x4A827B,
+    0x21CF98, 0xDC9F40, 0x5547DC, 0x3A74E1, 0x42EB67, 0xDF9DFE,
+    0x5FD45E, 0xA4677B, 0x7AACBA, 0xA2F655, 0x23882B, 0x55BA41,
+    0x086E59, 0x862A21, 0x834739, 0xE6E389, 0xD49EE5, 0x40FB49,
+    0xE956FF, 0xCA0F1C, 0x8A59C5, 0x2BFA94, 0xC5C1D3, 0xCFC50F,
+    0xAE5ADB, 0x86C547, 0x624385, 0x3B8621, 0x94792C, 0x876110,
+    0x7B4C2A, 0x1A2C80, 0x12BF43, 0x902688, 0x893C78, 0xE4C4A8,
+    0x7BDBE5, 0xC23AC4, 0xEAF426, 0x8A67F7, 0xBF920D, 0x2BA365,
+    0xB1933D, 0x0B7CBD, 0xDC51A4, 0x63DD27, 0xDDE169, 0x19949A,
+    0x9529A8, 0x28CE68, 0xB4ED09, 0x209F44, 0xCA984E, 0x638270,
+    0x237C7E, 0x32B90F, 0x8EF5A7, 0xE75614, 0x08F121, 0x2A9DB5,
+    0x4D7E6F, 0x5119A5, 0xABF9B5, 0xD6DF82, 0x61DD96, 0x023616,
+    0x9F3AC4, 0xA1A283, 0x6DED72, 0x7A8D39, 0xA9B882, 0x5C326B,
+    0x5B2746, 0xED3400, 0x7700D2, 0x55F4FC, 0x4D5901,
+    0x8071E0,
+    // (end of the entries guarded by `#if LDBL_MAX_EXP > 1024` in the C original)
+};
+
+const PIo2 = [_]f64{ // pi/2 cut into 24-bit chunks, leading chunk first; the chunks sum to pi/2
+    1.57079625129699707031e+00, // 0x3FF921FB, 0x40000000
+    7.54978941586159635335e-08, // 0x3E74442D, 0x00000000
+    5.39030252995776476554e-15, // 0x3CF84698, 0x80000000
+    3.28200341580791294123e-22, // 0x3B78CC51, 0x60000000
+    1.27065575308067607349e-29, // 0x39F01B83, 0x80000000
+    1.22933308981111328932e-36, // 0x387A2520, 0x40000000
+    2.73370053816464559624e-44, // 0x36E38222, 0x80000000
+    2.16741683877804819444e-51, // 0x3569F31D, 0x00000000
+};
+
+fn U(x: anytype) usize { // shorthand: turn a signed loop counter into a usize array index
+    return @intCast(usize, x); // x must be non-negative and fit in usize
+}
+
+// Returns the last three binary digits of N (i.e. N mod 8) with y = x - N*pi/2 so that |y| < pi/2.
+//
+// The method is to compute the integer (mod 8) and fraction parts of
+// (2/pi)*x without doing the full multiplication. In general we
+// skip the parts of the product that are known to be a huge integer
+// (more accurately, = 0 mod 8). Thus the number of operations is
+// independent of the exponent of the input.
+//
+// (2/pi) is represented by an array of 24-bit integers in ipio2[].
+//
+// Input parameters:
+//      x[]     The input value (must be positive) is broken into nx
+//              pieces of 24-bit integers in double precision format.
+//              x[i] will be the i-th 24 bits of x. The scaled exponent
+//              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
+//              matches x up to 24 bits).
+//
+//              Example of breaking a double positive z into x[0]+x[1]+x[2]:
+//                      e0 = ilogb(z)-23
+//                      z  = scalbn(z,-e0)
+//              for i = 0,1,2
+//                      x[i] = floor(z)
+//                      z    = (z-x[i])*2**24
+//
+//
+//      y[]     output result in an array of double precision numbers.
+//              The dimension of y[] is:
+//                      24-bit  precision       1
+//                      53-bit  precision       2
+//                      64-bit  precision       2
+//                      113-bit precision       3
+//              The actual value is the sum of them. Thus for 113-bit
+//              precision, one may have to do something like:
+//
+//              long double t,w,r_head, r_tail;
+//              t = (long double)y[2] + (long double)y[1];
+//              w = (long double)y[0];
+//              r_head = t+w;
+//              r_tail = w - (r_head - t);
+//
+//      e0      The exponent of x[0]. Must be <= 16360 or you need to
+//              expand the ipio2 table.
+//
+//      nx      dimension of x[]
+//
+//      prec    an integer indicating the precision:
+//                      0       24  bits (single)
+//                      1       53  bits (double)
+//                      2       64  bits (extended)
+//                      3       113 bits (quad)
+//
+// Here is the description of some local variables:
+//
+//      jk      jk+1 is the initial number of terms of ipio2[] needed
+//              in the computation. The minimum and recommended value
+//              for jk is 3,4,4,6 for single, double, extended, and quad.
+//              jk+1 must be 2 larger than you might expect so that our
+//              recomputation test works. (Up to 24 bits in the integer
+//              part (the 24 bits of it that we compute) and 23 bits in
+//              the fraction part may be lost to cancelation before we
+//              recompute.)
+//
+//      jz      local integer variable indicating the number of
+//              terms of ipio2[] used.
+//
+//      jx      nx - 1
+//
+//      jv      index for pointing to the suitable ipio2[] for the
+//              computation. In general, we want
+//                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
+//              is an integer. Thus
+//                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
+//              Hence jv = max(0,(e0-3)/24).
+//
+//      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
+//
+//      q[]     double array with integral value, representing the
+//              24-bits chunk of the product of x and 2/pi.
+//
+//      q0      the corresponding exponent of q[0]. Note that the
+//              exponent for q[i] would be q0-24*i.
+//
+//      PIo2[]  double precision array, obtained by cutting pi/2
+//              into 24 bits chunks.
+//
+//      f[]     ipio2[] in floating point
+//
+//      iq[]    integer array by breaking up q[] in 24-bits chunk.
+//
+//      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
+//
+//      ih      integer. If >0 it indicates q[] is >= 0.5, hence
+//              it also indicates the *sign* of the result.
+//
+//
+//
+// Constants:
+// The hexadecimal values are the intended ones for the constants in
+// PIo2[] above. The decimal values may be used, provided that the
+// compiler will convert from decimal to binary accurately enough
+// to produce the hexadecimal values shown.
+//
+pub fn __rem_pio2_large(x: []f64, y: []f64, e0: i32, nx: i32, prec: usize) i32 { // returns N mod 8 and writes the remainder chunks to y[]; parameters are described in the comment above
+    var jz: i32 = undefined; // number of terms of ipio2[] used so far
+    var jx: i32 = undefined; // nx - 1, index of the last piece of x[]
+    var jv: i32 = undefined; // index of the first ipio2[] term that matters for this e0
+    var jp: i32 = undefined; // jp+1 terms of PIo2[] are used; equals jk
+    var jk: i32 = undefined; // jk+1 is the initial number of ipio2[] terms; taken from init_jk[prec]
+    var carry: i32 = undefined; // set once a non-zero iq[] chunk has been seen while forming 1-q
+    var n: i32 = undefined; // integer part of x*(2/pi); only n & 7 is returned
+    var iq: [20]i32 = undefined; // q[] broken into 24-bit integer chunks, filled from q[jz] downwards
+    var i: i32 = undefined; // loop index / scratch
+    var j: i32 = undefined; // loop index / scratch
+    var k: i32 = undefined; // loop index; number of extra terms when recomputing
+    var m: i32 = undefined; // jx + jk, last index of f[] filled initially
+    var q0: i32 = undefined; // exponent of q[0]
+    var ih: i32 = undefined; // > 0 when the fraction is >= 0.5; the result is then negated
+
+    var z: f64 = undefined; // running value while distilling q[]; afterwards the fraction part
+    var fw: f64 = undefined; // floating-point scratch / accumulator
+    var f: [20]f64 = undefined; // ipio2[] terms converted to floating point
+    var fq: [20]f64 = undefined; // final product terms of x*(2/pi) times PIo2[]
+    var q: [20]f64 = undefined; // chunks of the product of x and 2/pi
+
+    // initialize jk
+    jk = init_jk[prec];
+    jp = jk;
+
+    // determine jx,jv,q0, note that 3>q0
+    jx = nx - 1;
+    jv = @divFloor(e0 - 3, 24);
+    if (jv < 0) jv = 0; // jv = max(0, (e0-3)/24)
+    q0 = e0 - 24 * (jv + 1);
+
+    // set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk]
+    j = jv - jx;
+    m = jx + jk;
+    i = 0;
+    while (i <= m) : ({
+        i += 1;
+        j += 1;
+    }) {
+        f[U(i)] = if (j < 0) 0.0 else @intToFloat(f64, ipio2[U(j)]); // indices before the start of the table count as zero
+    }
+
+    // compute q[0],q[1],...q[jk]
+    i = 0;
+    while (i <= jk) : (i += 1) {
+        j = 0;
+        fw = 0;
+        while (j <= jx) : (j += 1) {
+            fw += x[U(j)] * f[U(jx + i - j)];
+        }
+        q[U(i)] = fw;
+    }
+
+    jz = jk;
+
+    // This is to handle a non-trivial goto translation from C.
+    // An unconditional return statement is found at the end of this loop.
+    recompute: while (true) {
+        // distill q[] into iq[] reversingly
+        i = 0;
+        j = jz;
+        z = q[U(jz)];
+        while (j > 0) : ({
+            i += 1;
+            j -= 1;
+        }) {
+            fw = @intToFloat(f64, @floatToInt(i32, 0x1p-24 * z)); // part of z above 2^24, carried into the next chunk
+            iq[U(i)] = @floatToInt(i32, z - 0x1p24 * fw); // low 24 bits of z
+            z = q[U(j - 1)] + fw;
+        }
+
+        // compute n
+        z = math.scalbn(z, q0); // actual value of z
+        z -= 8.0 * math.floor(z * 0.125); // trim off integer >= 8
+        n = @floatToInt(i32, z);
+        z -= @intToFloat(f64, n); // z is now the fraction part
+        ih = 0;
+        if (q0 > 0) { // need iq[jz-1] to determine n
+            i = iq[U(jz - 1)] >> @intCast(u5, 24 - q0);
+            n += i;
+            iq[U(jz - 1)] -= i << @intCast(u5, 24 - q0);
+            ih = iq[U(jz - 1)] >> @intCast(u5, 23 - q0);
+        } else if (q0 == 0) {
+            ih = iq[U(jz - 1)] >> 23;
+        } else if (z >= 0.5) {
+            ih = 2;
+        }
+
+        if (ih > 0) { // q > 0.5
+            n += 1;
+            carry = 0;
+            i = 0;
+            while (i < jz) : (i += 1) { // compute 1-q
+                j = iq[U(i)];
+                if (carry == 0) {
+                    if (j != 0) {
+                        carry = 1;
+                        iq[U(i)] = 0x1000000 - j;
+                    }
+                } else {
+                    iq[U(i)] = 0xffffff - j;
+                }
+            }
+            if (q0 > 0) { // rare case: chance is 1 in 12
+                switch (q0) {
+                    1 => iq[U(jz - 1)] &= 0x7fffff,
+                    2 => iq[U(jz - 1)] &= 0x3fffff,
+                    else => unreachable, // q0 < 3 (see the "3>q0" note above), so only 1 and 2 occur here
+                }
+            }
+            if (ih == 2) {
+                z = 1.0 - z;
+                if (carry != 0) {
+                    z -= math.scalbn(@as(f64, 1.0), q0);
+                }
+            }
+        }
+
+        // check if recomputation is needed
+        if (z == 0.0) {
+            j = 0;
+            i = jz - 1;
+            while (i >= jk) : (i -= 1) { // OR together the chunks iq[jk..jz-1]
+                j |= iq[U(i)];
+            }
+
+            if (j == 0) { // need recomputation
+                k = 1;
+                while (iq[U(jk - k)] == 0) : (k += 1) {
+                    // k = no. of terms needed
+                }
+
+                i = jz + 1;
+                while (i <= jz + k) : (i += 1) { // add q[jz+1] to q[jz+k]
+                    f[U(jx + i)] = @intToFloat(f64, ipio2[U(jv + i)]);
+                    j = 0;
+                    fw = 0;
+                    while (j <= jx) : (j += 1) {
+                        fw += x[U(j)] * f[U(jx + i - j)];
+                    }
+                    q[U(i)] = fw;
+                }
+                jz += k;
+                continue :recompute; // mimic goto recompute
+            }
+        }
+
+        // chop off zero terms
+        if (z == 0.0) {
+            jz -= 1;
+            q0 -= 24;
+            while (iq[U(jz)] == 0) {
+                jz -= 1;
+                q0 -= 24;
+            }
+        } else { // break z into 24-bit if necessary
+            z = math.scalbn(z, -q0);
+            if (z >= 0x1p24) {
+                fw = @intToFloat(f64, @floatToInt(i32, 0x1p-24 * z));
+                iq[U(jz)] = @floatToInt(i32, z - 0x1p24 * fw);
+                jz += 1;
+                q0 += 24;
+                iq[U(jz)] = @floatToInt(i32, fw);
+            } else {
+                iq[U(jz)] = @floatToInt(i32, z);
+            }
+        }
+
+        // convert integer "bit" chunk to floating-point value
+        fw = math.scalbn(@as(f64, 1.0), q0);
+        i = jz;
+        while (i >= 0) : (i -= 1) {
+            q[U(i)] = fw * @intToFloat(f64, iq[U(i)]);
+            fw *= 0x1p-24; // each step down in i scales the weight by 2^-24
+        }
+
+        // compute PIo2[0,...,jp]*q[jz,...,0]
+        i = jz;
+        while (i >= 0) : (i -= 1) {
+            fw = 0;
+            k = 0;
+            while (k <= jp and k <= jz - i) : (k += 1) {
+                fw += PIo2[U(k)] * q[U(i + k)];
+            }
+            fq[U(jz - i)] = fw;
+        }
+
+        // compress fq[] into y[]
+        switch (prec) {
+            0 => { // single: one output value
+                fw = 0.0;
+                i = jz;
+                while (i >= 0) : (i -= 1) {
+                    fw += fq[U(i)];
+                }
+                y[0] = if (ih == 0) fw else -fw;
+            },
+
+            1, 2 => { // double / extended: head in y[0], tail in y[1]
+                fw = 0.0;
+                i = jz;
+                while (i >= 0) : (i -= 1) {
+                    fw += fq[U(i)];
+                }
+                // TODO: drop excess precision here once double_t is used
+                fw = fw; // no-op in this port; presumably a cast to double in the C original - confirm
+                y[0] = if (ih == 0) fw else -fw;
+                fw = fq[0] - fw;
+                i = 1;
+                while (i <= jz) : (i += 1) {
+                    fw += fq[U(i)];
+                }
+                y[1] = if (ih == 0) fw else -fw;
+            },
+            3 => { // painful
+                i = jz;
+                while (i > 0) : (i -= 1) { // first pass: push each pairwise sum down, keep the rounding error in fq[i]
+                    fw = fq[U(i - 1)] + fq[U(i)];
+                    fq[U(i)] += fq[U(i - 1)] - fw;
+                    fq[U(i - 1)] = fw;
+                }
+                i = jz;
+                while (i > 1) : (i -= 1) { // second pass, stopping at fq[1]
+                    fw = fq[U(i - 1)] + fq[U(i)];
+                    fq[U(i)] += fq[U(i - 1)] - fw;
+                    fq[U(i - 1)] = fw;
+                }
+                fw = 0;
+                i = jz;
+                while (i >= 2) : (i -= 1) { // everything from fq[2] up is summed into y[2]
+                    fw += fq[U(i)];
+                }
+                if (ih == 0) {
+                    y[0] = fq[0];
+                    y[1] = fq[1];
+                    y[2] = fw;
+                } else {
+                    y[0] = -fq[0];
+                    y[1] = -fq[1];
+                    y[2] = -fw;
+                }
+            },
+            else => unreachable, // prec must be 0..3
+        }
+
+        return n & 7; // last three binary digits of n
+    }
+}
--- a/lib/std/math/__rem_pio2f.zig
+++ b/lib/std/math/__rem_pio2f.zig
@ -0,0 +1,70 @@
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+//
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__rem_pio2f.c
+
+const std = @import("../std.zig");
+const __rem_pio2_large = @import("__rem_pio2_large.zig").__rem_pio2_large;
+const math = std.math;
+
+const toint = 1.5 / math.epsilon(f64); // `v + toint - toint` is used in __rem_pio2f as a specialized rint(v)
+// pio4:     pi/4, bound on the reduced remainder checked in __rem_pio2f
+const pio4 = 0x1.921fb6p-1;
+// invpio2:  53 bits of 2/pi
+const invpio2 = 6.36619772367581382433e-01; // 0x3FE45F30, 0x6DC9C883
+// pio2_1:   first 25 bits of pi/2
+const pio2_1 = 1.57079631090164184570e+00; // 0x3FF921FB, 0x50000000
+// pio2_1t:  pi/2 - pio2_1
+const pio2_1t = 1.58932547735281966916e-08; // 0x3E5110b4, 0x611A6263
+
+// Returns n and stores the remainder of x rem pi/2 in y.* (a single f64, no tail).
+// Uses double precision for everything except passing x; inf/NaN yield y.* = x - x, n = 0.
+// Uses __rem_pio2_large() for large x (|x| at or above ~2^28*(pi/2)).
+pub fn __rem_pio2f(x: f32, y: *f64) i32 {
+    var tx: [1]f64 = undefined; // |x| rescaled to a single 24-bit piece for __rem_pio2_large
+    var ty: [1]f64 = undefined; // remainder returned by __rem_pio2_large
+    var @"fn": f64 = undefined; // n as a float (`fn` is a keyword, hence the quoting)
+    var ix: u32 = undefined; // bits of x with the sign bit cleared
+    var n: i32 = undefined; // multiple of pi/2 removed from x
+    var sign: bool = undefined; // true when the sign bit of x is set
+    var e0: u32 = undefined; // exponent handed to __rem_pio2_large
+    var ui: u32 = undefined; // raw bits of x; later the bits of the rescaled |x|
+
+    ui = @bitCast(u32, x);
+    ix = ui & 0x7fffffff;
+
+    // 25+53 bit pi is good enough for medium size
+    if (ix < 0x4dc90fdb) { // |x| ~< 2^28*(pi/2), medium size
+        // Use a specialized rint() to get fn.
+        @"fn" = @floatCast(f64, x) * invpio2 + toint - toint;
+        n = @floatToInt(i32, @"fn");
+        y.* = x - @"fn" * pio2_1 - @"fn" * pio2_1t;
+        // Step n by one if the remainder fell outside [-pi/4, pi/4]. Matters with directed rounding.
+        if (y.* < -pio4) {
+            n -= 1;
+            @"fn" -= 1;
+            y.* = x - @"fn" * pio2_1 - @"fn" * pio2_1t;
+        } else if (y.* > pio4) {
+            n += 1;
+            @"fn" += 1;
+            y.* = x - @"fn" * pio2_1 - @"fn" * pio2_1t;
+        }
+        return n;
+    }
+    if (ix >= 0x7f800000) { // x is inf or NaN
+        y.* = x - x;
+        return 0;
+    }
+    // scale x into [2^23, 2^24-1]
+    sign = ui >> 31 != 0;
+    e0 = (ix >> 23) - (0x7f + 23); // e0 = ilogb(|x|)-23, positive
+    ui = ix - (e0 << 23); // lower the exponent field by e0
+    tx[0] = @bitCast(f32, ui);
+    n = __rem_pio2_large(&tx, &ty, @intCast(i32, e0), 1, 0); // nx = 1 piece, prec = 0 (single)
+    if (sign) { // tx was built from |x|, so mirror the result for negative x
+        y.* = -ty[0];
+        return -n;
+    }
+    y.* = ty[0];
+    return n;
+}
--- a/lib/std/math/__trig.zig
+++ b/lib/std/math/__trig.zig
@ -0,0 +1,273 @@
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+//
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__cos.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__cosdf.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__sin.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__sindf.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__tan.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/__tandf.c
+
+// kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
+// Input x is assumed to be bounded by ~pi/4 in magnitude.
+// Input y is the tail of x.
+//
+// Algorithm
+//      1. Since cos(-x) = cos(x), we need only to consider positive x.
+//      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
+//      3. cos(x) is approximated by a polynomial of degree 14 on
+//         [0,pi/4]
+//                                       4            14
+//              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
+//         where the remez error is
+//
+//      |              2     4     6     8     10    12     14 |     -58
+//      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
+//      |                                                      |
+//
+//                     4     6     8     10    12     14
+//      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
+//             cos(x) ~ 1 - x*x/2 + r
+//         since cos(x+y) ~ cos(x) - sin(x)*y
+//                        ~ cos(x) - x*y,
+//         a correction term is necessary in cos(x) and hence
+//              cos(x+y) = 1 - (x*x/2 - (r - x*y))
+//         For better accuracy, rearrange to
+//              cos(x+y) ~ w + (tmp + (r-x*y))
+//         where w = 1 - x*x/2 and tmp is a tiny correction term
+//         (1 - x*x/2 == w + tmp exactly in infinite precision).
+//         The exactness of w + tmp in infinite precision depends on w
+//         and tmp having the same precision as x.  If they have extra
+//         precision due to compiler bugs, then the extra precision is
+//         only good provided it is retained in all terms of the final
+//         expression for cos().  Retention happens in all cases tested
+//         under FreeBSD, so don't pessimize things by forcibly clipping
+//         any extra precision in w.
+pub fn __cos(x: f64, y: f64) f64 {
+    const C1 = 4.16666666666666019037e-02; // 0x3FA55555, 0x5555554C
+    const C2 = -1.38888888888741095749e-03; // 0xBF56C16C, 0x16C15177
+    const C3 = 2.48015872894767294178e-05; // 0x3EFA01A0, 0x19CB1590
+    const C4 = -2.75573143513906633035e-07; // 0xBE927E4F, 0x809C52AD
+    const C5 = 2.08757232129817482790e-09; // 0x3E21EE9E, 0xBDB4B1C4
+    const C6 = -1.13596475577881948265e-11; // 0xBDA8FAE9, 0xBE8838D4
+
+    const x2 = x * x;
+    const x4 = x2 * x2;
+    // r = C1*x^2 + C2*x^4 + ... + C6*x^12, evaluated as two short Horner chains.
+    const low_terms = x2 * (C1 + x2 * (C2 + x2 * C3));
+    const high_terms = x4 * x4 * (C4 + x2 * (C5 + x2 * C6));
+    const r = low_terms + high_terms;
+    const half_x2 = 0.5 * x2;
+    const w = 1.0 - half_x2;
+    // (1.0 - w) - half_x2 is the rounding error of w; x * y folds in the tail of x.
+    const correction = ((1.0 - w) - half_x2) + (x2 * r - x * y);
+    return w + correction;
+}
+
+// Kernel cosine for the f32 entry points: evaluates a polynomial in x*x in
+// double precision and rounds the result to f32.
+// NOTE(review): presumably expects |x| bounded by ~pi/4 like __cos - confirm against callers.
+pub fn __cosdf(x: f64) f32 {
+    // |cos(x) - c(x)| < 2**-34.1 (~[-5.37e-11, 5.295e-11]).
+    const C0 = -0x1ffffffd0c5e81.0p-54; // -0.499999997251031003120
+    const C1 = 0x155553e1053a42.0p-57; //  0.0416666233237390631894
+    const C2 = -0x16c087e80f1e27.0p-62; // -0.00138867637746099294692
+    const C3 = 0x199342e0ee5069.0p-68; //  0.0000243904487962774090654
+
+    // Two independent halves, so they can be evaluated in parallel (as in __tandf.c).
+    const x2 = x * x;
+    const x4 = x2 * x2;
+    const head = (1.0 + x2 * C0) + x4 * C1;
+    const tail = (x4 * x2) * (C2 + x2 * C3);
+    return @floatCast(f32, head + tail);
+}
+
+// kernel sin function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
+// Input x is assumed to be bounded by ~pi/4 in magnitude.
+// Input y is the tail of x.
+// Input iy indicates whether y is 0. (if iy=0, y is assumed to be 0).
+//
+// Algorithm
+//      1. Since sin(-x) = -sin(x), we need only to consider positive x.
+//      2. Callers must return sin(-0) = -0 without calling here since our
+//         odd polynomial is not evaluated in a way that preserves -0.
+//         Callers may do the optimization sin(x) ~ x for tiny x.
+//      3. sin(x) is approximated by a polynomial of degree 13 on
+//         [0,pi/4]
+//                               3            13
+//              sin(x) ~ x + S1*x + ... + S6*x
+//         where
+//
+//      |sin(x)         2     4     6     8     10     12  |     -58
+//      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
+//      |  x                                               |
+//
+//      4. sin(x+y) = sin(x) + sin'(x')*y
+//                  ~ sin(x) + (1-x*x/2)*y
+//         For better accuracy, let
+//                   3      2      2      2      2
+//              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
+//         then                   3    2
+//              sin(x) = x + (S1*x + (x *(r-y/2)+y))
+pub fn __sin(x: f64, y: f64, iy: i32) f64 {
+    const S1 = -1.66666666666666324348e-01; // 0xBFC55555, 0x55555549
+    const S2 = 8.33333333332248946124e-03; // 0x3F811111, 0x1110F8A6
+    const S3 = -1.98412698298579493134e-04; // 0xBF2A01A0, 0x19C161D5
+    const S4 = 2.75573137070700676789e-06; // 0x3EC71DE3, 0x57B1FE7D
+    const S5 = -2.50507602534068634195e-08; // 0xBE5AE5E6, 0x8A2B9CEB
+    const S6 = 1.58969099521155010221e-10; // 0x3DE5D93A, 0x5ACFD57C
+
+    const x2 = x * x;
+    const x4 = x2 * x2;
+    const x3 = x2 * x;
+    const r = S2 + x2 * (S3 + x2 * S4) + x2 * x4 * (S5 + x2 * S6);
+
+    // iy == 0: the tail y is ignored and the plain odd polynomial is used.
+    if (iy == 0) return x + x3 * (S1 + x2 * r);
+
+    // Otherwise fold the tail y into the correction term.
+    return x - ((x2 * (0.5 * y - x3 * r) - y) - x3 * S1);
+}
+
+// Kernel sine for the f32 entry points: evaluates an odd polynomial in
+// double precision and rounds the result to f32.
+// NOTE(review): presumably expects |x| bounded by ~pi/4 like __sin - confirm against callers.
+pub fn __sindf(x: f64) f32 {
+    // |sin(x)/x - s(x)| < 2**-37.5 (~[-4.89e-12, 4.824e-12]).
+    const S1 = -0x15555554cbac77.0p-55; // -0.166666666416265235595
+    const S2 = 0x111110896efbb2.0p-59; //  0.0083333293858894631756
+    const S3 = -0x1a00f9e2cae774.0p-65; // -0.000198393348360966317347
+    const S4 = 0x16cd878c3b46a7.0p-71; //  0.0000027183114939898219064
+
+    // Two independent halves, so they can be evaluated in parallel (as in __tandf.c).
+    const x2 = x * x;
+    const x4 = x2 * x2;
+    const x3 = x2 * x;
+    const head = x + x3 * (S1 + x2 * S2);
+    const tail = x3 * x4 * (S3 + x2 * S4);
+    return @floatCast(f32, head + tail);
+}
+
+// kernel tan function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
+// Input x is assumed to be bounded by ~pi/4 in magnitude.
+// Input y is the tail of x.
+// Input odd indicates whether tan (if odd = 0) or -1/tan (if odd = 1) is returned.
+//
+// Algorithm
+//      1. Since tan(-x) = -tan(x), we need only to consider positive x.
+//      2. Callers must return tan(-0) = -0 without calling here since our
+//         odd polynomial is not evaluated in a way that preserves -0.
+//         Callers may do the optimization tan(x) ~ x for tiny x.
+//      3. tan(x) is approximated by an odd polynomial of degree 27 on
+//         [0,0.67434]
+//                               3             27
+//              tan(x) ~ x + T1*x + ... + T13*x
+//         where
+//
+//              |tan(x)         2     4            26   |     -59.2
+//              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
+//              |  x                                    |
+//
+//         Note: tan(x+y) = tan(x) + tan'(x)*y
+//                        ~ tan(x) + (1+x*x)*y
+//         Therefore, for better accuracy in computing tan(x+y), let
+//                   3      2      2       2       2
+//              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
+//         then
+//                                  3    2
+//              tan(x+y) = x + (T1*x + (x *(r+y)+y))
+//
+//      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
+//              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
+//                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
+pub fn __tan(x_: f64, y_: f64, odd: bool) f64 {
+    // Polynomial coefficients T1..T13, stored zero-based.
+    const T = [_]f64{
+        3.33333333333334091986e-01, // 3FD55555, 55555563
+        1.33333333333201242699e-01, // 3FC11111, 1110FE7A
+        5.39682539762260521377e-02, // 3FABA1BA, 1BB341FE
+        2.18694882948595424599e-02, // 3F9664F4, 8406D637
+        8.86323982359930005737e-03, // 3F8226E3, E96E8493
+        3.59207910759131235356e-03, // 3F6D6D22, C9560328
+        1.45620945432529025516e-03, // 3F57DBC8, FEE08315
+        5.88041240820264096874e-04, // 3F4344D8, F2F26501
+        2.46463134818469906812e-04, // 3F3026F7, 1A8D1068
+        7.81794442939557092300e-05, // 3F147E88, A03792A6
+        7.14072491382608190305e-05, // 3F12B80F, 32F0A7E9
+        -1.85586374855275456654e-05, // BEF375CB, DB605373
+        2.59073051863633712884e-05, // 3EFB2A70, 74BF7AD4
+    };
+    const pio4 = 7.85398163397448278999e-01; // 3FE921FB, 54442D18
+    const pio4lo = 3.06161699786838301793e-17; // 3C81A626, 33145C07
+
+    // Classify the argument from the high 32 bits of its bit pattern.
+    const hx = @intCast(u32, @bitCast(u64, x_) >> 32);
+    const big = (hx & 0x7fffffff) >= 0x3FE59428; // |x| >= 0.6744
+    const negative = hx >> 31 != 0;
+
+    var x = x_;
+    var y = y_;
+    if (big) {
+        // Work on |x| and replace it by pi/4 - |x|; the result is mapped back below.
+        if (negative) {
+            x = -x;
+            y = -y;
+        }
+        x = (pio4 - x) + (pio4lo - y);
+        y = 0.0;
+    }
+
+    const z = x * x;
+    const z2 = z * z;
+
+    // Break x^5*(T[1]+x^2*T[2]+...) into
+    // x^5*(T[1]+x^4*T[3]+...+x^20*T[11]) +
+    // x^5*(x^2*(T[2]+x^4*T[4]+...+x^22*T[12]))
+    const poly_a = T[1] + z2 * (T[3] + z2 * (T[5] + z2 * (T[7] + z2 * (T[9] + z2 * T[11]))));
+    const poly_b = z * (T[2] + z2 * (T[4] + z2 * (T[6] + z2 * (T[8] + z2 * (T[10] + z2 * T[12])))));
+    const x3 = z * x;
+    const r = y + z * (x3 * (poly_a + poly_b) + y) + x3 * T[0];
+    const t = x + r;
+
+    if (big) {
+        const unit = 1 - 2 * @intToFloat(f64, @boolToInt(odd)); // +1 when odd is false, -1 when true
+        const v = unit - 2.0 * (x + (r - t * t / (t + unit)));
+        return if (negative) -v else v;
+    }
+    if (!odd) {
+        return t;
+    }
+
+    // -1.0/(x+r) has up to 2ulp error, so compute it accurately
+    const t_hi = @bitCast(f64, @bitCast(u64, t) & 0xffffffff00000000); // t with its low word cleared
+    const t_lo = r - (t_hi - x); // t_hi+t_lo = r+x
+    const a = -1.0 / t;
+    const a_hi = @bitCast(f64, @bitCast(u64, a) & 0xffffffff00000000); // a with its low word cleared
+    return a_hi + a * (1.0 + a_hi * t_hi + a_hi * t_lo);
+}
+
+// Kernel tangent for the f32 entry points, evaluated in double precision.
+// Returns the polynomial approximation of tan(x) when odd is false and
+// -1/tan(x) when odd is true, rounded to f32.
+// NOTE(review): presumably expects |x| bounded by ~pi/4 like __tan - confirm against callers.
+pub fn __tandf(x: f64, odd: bool) f32 {
+    // |tan(x)/x - t(x)| < 2**-25.5 (~[-2e-08, 2e-08]).
+    const T = [_]f64{
+        0x15554d3418c99f.0p-54, // 0.333331395030791399758
+        0x1112fd38999f72.0p-55, // 0.133392002712976742718
+        0x1b54c91d865afe.0p-57, // 0.0533812378445670393523
+        0x191df3908c33ce.0p-58, // 0.0245283181166547278873
+        0x185dadfcecf44e.0p-61, // 0.00297435743359967304927
+        0x1362b9bf971bcd.0p-59, // 0.00946564784943673166728
+    };
+
+    const x2 = x * x;
+    // Split up the polynomial into small independent terms to give
+    // opportunities for parallel evaluation.  The chosen splitting is
+    // micro-optimized for Athlons (XP, X64).  It costs 2 multiplications
+    // relative to Horner's method on sequential machines.
+    //
+    // We add the small terms from lowest degree up for efficiency on
+    // non-sequential machines (the lowest degree terms tend to be ready
+    // earlier).  Apart from this, we don't care about order of
+    // operations, and don't need to care since we have precision to
+    // spare.  However, the chosen splitting is good for accuracy too,
+    // and would give results as accurate as Horner's method if the
+    // small terms were added from highest degree down.
+    const high_pair = T[4] + x2 * T[5];
+    const mid_pair = T[2] + x2 * T[3];
+    const x4 = x2 * x2;
+    const x3 = x2 * x;
+    const low_pair = T[0] + x2 * T[1];
+    const tangent = (x + x3 * low_pair) + (x3 * x4) * (mid_pair + x4 * high_pair);
+
+    if (odd) {
+        return @floatCast(f32, -1.0 / tangent);
+    }
+    return @floatCast(f32, tangent);
+}
--- a/lib/std/math/cos.zig
+++ b/lib/std/math/cos.zig
@ -1,12 +1,17 @@
-// Ported from go, which is licensed under a BSD-3 license.
-// https://golang.org/LICENSE
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
 //
-// https://golang.org/src/math/sin.go
+// https://git.musl-libc.org/cgit/musl/tree/src/math/cosf.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/cos.c

 const std = @import("../std.zig");
 const math = std.math;
 const expect = std.testing.expect;

+const kernel = @import("__trig.zig");
+const __rem_pio2 = @import("__rem_pio2.zig").__rem_pio2;
+const __rem_pio2f = @import("__rem_pio2f.zig").__rem_pio2f;
+
 /// Returns the cosine of the radian value x.
 ///
 /// Special Cases:
@ -15,109 +20,135 @@ const expect = std.testing.expect;
 pub fn cos(x: anytype) @TypeOf(x) {
    const T = @TypeOf(x);
    return switch (T) {
-        f32 => cos_(f32, x),
-        f64 => cos_(f64, x),
+        f32 => cos32(x), // single-precision implementation
+        f64 => cos64(x), // double-precision implementation
        else => @compileError("cos not implemented for " ++ @typeName(T)),
    };
 }

-// sin polynomial coefficients
-const S0 = 1.58962301576546568060E-10;
-const S1 = -2.50507477628578072866E-8;
-const S2 = 2.75573136213857245213E-6;
-const S3 = -1.98412698295895385996E-4;
-const S4 = 8.33333333332211858878E-3;
-const S5 = -1.66666666666666307295E-1;
+fn cos32(x: f32) f32 { // f32 cosine; small ranges are reduced inline, the rest via __rem_pio2f
+    // Small multiples of pi/2 rounded to double precision.
+    const c1pio2: f64 = 1.0 * math.pi / 2.0; // 0x3FF921FB, 0x54442D18
+    const c2pio2: f64 = 2.0 * math.pi / 2.0; // 0x400921FB, 0x54442D18
+    const c3pio2: f64 = 3.0 * math.pi / 2.0; // 0x4012D97C, 0x7F3321D2
+    const c4pio2: f64 = 4.0 * math.pi / 2.0; // 0x401921FB, 0x54442D18

-// cos polynomial coeffiecients
-const C0 = -1.13585365213876817300E-11;
-const C1 = 2.08757008419747316778E-9;
-const C2 = -2.75573141792967388112E-7;
-const C3 = 2.48015872888517045348E-5;
-const C4 = -1.38888888888730564116E-3;
-const C5 = 4.16666666666665929218E-2;
+    var ix = @bitCast(u32, x); // raw IEEE-754 bits of x
+    const sign = ix >> 31 != 0; // true when the sign bit of x is set
+    ix &= 0x7fffffff; // clear the sign bit: ix is now the bit pattern of |x|

-const pi4a = 7.85398125648498535156e-1;
-const pi4b = 3.77489470793079817668E-8;
-const pi4c = 2.69515142907905952645E-15;
-const m4pi = 1.273239544735162542821171882678754627704620361328125;
-
-fn cos_(comptime T: type, x_: T) T {
-    const I = std.meta.Int(.signed, @typeInfo(T).Float.bits);
-
-    var x = x_;
-    if (math.isNan(x) or math.isInf(x)) {
-        return math.nan(T);
+    if (ix <= 0x3f490fda) { // |x| ~<= pi/4
+        if (ix < 0x39800000) { // |x| < 2**-12
+            // raise inexact if x != 0
+            math.doNotOptimizeAway(x + 0x1p120);
+            return 1.0;
+        }
+        return kernel.__cosdf(x);
+    }
+    if (ix <= 0x407b53d1) { // |x| ~<= 5*pi/4
+        if (ix > 0x4016cbe3) { // |x|  ~> 3*pi/4
+            return -kernel.__cosdf(if (sign) x + c2pio2 else x - c2pio2); // cos(x) = -cos(x -/+ pi)
+        } else {
+            if (sign) {
+                return kernel.__sindf(x + c1pio2); // cos(x) = sin(x + pi/2)
+            } else {
+                return kernel.__sindf(c1pio2 - x); // cos(x) = sin(pi/2 - x)
+            }
+        }
+    }
+    if (ix <= 0x40e231d5) { // |x| ~<= 9*pi/4
+        if (ix > 0x40afeddf) { // |x| ~> 7*pi/4
+            return kernel.__cosdf(if (sign) x + c4pio2 else x - c4pio2); // cos(x) = cos(x -/+ 2*pi)
+        } else {
+            if (sign) {
+                return kernel.__sindf(-x - c3pio2); // cos(x) = sin(-x - 3*pi/2)
+            } else {
+                return kernel.__sindf(x - c3pio2); // cos(x) = sin(x - 3*pi/2)
+            }
+        }
    }

-    var sign = false;
-    x = math.fabs(x);
-
-    var y = math.floor(x * m4pi);
-    var j = @floatToInt(I, y);
-
-    if (j & 1 == 1) {
-        j += 1;
-        y += 1;
+    // cos(Inf or NaN) is NaN
+    if (ix >= 0x7f800000) {
+        return x - x;
    }

-    j &= 7;
-    if (j > 3) {
-        j -= 4;
-        sign = !sign;
-    }
-    if (j > 1) {
-        sign = !sign;
+    var y: f64 = undefined; // filled in by __rem_pio2f below
+    const n = __rem_pio2f(x, &y); // general argument reduction: remainder in y, multiple of pi/2 in n
+    return switch (n & 3) { // only n mod 4 selects the kernel and sign
+        0 => kernel.__cosdf(y),
+        1 => kernel.__sindf(-y),
+        2 => -kernel.__cosdf(y),
+        else => kernel.__sindf(y),
+    };
+}
+
+fn cos64(x: f64) f64 { // f64 cosine; anything above ~pi/4 goes through __rem_pio2
+    var ix = @bitCast(u64, x) >> 32; // high 32 bits of the bit pattern of x
+    ix &= 0x7fffffff; // clear the sign bit

+    // |x| ~< pi/4
+    if (ix <= 0x3fe921fb) {
+        if (ix < 0x3e46a09e) { // |x| < 2**-27 * sqrt(2)
+            // raise inexact if x!=0
+            math.doNotOptimizeAway(x + 0x1p120);
+            return 1.0;
+        }
+        return kernel.__cos(x, 0); // no reduction was needed, so the tail argument is 0
    }

-    const z = ((x - y * pi4a) - y * pi4b) - y * pi4c;
-    const w = z * z;
+    // cos(Inf or NaN) is NaN
+    if (ix >= 0x7ff00000) {
+        return x - x;
+    }

-    const r = if (j == 1 or j == 2)
-        z + z * w * (S5 + w * (S4 + w * (S3 + w * (S2 + w * (S1 + w * S0)))))
-    else
-        1.0 - 0.5 * w + w * w * (C5 + w * (C4 + w * (C3 + w * (C2 + w * (C1 + w * C0)))));
-
-    return if (sign) -r else r;
+    var y: [2]f64 = undefined; // filled in by __rem_pio2; y[1] is passed to the kernels as the tail of y[0]
+    const n = __rem_pio2(x, &y); // argument reduction
+    return switch (n & 3) { // only n mod 4 selects the kernel and sign
+        0 => kernel.__cos(y[0], y[1]),
+        1 => -kernel.__sin(y[0], y[1], 1),
+        2 => -kernel.__cos(y[0], y[1]),
+        else => kernel.__sin(y[0], y[1], 1),
+    };
 }

 test "math.cos" {
-    try expect(cos(@as(f32, 0.0)) == cos_(f32, 0.0));
-    try expect(cos(@as(f64, 0.0)) == cos_(f64, 0.0));
+    try expect(cos(@as(f32, 0.0)) == cos32(0.0)); // generic entry point must agree with the f32 implementation
+    try expect(cos(@as(f64, 0.0)) == cos64(0.0)); // generic entry point must agree with the f64 implementation
 }

 test "math.cos32" {
-    const epsilon = 0.000001;
+    const epsilon = 0.00001; // absolute tolerance used for every f32 comparison below

-    try expect(math.approxEqAbs(f32, cos_(f32, 0.0), 1.0, epsilon));
-    try expect(math.approxEqAbs(f32, cos_(f32, 0.2), 0.980067, epsilon));
-    try expect(math.approxEqAbs(f32, cos_(f32, 0.8923), 0.627623, epsilon));
-    try expect(math.approxEqAbs(f32, cos_(f32, 1.5), 0.070737, epsilon));
-    try expect(math.approxEqAbs(f32, cos_(f32, -1.5), 0.070737, epsilon));
-    try expect(math.approxEqAbs(f32, cos_(f32, 37.45), 0.969132, epsilon));
-    try expect(math.approxEqAbs(f32, cos_(f32, 89.123), 0.400798, epsilon));
+    try expect(math.approxEqAbs(f32, cos32(0.0), 1.0, epsilon)); // tiny-argument path returns exactly 1
+    try expect(math.approxEqAbs(f32, cos32(0.2), 0.980067, epsilon)); // |x| <= pi/4: direct kernel call
+    try expect(math.approxEqAbs(f32, cos32(0.8923), 0.627623, epsilon)); // just above pi/4
+    try expect(math.approxEqAbs(f32, cos32(1.5), 0.070737, epsilon));
+    try expect(math.approxEqAbs(f32, cos32(-1.5), 0.070737, epsilon)); // cosine is even
+    try expect(math.approxEqAbs(f32, cos32(37.45), 0.969132, epsilon)); // above 9*pi/4: uses __rem_pio2f
+    try expect(math.approxEqAbs(f32, cos32(89.123), 0.400798, epsilon));
 }

 test "math.cos64" {
    const epsilon = 0.000001;

-    try expect(math.approxEqAbs(f64, cos_(f64, 0.0), 1.0, epsilon));
-    try expect(math.approxEqAbs(f64, cos_(f64, 0.2), 0.980067, epsilon));
-    try expect(math.approxEqAbs(f64, cos_(f64, 0.8923), 0.627623, epsilon));
-    try expect(math.approxEqAbs(f64, cos_(f64, 1.5), 0.070737, epsilon));
-    try expect(math.approxEqAbs(f64, cos_(f64, -1.5), 0.070737, epsilon));
-    try expect(math.approxEqAbs(f64, cos_(f64, 37.45), 0.969132, epsilon));
-    try expect(math.approxEqAbs(f64, cos_(f64, 89.123), 0.40080, epsilon));
+    try expect(math.approxEqAbs(f64, cos64(0.0), 1.0, epsilon)); // tiny-argument path returns exactly 1
+    try expect(math.approxEqAbs(f64, cos64(0.2), 0.980067, epsilon)); // |x| <= pi/4: direct kernel call
+    try expect(math.approxEqAbs(f64, cos64(0.8923), 0.627623, epsilon)); // above pi/4: uses __rem_pio2
+    try expect(math.approxEqAbs(f64, cos64(1.5), 0.070737, epsilon));
+    try expect(math.approxEqAbs(f64, cos64(-1.5), 0.070737, epsilon)); // cosine is even
+    try expect(math.approxEqAbs(f64, cos64(37.45), 0.969132, epsilon));
+    try expect(math.approxEqAbs(f64, cos64(89.123), 0.40080, epsilon));
 }

 test "math.cos32.special" {
-    try expect(math.isNan(cos_(f32, math.inf(f32))));
-    try expect(math.isNan(cos_(f32, -math.inf(f32))));
-    try expect(math.isNan(cos_(f32, math.nan(f32))));
+    try expect(math.isNan(cos32(math.inf(f32)))); // cos32 returns x - x for inf/NaN, which is NaN
+    try expect(math.isNan(cos32(-math.inf(f32))));
+    try expect(math.isNan(cos32(math.nan(f32))));
 }

 test "math.cos64.special" {
-    try expect(math.isNan(cos_(f64, math.inf(f64))));
-    try expect(math.isNan(cos_(f64, -math.inf(f64))));
-    try expect(math.isNan(cos_(f64, math.nan(f64))));
+    try expect(math.isNan(cos64(math.inf(f64)))); // cos64 returns x - x for inf/NaN, which is NaN
+    try expect(math.isNan(cos64(-math.inf(f64))));
+    try expect(math.isNan(cos64(math.nan(f64))));
 }
--- a/lib/std/math/sin.zig
+++ b/lib/std/math/sin.zig
@ -1,12 +1,17 @@
-// Ported from go, which is licensed under a BSD-3 license.
-// https://golang.org/LICENSE
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+//
+// https://git.musl-libc.org/cgit/musl/tree/src/math/sinf.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/sin.c
 //
-// https://golang.org/src/math/sin.go
-
 const std = @import("../std.zig");
 const math = std.math;
 const expect = std.testing.expect;

+const kernel = @import("__trig.zig");
+const __rem_pio2 = @import("__rem_pio2.zig").__rem_pio2;
+const __rem_pio2f = @import("__rem_pio2f.zig").__rem_pio2f;
+
 /// Returns the sine of the radian value x.
 ///
 /// Special Cases:
@ -16,114 +21,148 @@ const expect = std.testing.expect;
 pub fn sin(x: anytype) @TypeOf(x) {
    const T = @TypeOf(x);
    return switch (T) {
-        f32 => sin_(T, x),
-        f64 => sin_(T, x),
+        f32 => sin32(x), // single-precision implementation
+        f64 => sin64(x), // double-precision implementation
        else => @compileError("sin not implemented for " ++ @typeName(T)),
    };
 }

-// sin polynomial coefficients
-const S0 = 1.58962301576546568060E-10;
-const S1 = -2.50507477628578072866E-8;
-const S2 = 2.75573136213857245213E-6;
-const S3 = -1.98412698295895385996E-4;
-const S4 = 8.33333333332211858878E-3;
-const S5 = -1.66666666666666307295E-1;
+fn sin32(x: f32) f32 { // f32 sine; small ranges are reduced inline, the rest via __rem_pio2f
+    // Small multiples of pi/2 rounded to double precision.
+    const s1pio2: f64 = 1.0 * math.pi / 2.0; // 0x3FF921FB, 0x54442D18
+    const s2pio2: f64 = 2.0 * math.pi / 2.0; // 0x400921FB, 0x54442D18
+    const s3pio2: f64 = 3.0 * math.pi / 2.0; // 0x4012D97C, 0x7F3321D2
+    const s4pio2: f64 = 4.0 * math.pi / 2.0; // 0x401921FB, 0x54442D18

-// cos polynomial coeffiecients
-const C0 = -1.13585365213876817300E-11;
-const C1 = 2.08757008419747316778E-9;
-const C2 = -2.75573141792967388112E-7;
-const C3 = 2.48015872888517045348E-5;
-const C4 = -1.38888888888730564116E-3;
-const C5 = 4.16666666666665929218E-2;
+    var ix = @bitCast(u32, x); // raw IEEE-754 bits of x
+    const sign = ix >> 31 != 0; // true when the sign bit of x is set
+    ix &= 0x7fffffff; // clear the sign bit: ix is now the bit pattern of |x|

-const pi4a = 7.85398125648498535156e-1;
-const pi4b = 3.77489470793079817668E-8;
-const pi4c = 2.69515142907905952645E-15;
-const m4pi = 1.273239544735162542821171882678754627704620361328125;
-
-fn sin_(comptime T: type, x_: T) T {
-    const I = std.meta.Int(.signed, @typeInfo(T).Float.bits);
-
-    var x = x_;
-    if (x == 0 or math.isNan(x)) {
-        return x;
+    if (ix <= 0x3f490fda) { // |x| ~<= pi/4
+        if (ix < 0x39800000) { // |x| < 2**-12
+            // raise inexact if x!=0 and underflow if subnormal
+            math.doNotOptimizeAway(if (ix < 0x00800000) x / 0x1p120 else x + 0x1p120);
+            return x;
+        }
+        return kernel.__sindf(x);
    }
-    if (math.isInf(x)) {
-        return math.nan(T);
+    if (ix <= 0x407b53d1) { // |x| ~<= 5*pi/4
+        if (ix <= 0x4016cbe3) { // |x| ~<= 3pi/4
+            if (sign) {
+                return -kernel.__cosdf(x + s1pio2); // sin(x) = -cos(x + pi/2)
+            } else {
+                return kernel.__cosdf(x - s1pio2); // sin(x) = cos(x - pi/2)
+            }
+        }
+        return kernel.__sindf(if (sign) -(x + s2pio2) else -(x - s2pio2)); // sin(x) = sin(-(x +/- pi))
+    }
+    if (ix <= 0x40e231d5) { // |x| ~<= 9*pi/4
+        if (ix <= 0x40afeddf) { // |x| ~<= 7*pi/4
+            if (sign) {
+                return kernel.__cosdf(x + s3pio2); // sin(x) = cos(x + 3*pi/2)
+            } else {
+                return -kernel.__cosdf(x - s3pio2); // sin(x) = -cos(x - 3*pi/2)
+            }
+        }
+        return kernel.__sindf(if (sign) x + s4pio2 else x - s4pio2); // sin(x) = sin(x +/- 2*pi)
    }

-    var sign = x < 0;
-    x = math.fabs(x);
-
-    var y = math.floor(x * m4pi);
-    var j = @floatToInt(I, y);
-
-    if (j & 1 == 1) {
-        j += 1;
-        y += 1;
+    // sin(Inf or NaN) is NaN
+    if (ix >= 0x7f800000) {
+        return x - x;
    }

-    j &= 7;
-    if (j > 3) {
-        j -= 4;
-        sign = !sign;
+    var y: f64 = undefined; // filled in by __rem_pio2f below
+    const n = __rem_pio2f(x, &y); // general argument reduction: remainder in y, multiple of pi/2 in n
+    return switch (n & 3) { // only n mod 4 selects the kernel and sign
+        0 => kernel.__sindf(y),
+        1 => kernel.__cosdf(y),
+        2 => kernel.__sindf(-y),
+        else => -kernel.__cosdf(y),
+    };
+}
+
+fn sin64(x: f64) f64 { // f64 sine; anything above ~pi/4 goes through __rem_pio2
+    var ix = @bitCast(u64, x) >> 32; // high 32 bits of the bit pattern of x
+    ix &= 0x7fffffff; // clear the sign bit

+    // |x| ~< pi/4
+    if (ix <= 0x3fe921fb) {
+        if (ix < 0x3e500000) { // |x| < 2**-26
+            // raise inexact if x != 0 and underflow if subnormal
+            math.doNotOptimizeAway(if (ix < 0x00100000) x / 0x1p120 else x + 0x1p120);
+            return x;
+        }
+        return kernel.__sin(x, 0.0, 0); // iy = 0: no tail to fold in
    }

-    const z = ((x - y * pi4a) - y * pi4b) - y * pi4c;
-    const w = z * z;
+    // sin(Inf or NaN) is NaN
+    if (ix >= 0x7ff00000) {
+        return x - x;
+    }

-    const r = if (j == 1 or j == 2)
-        1.0 - 0.5 * w + w * w * (C5 + w * (C4 + w * (C3 + w * (C2 + w * (C1 + w * C0)))))
-    else
-        z + z * w * (S5 + w * (S4 + w * (S3 + w * (S2 + w * (S1 + w * S0)))));
-
-    return if (sign) -r else r;
+    var y: [2]f64 = undefined; // filled in by __rem_pio2; y[1] is passed to the kernels as the tail of y[0]
+    const n = __rem_pio2(x, &y); // argument reduction
+    return switch (n & 3) { // only n mod 4 selects the kernel and sign
+        0 => kernel.__sin(y[0], y[1], 1),
+        1 => kernel.__cos(y[0], y[1]),
+        2 => -kernel.__sin(y[0], y[1], 1),
+        else => -kernel.__cos(y[0], y[1]),
+    };
 }

 test "math.sin" {
-    try expect(sin(@as(f32, 0.0)) == sin_(f32, 0.0));
-    try expect(sin(@as(f64, 0.0)) == sin_(f64, 0.0));
+    try expect(sin(@as(f32, 0.0)) == sin32(0.0)); // generic entry point must agree with the f32 implementation
+    try expect(sin(@as(f64, 0.0)) == sin64(0.0)); // generic entry point must agree with the f64 implementation
    try expect(comptime (math.sin(@as(f64, 2))) == math.sin(@as(f64, 2)));
 }

 test "math.sin32" {
-    const epsilon = 0.000001;
+    const epsilon = 0.00001; // absolute tolerance used for every f32 comparison below

-    try expect(math.approxEqAbs(f32, sin_(f32, 0.0), 0.0, epsilon));
-    try expect(math.approxEqAbs(f32, sin_(f32, 0.2), 0.198669, epsilon));
-    try expect(math.approxEqAbs(f32, sin_(f32, 0.8923), 0.778517, epsilon));
-    try expect(math.approxEqAbs(f32, sin_(f32, 1.5), 0.997495, epsilon));
-    try expect(math.approxEqAbs(f32, sin_(f32, -1.5), -0.997495, epsilon));
-    try expect(math.approxEqAbs(f32, sin_(f32, 37.45), -0.246544, epsilon));
-    try expect(math.approxEqAbs(f32, sin_(f32, 89.123), 0.916166, epsilon));
+    try expect(math.approxEqAbs(f32, sin32(0.0), 0.0, epsilon)); // tiny-argument path returns x itself
+    try expect(math.approxEqAbs(f32, sin32(0.2), 0.198669, epsilon)); // |x| <= pi/4: direct kernel call
+    try expect(math.approxEqAbs(f32, sin32(0.8923), 0.778517, epsilon)); // just above pi/4
+    try expect(math.approxEqAbs(f32, sin32(1.5), 0.997495, epsilon));
+    try expect(math.approxEqAbs(f32, sin32(-1.5), -0.997495, epsilon)); // sine is odd
+    try expect(math.approxEqAbs(f32, sin32(37.45), -0.246544, epsilon)); // above 9*pi/4: uses __rem_pio2f
+    try expect(math.approxEqAbs(f32, sin32(89.123), 0.916166, epsilon));
 }

 test "math.sin64" {
    const epsilon = 0.000001;

-    try expect(math.approxEqAbs(f64, sin_(f64, 0.0), 0.0, epsilon));
-    try expect(math.approxEqAbs(f64, sin_(f64, 0.2), 0.198669, epsilon));
-    try expect(math.approxEqAbs(f64, sin_(f64, 0.8923), 0.778517, epsilon));
-    try expect(math.approxEqAbs(f64, sin_(f64, 1.5), 0.997495, epsilon));
-    try expect(math.approxEqAbs(f64, sin_(f64, -1.5), -0.997495, epsilon));
-    try expect(math.approxEqAbs(f64, sin_(f64, 37.45), -0.246543, epsilon));
-    try expect(math.approxEqAbs(f64, sin_(f64, 89.123), 0.916166, epsilon));
+    try expect(math.approxEqAbs(f64, sin64(0.0), 0.0, epsilon)); // tiny-argument path returns x itself
+    try expect(math.approxEqAbs(f64, sin64(0.2), 0.198669, epsilon)); // |x| <= pi/4: direct kernel call
+    try expect(math.approxEqAbs(f64, sin64(0.8923), 0.778517, epsilon)); // above pi/4: uses __rem_pio2
+    try expect(math.approxEqAbs(f64, sin64(1.5), 0.997495, epsilon));
+    try expect(math.approxEqAbs(f64, sin64(-1.5), -0.997495, epsilon)); // sine is odd
+    try expect(math.approxEqAbs(f64, sin64(37.45), -0.246543, epsilon));
+    try expect(math.approxEqAbs(f64, sin64(89.123), 0.916166, epsilon));
 }

 test "math.sin32.special" {
-    try expect(sin_(f32, 0.0) == 0.0);
-    try expect(sin_(f32, -0.0) == -0.0);
-    try expect(math.isNan(sin_(f32, math.inf(f32))));
-    try expect(math.isNan(sin_(f32, -math.inf(f32))));
-    try expect(math.isNan(sin_(f32, math.nan(f32))));
+    try expect(sin32(0.0) == 0.0);
+    try expect(sin32(-0.0) == -0.0);
+    // -0.0 == 0.0 compares equal, so the two checks above cannot detect a wrong
+    // sign of zero; verify the sign bit explicitly.
+    try expect(!math.signbit(sin32(@as(f32, 0.0))));
+    try expect(math.signbit(sin32(-@as(f32, 0.0))));
+    // sin32 returns x - x for inf/NaN, which is NaN.
+    try expect(math.isNan(sin32(math.inf(f32))));
+    try expect(math.isNan(sin32(-math.inf(f32))));
+    try expect(math.isNan(sin32(math.nan(f32))));
 }

 test "math.sin64.special" {
-    try expect(sin_(f64, 0.0) == 0.0);
-    try expect(sin_(f64, -0.0) == -0.0);
-    try expect(math.isNan(sin_(f64, math.inf(f64))));
-    try expect(math.isNan(sin_(f64, -math.inf(f64))));
-    try expect(math.isNan(sin_(f64, math.nan(f64))));
+    try expect(sin64(0.0) == 0.0);
+    try expect(sin64(-0.0) == -0.0);
+    // -0.0 == 0.0 compares equal, so the two checks above cannot detect a wrong
+    // sign of zero; verify the sign bit explicitly.
+    try expect(!math.signbit(sin64(@as(f64, 0.0))));
+    try expect(math.signbit(sin64(-@as(f64, 0.0))));
+    // sin64 returns x - x for inf/NaN, which is NaN.
+    try expect(math.isNan(sin64(math.inf(f64))));
+    try expect(math.isNan(sin64(-math.inf(f64))));
+    try expect(math.isNan(sin64(math.nan(f64))));
+}
+
+test "math.sin32 #9901" {
+    const float = @bitCast(f32, @as(u32, 0b11100011111111110000000000000000)); // raw bits of a large-magnitude negative f32
+    _ = std.math.sin(float); // result is discarded: the test only requires that the call completes
+}
+
+test "math.sin64 #9901" {
+    const float = @bitCast(f64, @as(u64, 0b1111111101000001000000001111110111111111100000000000000000000001)); // raw bits of a large-magnitude negative f64
+    _ = std.math.sin(float); // result is discarded: the test only requires that the call completes
 }
--- a/lib/std/math/tan.zig
+++ b/lib/std/math/tan.zig
@ -1,12 +1,18 @@
-// Ported from go, which is licensed under a BSD-3 license.
-// https://golang.org/LICENSE
+// Ported from musl, which is licensed under the MIT license:
+// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
 //
+// https://git.musl-libc.org/cgit/musl/tree/src/math/tanf.c
+// https://git.musl-libc.org/cgit/musl/tree/src/math/tan.c
 // https://golang.org/src/math/tan.go

 const std = @import("../std.zig");
 const math = std.math;
 const expect = std.testing.expect;

+const kernel = @import("__trig.zig");
+const __rem_pio2 = @import("__rem_pio2.zig").__rem_pio2;
+const __rem_pio2f = @import("__rem_pio2f.zig").__rem_pio2f;
+
 /// Returns the tangent of the radian value x.
 ///
 /// Special Cases:
@ -16,102 +22,119 @@ const expect = std.testing.expect;
 pub fn tan(x: anytype) @TypeOf(x) {
    const T = @TypeOf(x);
    return switch (T) {
-        f32 => tan_(f32, x),
-        f64 => tan_(f64, x),
+        f32 => tan32(x), // single-precision implementation
+        f64 => tan64(x), // double-precision implementation
        else => @compileError("tan not implemented for " ++ @typeName(T)),
    };
 }

-const Tp0 = -1.30936939181383777646E4;
-const Tp1 = 1.15351664838587416140E6;
-const Tp2 = -1.79565251976484877988E7;
+fn tan32(x: f32) f32 { // f32 tangent; the reduced argument handed to kernel.__tandf is computed in f64
+    // Small multiples of pi/2 rounded to double precision (trailing hex: high and low 32-bit words).
+    const t1pio2: f64 = 1.0 * math.pi / 2.0; // 0x3FF921FB, 0x54442D18
+    const t2pio2: f64 = 2.0 * math.pi / 2.0; // 0x400921FB, 0x54442D18
+    const t3pio2: f64 = 3.0 * math.pi / 2.0; // 0x4012D97C, 0x7F3321D2
+    const t4pio2: f64 = 4.0 * math.pi / 2.0; // 0x401921FB, 0x54442D18

-const Tq1 = 1.36812963470692954678E4;
-const Tq2 = -1.32089234440210967447E6;
-const Tq3 = 2.50083801823357915839E7;
-const Tq4 = -5.38695755929454629881E7;
+    var ix = @bitCast(u32, x);
+    const sign = ix >> 31 != 0; // true when the sign bit of x is set
+    ix &= 0x7fffffff; // clear the sign bit: ix now holds the bit pattern of |x|

-const pi4a = 7.85398125648498535156e-1;
-const pi4b = 3.77489470793079817668E-8;
-const pi4c = 2.69515142907905952645E-15;
-const m4pi = 1.273239544735162542821171882678754627704620361328125;
-
-fn tan_(comptime T: type, x_: T) T {
-    const I = std.meta.Int(.signed, @typeInfo(T).Float.bits);
-
-    var x = x_;
-    if (x == 0 or math.isNan(x)) {
-        return x;
+    if (ix <= 0x3f490fda) { // |x| ~<= pi/4
+        if (ix < 0x39800000) { // |x| < 2**-12
+            // raise inexact if x!=0 and underflow if subnormal; the value itself is unused
+            math.doNotOptimizeAway(if (ix < 0x00800000) x / 0x1p120 else x + 0x1p120);
+            return x;
+        }
+        return kernel.__tandf(x, false);
    }
-    if (math.isInf(x)) {
-        return math.nan(T);
+    if (ix <= 0x407b53d1) { // |x| ~<= 5*pi/4
+        if (ix <= 0x4016cbe3) { // |x| ~<= 3pi/4
+            return kernel.__tandf((if (sign) x + t1pio2 else x - t1pio2), true);
+        } else {
+            return kernel.__tandf((if (sign) x + t2pio2 else x - t2pio2), false);
+        }
+    }
+    if (ix <= 0x40e231d5) { // |x| ~<= 9*pi/4
+        if (ix <= 0x40afeddf) { // |x| ~<= 7*pi/4
+            return kernel.__tandf((if (sign) x + t3pio2 else x - t3pio2), true);
+        } else {
+            return kernel.__tandf((if (sign) x + t4pio2 else x - t4pio2), false);
+        }
    }

-    var sign = x < 0;
-    x = math.fabs(x);
-
-    var y = math.floor(x * m4pi);
-    var j = @floatToInt(I, y);
-
-    if (j & 1 == 1) {
-        j += 1;
-        y += 1;
+    // tan(Inf or NaN) is NaN: x - x evaluates to NaN for both kinds of input
+    if (ix >= 0x7f800000) {
+        return x - x;
    }

-    const z = ((x - y * pi4a) - y * pi4b) - y * pi4c;
-    const w = z * z;
+    var y: f64 = undefined; // written by __rem_pio2f through &y before it is read below
+    const n = __rem_pio2f(x, &y);
+    return kernel.__tandf(y, n & 1 != 0); // second argument is true when n is odd
+}

-    var r = if (w > 1e-14)
-        z + z * (w * ((Tp0 * w + Tp1) * w + Tp2) / ((((w + Tq1) * w + Tq2) * w + Tq3) * w + Tq4))
-    else
-        z;
+fn tan64(x: f64) f64 { // f64 tangent: small inputs go straight to kernel.__tan, the rest through __rem_pio2
+    var ix = @bitCast(u64, x) >> 32; // high 32 bits of x: sign, exponent and top mantissa bits
+    ix &= 0x7fffffff; // clear the sign bit so the comparisons below act on |x|

-    if (j & 2 == 2) {
-        r = -1 / r;
+    // |x| ~<= pi/4
+    if (ix <= 0x3fe921fb) {
+        if (ix < 0x3e400000) { // |x| < 2**-27
+            // raise inexact if x!=0 and underflow if subnormal; the value itself is unused
+            math.doNotOptimizeAway(if (ix < 0x00100000) x / 0x1p120 else x + 0x1p120);
+            return x;
+        }
+        return kernel.__tan(x, 0.0, false);
    }

-    return if (sign) -r else r;
+    // tan(Inf or NaN) is NaN: x - x evaluates to NaN for both kinds of input
+    if (ix >= 0x7ff00000) {
+        return x - x;
+    }
+
+    var y: [2]f64 = undefined; // written by __rem_pio2 through &y before it is read below
+    const n = __rem_pio2(x, &y);
+    return kernel.__tan(y[0], y[1], n & 1 != 0); // last argument is true when n is odd
 }

 test "math.tan" {
-    try expect(tan(@as(f32, 0.0)) == tan_(f32, 0.0));
-    try expect(tan(@as(f64, 0.0)) == tan_(f64, 0.0));
+    try expect(tan(@as(f32, 0.0)) == tan32(0.0)); // generic entry point agrees with tan32 for an f32 argument
+    try expect(tan(@as(f64, 0.0)) == tan64(0.0)); // generic entry point agrees with tan64 for an f64 argument
 }

 test "math.tan32" {
-    const epsilon = 0.000001;
+    const epsilon = 0.00001; // absolute tolerance handed to approxEqAbs in every case below

-    try expect(math.approxEqAbs(f32, tan_(f32, 0.0), 0.0, epsilon));
-    try expect(math.approxEqAbs(f32, tan_(f32, 0.2), 0.202710, epsilon));
-    try expect(math.approxEqAbs(f32, tan_(f32, 0.8923), 1.240422, epsilon));
-    try expect(math.approxEqAbs(f32, tan_(f32, 1.5), 14.101420, epsilon));
-    try expect(math.approxEqAbs(f32, tan_(f32, 37.45), -0.254397, epsilon));
-    try expect(math.approxEqAbs(f32, tan_(f32, 89.123), 2.285852, epsilon));
+    try expect(math.approxEqAbs(f32, tan32(0.0), 0.0, epsilon));
+    try expect(math.approxEqAbs(f32, tan32(0.2), 0.202710, epsilon));
+    try expect(math.approxEqAbs(f32, tan32(0.8923), 1.240422, epsilon));
+    try expect(math.approxEqAbs(f32, tan32(1.5), 14.101420, epsilon));
+    try expect(math.approxEqAbs(f32, tan32(37.45), -0.254397, epsilon));
+    try expect(math.approxEqAbs(f32, tan32(89.123), 2.285852, epsilon)); // NOTE(review): expectation differs from the f64 test, presumably because 89.123 rounds differently as f32 - confirm
 }

 test "math.tan64" {
    const epsilon = 0.000001;

-    try expect(math.approxEqAbs(f64, tan_(f64, 0.0), 0.0, epsilon));
-    try expect(math.approxEqAbs(f64, tan_(f64, 0.2), 0.202710, epsilon));
-    try expect(math.approxEqAbs(f64, tan_(f64, 0.8923), 1.240422, epsilon));
-    try expect(math.approxEqAbs(f64, tan_(f64, 1.5), 14.101420, epsilon));
-    try expect(math.approxEqAbs(f64, tan_(f64, 37.45), -0.254397, epsilon));
-    try expect(math.approxEqAbs(f64, tan_(f64, 89.123), 2.2858376, epsilon));
+    try expect(math.approxEqAbs(f64, tan64(0.0), 0.0, epsilon)); // each case: tan64(input) within epsilon (absolute) of the listed value
+    try expect(math.approxEqAbs(f64, tan64(0.2), 0.202710, epsilon));
+    try expect(math.approxEqAbs(f64, tan64(0.8923), 1.240422, epsilon));
+    try expect(math.approxEqAbs(f64, tan64(1.5), 14.101420, epsilon));
+    try expect(math.approxEqAbs(f64, tan64(37.45), -0.254397, epsilon));
+    try expect(math.approxEqAbs(f64, tan64(89.123), 2.2858376, epsilon));
 }

 test "math.tan32.special" {
-    try expect(tan_(f32, 0.0) == 0.0);
-    try expect(tan_(f32, -0.0) == -0.0);
-    try expect(math.isNan(tan_(f32, math.inf(f32))));
-    try expect(math.isNan(tan_(f32, -math.inf(f32))));
-    try expect(math.isNan(tan_(f32, math.nan(f32))));
+    try expect(@bitCast(u32, tan32(0.0)) == 0); // bitwise compare: result must be exactly +0.0
+    try expect(@bitCast(u32, tan32(@bitCast(f32, @as(u32, 1) << 31))) == (@as(u32, 1) << 31)); // -0.0 in, -0.0 out (`==` cannot tell +0.0 from -0.0)
+    try expect(math.isNan(tan32(math.inf(f32))));
+    try expect(math.isNan(tan32(-math.inf(f32))));
+    try expect(math.isNan(tan32(math.nan(f32))));
 }

 test "math.tan64.special" {
-    try expect(tan_(f64, 0.0) == 0.0);
-    try expect(tan_(f64, -0.0) == -0.0);
-    try expect(math.isNan(tan_(f64, math.inf(f64))));
-    try expect(math.isNan(tan_(f64, -math.inf(f64))));
-    try expect(math.isNan(tan_(f64, math.nan(f64))));
+    try expect(@bitCast(u64, tan64(0.0)) == 0); // bitwise compare: result must be exactly +0.0
+    try expect(@bitCast(u64, tan64(@bitCast(f64, @as(u64, 1) << 63))) == (@as(u64, 1) << 63)); // -0.0 in, -0.0 out (`==` cannot tell +0.0 from -0.0)
+    try expect(math.isNan(tan64(math.inf(f64))));
+    try expect(math.isNan(tan64(-math.inf(f64))));
+    try expect(math.isNan(tan64(math.nan(f64))));
 }