compiler-rt: simplify implementations

This improves readability as well as compatibility with stage2. Most of
compiler-rt is now enabled for stage2 with just a few functions disabled
(until stage2 passes more behavior tests).
Andrew Kelley 2022-01-01 19:49:34 -07:00
parent 9dc25dd0b6
commit 6b14c58f63
18 changed files with 1139 additions and 986 deletions

File diff suppressed because it is too large


@@ -2,31 +2,36 @@
// * @panic, if value cannot be represented
// - absvXi4_generic for unoptimized version
fn absvXi_generic(comptime ST: type) fn (a: ST) callconv(.C) ST {
return struct {
fn f(a: ST) callconv(.C) ST {
const UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
// taken from Bit Twiddling Hacks
// compute the integer absolute value (abs) without branching
var x: ST = a;
const N: UT = @bitSizeOf(ST);
const sign: ST = a >> N - 1;
x +%= sign;
x ^= sign;
if (x < 0)
@panic("compiler_rt absv: overflow");
return x;
}
}.f;
}
inline fn absvXi(comptime ST: type, a: ST) ST {
const UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
// taken from Bit Twiddling Hacks
// compute the integer absolute value (abs) without branching
var x: ST = a;
const N: UT = @bitSizeOf(ST);
const sign: ST = a >> N - 1;
x +%= sign;
x ^= sign;
if (x < 0)
@panic("compiler_rt absv: overflow");
return x;
}
pub fn __absvsi2(a: i32) callconv(.C) i32 {
return absvXi(i32, a);
}
pub fn __absvdi2(a: i64) callconv(.C) i64 {
return absvXi(i64, a);
}
pub fn __absvti2(a: i128) callconv(.C) i128 {
return absvXi(i128, a);
}
pub const __absvsi2 = absvXi_generic(i32);
pub const __absvdi2 = absvXi_generic(i64);
pub const __absvti2 = absvXi_generic(i128);
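// Illustrative sketch, not part of this commit: the Bit Twiddling Hacks
// branchless abs used by absvXi above, standalone. For negative a, `sign`
// is all ones, so (a +% sign) ^ sign computes ~(a - 1) == -a; for
// non-negative a both steps are no-ops. The only value the @panic can then
// catch is minInt, whose absolute value is not representable.
fn abs32Sketch(a: i32) i32 {
    const sign: i32 = a >> 31;
    return (a +% sign) ^ sign;
}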
test {
_ = @import("absvsi2_test.zig");


@@ -131,213 +131,303 @@ comptime {
// Specialized versions of the GCC atomic builtin functions.
// LLVM emits those iff the object size is known and the pointers are correctly
// aligned.
inline fn atomic_load_N(comptime T: type, src: *T, model: i32) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(src));
defer sl.release();
return src.*;
} else {
return @atomicLoad(T, src, .SeqCst);
}
}
fn atomicLoadFn(comptime T: type) fn (*T, i32) callconv(.C) T {
return struct {
fn atomic_load_N(src: *T, model: i32) callconv(.C) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(src));
defer sl.release();
return src.*;
} else {
return @atomicLoad(T, src, .SeqCst);
}
}
}.atomic_load_N;
}
fn __atomic_load_1(src: *u8, model: i32) callconv(.C) u8 {
return atomic_load_N(u8, src, model);
}
fn __atomic_load_2(src: *u16, model: i32) callconv(.C) u16 {
return atomic_load_N(u16, src, model);
}
fn __atomic_load_4(src: *u32, model: i32) callconv(.C) u32 {
return atomic_load_N(u32, src, model);
}
fn __atomic_load_8(src: *u64, model: i32) callconv(.C) u64 {
return atomic_load_N(u64, src, model);
}
inline fn atomic_store_N(comptime T: type, dst: *T, value: T, model: i32) void {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(dst));
defer sl.release();
dst.* = value;
} else {
@atomicStore(T, dst, value, .SeqCst);
}
}
fn __atomic_store_1(dst: *u8, value: u8, model: i32) callconv(.C) void {
return atomic_store_N(u8, dst, value, model);
}
fn __atomic_store_2(dst: *u16, value: u16, model: i32) callconv(.C) void {
return atomic_store_N(u16, dst, value, model);
}
fn __atomic_store_4(dst: *u32, value: u32, model: i32) callconv(.C) void {
return atomic_store_N(u32, dst, value, model);
}
fn __atomic_store_8(dst: *u64, value: u64, model: i32) callconv(.C) void {
return atomic_store_N(u64, dst, value, model);
}
inline fn atomic_exchange_N(comptime T: type, ptr: *T, val: T, model: i32) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
ptr.* = val;
return value;
} else {
return @atomicRmw(T, ptr, .Xchg, val, .SeqCst);
}
}
fn __atomic_exchange_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return atomic_exchange_N(u8, ptr, val, model);
}
fn __atomic_exchange_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return atomic_exchange_N(u16, ptr, val, model);
}
fn __atomic_exchange_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return atomic_exchange_N(u32, ptr, val, model);
}
fn __atomic_exchange_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return atomic_exchange_N(u64, ptr, val, model);
}
inline fn atomic_compare_exchange_N(
comptime T: type,
ptr: *T,
expected: *T,
desired: T,
success: i32,
failure: i32,
) i32 {
_ = success;
_ = failure;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
if (value == expected.*) {
ptr.* = desired;
return 1;
}
expected.* = value;
return 0;
} else {
if (@cmpxchgStrong(T, ptr, expected.*, desired, .SeqCst, .SeqCst)) |old_value| {
expected.* = old_value;
return 0;
}
return 1;
}
}
fn __atomic_compare_exchange_1(ptr: *u8, expected: *u8, desired: u8, success: i32, failure: i32) callconv(.C) i32 {
return atomic_compare_exchange_N(u8, ptr, expected, desired, success, failure);
}
fn __atomic_compare_exchange_2(ptr: *u16, expected: *u16, desired: u16, success: i32, failure: i32) callconv(.C) i32 {
return atomic_compare_exchange_N(u16, ptr, expected, desired, success, failure);
}
fn __atomic_compare_exchange_4(ptr: *u32, expected: *u32, desired: u32, success: i32, failure: i32) callconv(.C) i32 {
return atomic_compare_exchange_N(u32, ptr, expected, desired, success, failure);
}
fn __atomic_compare_exchange_8(ptr: *u64, expected: *u64, desired: u64, success: i32, failure: i32) callconv(.C) i32 {
return atomic_compare_exchange_N(u64, ptr, expected, desired, success, failure);
}
inline fn fetch_op_N(comptime T: type, comptime op: std.builtin.AtomicRmwOp, ptr: *T, val: T, model: i32) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
ptr.* = switch (op) {
.Add => value +% val,
.Sub => value -% val,
.And => value & val,
.Nand => ~(value & val),
.Or => value | val,
.Xor => value ^ val,
else => @compileError("unsupported atomic op"),
};
return value;
}
return @atomicRmw(T, ptr, op, val, .SeqCst);
}
fn __atomic_fetch_add_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return fetch_op_N(u8, .Add, ptr, val, model);
}
fn __atomic_fetch_add_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return fetch_op_N(u16, .Add, ptr, val, model);
}
fn __atomic_fetch_add_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return fetch_op_N(u32, .Add, ptr, val, model);
}
fn __atomic_fetch_add_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return fetch_op_N(u64, .Add, ptr, val, model);
}
fn __atomic_fetch_sub_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return fetch_op_N(u8, .Sub, ptr, val, model);
}
fn __atomic_fetch_sub_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return fetch_op_N(u16, .Sub, ptr, val, model);
}
fn __atomic_fetch_sub_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return fetch_op_N(u32, .Sub, ptr, val, model);
}
fn __atomic_fetch_sub_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return fetch_op_N(u64, .Sub, ptr, val, model);
}
fn __atomic_fetch_and_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return fetch_op_N(u8, .And, ptr, val, model);
}
fn __atomic_fetch_and_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return fetch_op_N(u16, .And, ptr, val, model);
}
fn __atomic_fetch_and_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return fetch_op_N(u32, .And, ptr, val, model);
}
fn __atomic_fetch_and_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return fetch_op_N(u64, .And, ptr, val, model);
}
fn __atomic_fetch_or_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return fetch_op_N(u8, .Or, ptr, val, model);
}
fn __atomic_fetch_or_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return fetch_op_N(u16, .Or, ptr, val, model);
}
fn __atomic_fetch_or_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return fetch_op_N(u32, .Or, ptr, val, model);
}
fn __atomic_fetch_or_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return fetch_op_N(u64, .Or, ptr, val, model);
}
fn __atomic_fetch_xor_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return fetch_op_N(u8, .Xor, ptr, val, model);
}
fn __atomic_fetch_xor_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return fetch_op_N(u16, .Xor, ptr, val, model);
}
fn __atomic_fetch_xor_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return fetch_op_N(u32, .Xor, ptr, val, model);
}
fn __atomic_fetch_xor_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return fetch_op_N(u64, .Xor, ptr, val, model);
}
fn __atomic_fetch_nand_1(ptr: *u8, val: u8, model: i32) callconv(.C) u8 {
return fetch_op_N(u8, .Nand, ptr, val, model);
}
fn __atomic_fetch_nand_2(ptr: *u16, val: u16, model: i32) callconv(.C) u16 {
return fetch_op_N(u16, .Nand, ptr, val, model);
}
fn __atomic_fetch_nand_4(ptr: *u32, val: u32, model: i32) callconv(.C) u32 {
return fetch_op_N(u32, .Nand, ptr, val, model);
}
fn __atomic_fetch_nand_8(ptr: *u64, val: u64, model: i32) callconv(.C) u64 {
return fetch_op_N(u64, .Nand, ptr, val, model);
}
comptime {
if (supports_atomic_ops) {
const atomicLoad_u8 = atomicLoadFn(u8);
const atomicLoad_u16 = atomicLoadFn(u16);
const atomicLoad_u32 = atomicLoadFn(u32);
const atomicLoad_u64 = atomicLoadFn(u64);
@export(atomicLoad_u8, .{ .name = "__atomic_load_1", .linkage = linkage });
@export(atomicLoad_u16, .{ .name = "__atomic_load_2", .linkage = linkage });
@export(atomicLoad_u32, .{ .name = "__atomic_load_4", .linkage = linkage });
@export(atomicLoad_u64, .{ .name = "__atomic_load_8", .linkage = linkage });
}
}
fn atomicStoreFn(comptime T: type) fn (*T, T, i32) callconv(.C) void {
return struct {
fn atomic_store_N(dst: *T, value: T, model: i32) callconv(.C) void {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(dst));
defer sl.release();
dst.* = value;
} else {
@atomicStore(T, dst, value, .SeqCst);
}
}
}.atomic_store_N;
}
comptime {
if (supports_atomic_ops) {
const atomicStore_u8 = atomicStoreFn(u8);
const atomicStore_u16 = atomicStoreFn(u16);
const atomicStore_u32 = atomicStoreFn(u32);
const atomicStore_u64 = atomicStoreFn(u64);
@export(atomicStore_u8, .{ .name = "__atomic_store_1", .linkage = linkage });
@export(atomicStore_u16, .{ .name = "__atomic_store_2", .linkage = linkage });
@export(atomicStore_u32, .{ .name = "__atomic_store_4", .linkage = linkage });
@export(atomicStore_u64, .{ .name = "__atomic_store_8", .linkage = linkage });
}
}
fn atomicExchangeFn(comptime T: type) fn (*T, T, i32) callconv(.C) T {
return struct {
fn atomic_exchange_N(ptr: *T, val: T, model: i32) callconv(.C) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
ptr.* = val;
return value;
} else {
return @atomicRmw(T, ptr, .Xchg, val, .SeqCst);
}
}
}.atomic_exchange_N;
}
comptime {
if (supports_atomic_ops) {
const atomicExchange_u8 = atomicExchangeFn(u8);
const atomicExchange_u16 = atomicExchangeFn(u16);
const atomicExchange_u32 = atomicExchangeFn(u32);
const atomicExchange_u64 = atomicExchangeFn(u64);
@export(atomicExchange_u8, .{ .name = "__atomic_exchange_1", .linkage = linkage });
@export(atomicExchange_u16, .{ .name = "__atomic_exchange_2", .linkage = linkage });
@export(atomicExchange_u32, .{ .name = "__atomic_exchange_4", .linkage = linkage });
@export(atomicExchange_u64, .{ .name = "__atomic_exchange_8", .linkage = linkage });
}
}
fn atomicCompareExchangeFn(comptime T: type) fn (*T, *T, T, i32, i32) callconv(.C) i32 {
return struct {
fn atomic_compare_exchange_N(ptr: *T, expected: *T, desired: T, success: i32, failure: i32) callconv(.C) i32 {
_ = success;
_ = failure;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
if (value == expected.*) {
ptr.* = desired;
return 1;
}
expected.* = value;
return 0;
} else {
if (@cmpxchgStrong(T, ptr, expected.*, desired, .SeqCst, .SeqCst)) |old_value| {
expected.* = old_value;
return 0;
}
return 1;
}
}
}.atomic_compare_exchange_N;
}
comptime {
if (supports_atomic_ops) {
const atomicCompareExchange_u8 = atomicCompareExchangeFn(u8);
const atomicCompareExchange_u16 = atomicCompareExchangeFn(u16);
const atomicCompareExchange_u32 = atomicCompareExchangeFn(u32);
const atomicCompareExchange_u64 = atomicCompareExchangeFn(u64);
@export(atomicCompareExchange_u8, .{ .name = "__atomic_compare_exchange_1", .linkage = linkage });
@export(atomicCompareExchange_u16, .{ .name = "__atomic_compare_exchange_2", .linkage = linkage });
@export(atomicCompareExchange_u32, .{ .name = "__atomic_compare_exchange_4", .linkage = linkage });
@export(atomicCompareExchange_u64, .{ .name = "__atomic_compare_exchange_8", .linkage = linkage });
}
}
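// Illustrative usage, not part of this commit: the GCC builtin contract that
// atomicCompareExchangeFn implements is "return 1 and store `desired` on
// success; return 0 and write the observed value back into `expected` on
// failure". The success/failure memory-model arguments are accepted but
// ignored (everything is SeqCst).
test "compare-exchange contract (sketch)" {
    const cmpxchg4 = atomicCompareExchangeFn(u32);
    var v: u32 = 5;
    var expected: u32 = 5;
    try std.testing.expectEqual(@as(i32, 1), cmpxchg4(&v, &expected, 6, 0, 0));
    try std.testing.expectEqual(@as(i32, 0), cmpxchg4(&v, &expected, 7, 0, 0));
    try std.testing.expectEqual(@as(u32, 6), expected);
}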
fn fetchFn(comptime T: type, comptime op: std.builtin.AtomicRmwOp) fn (*T, T, i32) callconv(.C) T {
return struct {
pub fn fetch_op_N(ptr: *T, val: T, model: i32) callconv(.C) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
ptr.* = switch (op) {
.Add => value +% val,
.Sub => value -% val,
.And => value & val,
.Nand => ~(value & val),
.Or => value | val,
.Xor => value ^ val,
else => @compileError("unsupported atomic op"),
};
return value;
}
return @atomicRmw(T, ptr, op, val, .SeqCst);
}
}.fetch_op_N;
}
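// Illustrative usage, not part of this commit: like the GCC builtins, each
// generated fetch function returns the value the object held *before* the
// operation was applied.
test "fetch ops return the previous value (sketch)" {
    const fetchAdd4 = fetchFn(u32, .Add);
    var v: u32 = 40;
    try std.testing.expectEqual(@as(u32, 40), fetchAdd4(&v, 2, 0));
    try std.testing.expectEqual(@as(u32, 42), v);
}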
comptime {
if (supports_atomic_ops) {
const fetch_add_u8 = fetchFn(u8, .Add);
const fetch_add_u16 = fetchFn(u16, .Add);
const fetch_add_u32 = fetchFn(u32, .Add);
const fetch_add_u64 = fetchFn(u64, .Add);
@export(fetch_add_u8, .{ .name = "__atomic_fetch_add_1", .linkage = linkage });
@export(fetch_add_u16, .{ .name = "__atomic_fetch_add_2", .linkage = linkage });
@export(fetch_add_u32, .{ .name = "__atomic_fetch_add_4", .linkage = linkage });
@export(fetch_add_u64, .{ .name = "__atomic_fetch_add_8", .linkage = linkage });
const fetch_sub_u8 = fetchFn(u8, .Sub);
const fetch_sub_u16 = fetchFn(u16, .Sub);
const fetch_sub_u32 = fetchFn(u32, .Sub);
const fetch_sub_u64 = fetchFn(u64, .Sub);
@export(fetch_sub_u8, .{ .name = "__atomic_fetch_sub_1", .linkage = linkage });
@export(fetch_sub_u16, .{ .name = "__atomic_fetch_sub_2", .linkage = linkage });
@export(fetch_sub_u32, .{ .name = "__atomic_fetch_sub_4", .linkage = linkage });
@export(fetch_sub_u64, .{ .name = "__atomic_fetch_sub_8", .linkage = linkage });
const fetch_and_u8 = fetchFn(u8, .And);
const fetch_and_u16 = fetchFn(u16, .And);
const fetch_and_u32 = fetchFn(u32, .And);
const fetch_and_u64 = fetchFn(u64, .And);
@export(fetch_and_u8, .{ .name = "__atomic_fetch_and_1", .linkage = linkage });
@export(fetch_and_u16, .{ .name = "__atomic_fetch_and_2", .linkage = linkage });
@export(fetch_and_u32, .{ .name = "__atomic_fetch_and_4", .linkage = linkage });
@export(fetch_and_u64, .{ .name = "__atomic_fetch_and_8", .linkage = linkage });
const fetch_or_u8 = fetchFn(u8, .Or);
const fetch_or_u16 = fetchFn(u16, .Or);
const fetch_or_u32 = fetchFn(u32, .Or);
const fetch_or_u64 = fetchFn(u64, .Or);
@export(fetch_or_u8, .{ .name = "__atomic_fetch_or_1", .linkage = linkage });
@export(fetch_or_u16, .{ .name = "__atomic_fetch_or_2", .linkage = linkage });
@export(fetch_or_u32, .{ .name = "__atomic_fetch_or_4", .linkage = linkage });
@export(fetch_or_u64, .{ .name = "__atomic_fetch_or_8", .linkage = linkage });
const fetch_xor_u8 = fetchFn(u8, .Xor);
const fetch_xor_u16 = fetchFn(u16, .Xor);
const fetch_xor_u32 = fetchFn(u32, .Xor);
const fetch_xor_u64 = fetchFn(u64, .Xor);
@export(fetch_xor_u8, .{ .name = "__atomic_fetch_xor_1", .linkage = linkage });
@export(fetch_xor_u16, .{ .name = "__atomic_fetch_xor_2", .linkage = linkage });
@export(fetch_xor_u32, .{ .name = "__atomic_fetch_xor_4", .linkage = linkage });
@export(fetch_xor_u64, .{ .name = "__atomic_fetch_xor_8", .linkage = linkage });
const fetch_nand_u8 = fetchFn(u8, .Nand);
const fetch_nand_u16 = fetchFn(u16, .Nand);
const fetch_nand_u32 = fetchFn(u32, .Nand);
const fetch_nand_u64 = fetchFn(u64, .Nand);
@export(fetch_nand_u8, .{ .name = "__atomic_fetch_nand_1", .linkage = linkage });
@export(fetch_nand_u16, .{ .name = "__atomic_fetch_nand_2", .linkage = linkage });
@export(fetch_nand_u32, .{ .name = "__atomic_fetch_nand_4", .linkage = linkage });
@export(fetch_nand_u64, .{ .name = "__atomic_fetch_nand_8", .linkage = linkage });
@export(__atomic_fetch_add_1, .{ .name = "__atomic_fetch_add_1", .linkage = linkage });
@export(__atomic_fetch_add_2, .{ .name = "__atomic_fetch_add_2", .linkage = linkage });
@export(__atomic_fetch_add_4, .{ .name = "__atomic_fetch_add_4", .linkage = linkage });
@export(__atomic_fetch_add_8, .{ .name = "__atomic_fetch_add_8", .linkage = linkage });
@export(__atomic_fetch_sub_1, .{ .name = "__atomic_fetch_sub_1", .linkage = linkage });
@export(__atomic_fetch_sub_2, .{ .name = "__atomic_fetch_sub_2", .linkage = linkage });
@export(__atomic_fetch_sub_4, .{ .name = "__atomic_fetch_sub_4", .linkage = linkage });
@export(__atomic_fetch_sub_8, .{ .name = "__atomic_fetch_sub_8", .linkage = linkage });
@export(__atomic_fetch_and_1, .{ .name = "__atomic_fetch_and_1", .linkage = linkage });
@export(__atomic_fetch_and_2, .{ .name = "__atomic_fetch_and_2", .linkage = linkage });
@export(__atomic_fetch_and_4, .{ .name = "__atomic_fetch_and_4", .linkage = linkage });
@export(__atomic_fetch_and_8, .{ .name = "__atomic_fetch_and_8", .linkage = linkage });
@export(__atomic_fetch_or_1, .{ .name = "__atomic_fetch_or_1", .linkage = linkage });
@export(__atomic_fetch_or_2, .{ .name = "__atomic_fetch_or_2", .linkage = linkage });
@export(__atomic_fetch_or_4, .{ .name = "__atomic_fetch_or_4", .linkage = linkage });
@export(__atomic_fetch_or_8, .{ .name = "__atomic_fetch_or_8", .linkage = linkage });
@export(__atomic_fetch_xor_1, .{ .name = "__atomic_fetch_xor_1", .linkage = linkage });
@export(__atomic_fetch_xor_2, .{ .name = "__atomic_fetch_xor_2", .linkage = linkage });
@export(__atomic_fetch_xor_4, .{ .name = "__atomic_fetch_xor_4", .linkage = linkage });
@export(__atomic_fetch_xor_8, .{ .name = "__atomic_fetch_xor_8", .linkage = linkage });
@export(__atomic_fetch_nand_1, .{ .name = "__atomic_fetch_nand_1", .linkage = linkage });
@export(__atomic_fetch_nand_2, .{ .name = "__atomic_fetch_nand_2", .linkage = linkage });
@export(__atomic_fetch_nand_4, .{ .name = "__atomic_fetch_nand_4", .linkage = linkage });
@export(__atomic_fetch_nand_8, .{ .name = "__atomic_fetch_nand_8", .linkage = linkage });
@export(__atomic_load_1, .{ .name = "__atomic_load_1", .linkage = linkage });
@export(__atomic_load_2, .{ .name = "__atomic_load_2", .linkage = linkage });
@export(__atomic_load_4, .{ .name = "__atomic_load_4", .linkage = linkage });
@export(__atomic_load_8, .{ .name = "__atomic_load_8", .linkage = linkage });
@export(__atomic_store_1, .{ .name = "__atomic_store_1", .linkage = linkage });
@export(__atomic_store_2, .{ .name = "__atomic_store_2", .linkage = linkage });
@export(__atomic_store_4, .{ .name = "__atomic_store_4", .linkage = linkage });
@export(__atomic_store_8, .{ .name = "__atomic_store_8", .linkage = linkage });
@export(__atomic_exchange_1, .{ .name = "__atomic_exchange_1", .linkage = linkage });
@export(__atomic_exchange_2, .{ .name = "__atomic_exchange_2", .linkage = linkage });
@export(__atomic_exchange_4, .{ .name = "__atomic_exchange_4", .linkage = linkage });
@export(__atomic_exchange_8, .{ .name = "__atomic_exchange_8", .linkage = linkage });
@export(__atomic_compare_exchange_1, .{ .name = "__atomic_compare_exchange_1", .linkage = linkage });
@export(__atomic_compare_exchange_2, .{ .name = "__atomic_compare_exchange_2", .linkage = linkage });
@export(__atomic_compare_exchange_4, .{ .name = "__atomic_compare_exchange_4", .linkage = linkage });
@export(__atomic_compare_exchange_8, .{ .name = "__atomic_compare_exchange_8", .linkage = linkage });
}
}
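// Illustrative sketch, not the actual `spinlocks` helper defined elsewhere in
// this file: objects wider than `largest_atomic_size` fall back to a small
// fixed table of spinlocks indexed by a hash of the object address, so
// unrelated objects usually contend on different locks. All names below are
// made up for illustration.
const SketchSpinlock = struct {
    v: u8 = 0, // 0 = unlocked, 1 = locked

    fn acquire(self: *SketchSpinlock) void {
        // Spin until the exchange observes the unlocked state.
        while (@atomicRmw(u8, &self.v, .Xchg, 1, .Acquire) != 0) {}
    }

    fn release(self: *SketchSpinlock) void {
        @atomicStore(u8, &self.v, 0, .Release);
    }
};

var sketch_locks = [_]SketchSpinlock{.{}} ** 16;

fn sketchGet(addr: usize) *SketchSpinlock {
    // Fold away the low bits, which are usually zero due to alignment.
    const lock = &sketch_locks[(addr >> 4) % sketch_locks.len];
    lock.acquire();
    return lock;
}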


@@ -2,7 +2,7 @@ const std = @import("std");
const builtin = @import("builtin");
// bswap - byteswap
// - bswapXi2_generic for unoptimized big and little endian
// - bswapXi2 for unoptimized big and little endian
// ie for u32
// DE AD BE EF <- little|big endian
// EF BE AD DE <- big|little endian
@@ -11,64 +11,64 @@ const builtin = @import("builtin");
// 00 00 ff 00 << 1*8 (2nd right byte)
// 00 00 00 ff << 3*8 (rightmost byte)
fn bswapXi2_generic(comptime T: type) fn (a: T) callconv(.C) T {
return struct {
fn f(a: T) callconv(.C) T {
@setRuntimeSafety(builtin.is_test);
switch (@bitSizeOf(T)) {
32 => {
// zig fmt: off
return (((a & 0xff000000) >> 24)
| ((a & 0x00ff0000) >> 8 )
| ((a & 0x0000ff00) << 8 )
| ((a & 0x000000ff) << 24));
// zig fmt: on
},
64 => {
// zig fmt: off
return (((a & 0xff00000000000000) >> 56)
| ((a & 0x00ff000000000000) >> 40 )
| ((a & 0x0000ff0000000000) >> 24 )
| ((a & 0x000000ff00000000) >> 8 )
| ((a & 0x00000000ff000000) << 8 )
| ((a & 0x0000000000ff0000) << 24 )
| ((a & 0x000000000000ff00) << 40 )
| ((a & 0x00000000000000ff) << 56));
// zig fmt: on
},
128 => {
// zig fmt: off
return (((a & 0xff000000000000000000000000000000) >> 120)
| ((a & 0x00ff0000000000000000000000000000) >> 104)
| ((a & 0x0000ff00000000000000000000000000) >> 88 )
| ((a & 0x000000ff000000000000000000000000) >> 72 )
| ((a & 0x00000000ff0000000000000000000000) >> 56 )
| ((a & 0x0000000000ff00000000000000000000) >> 40 )
| ((a & 0x000000000000ff000000000000000000) >> 24 )
| ((a & 0x00000000000000ff0000000000000000) >> 8 )
| ((a & 0x0000000000000000ff00000000000000) << 8 )
| ((a & 0x000000000000000000ff000000000000) << 24 )
| ((a & 0x00000000000000000000ff0000000000) << 40 )
| ((a & 0x0000000000000000000000ff00000000) << 56 )
| ((a & 0x000000000000000000000000ff000000) << 72 )
| ((a & 0x00000000000000000000000000ff0000) << 88 )
| ((a & 0x0000000000000000000000000000ff00) << 104)
| ((a & 0x000000000000000000000000000000ff) << 120));
// zig fmt: on
},
else => {
unreachable;
},
}
}
}.f;
}
inline fn bswapXi2(comptime T: type, a: T) T {
@setRuntimeSafety(builtin.is_test);
switch (@bitSizeOf(T)) {
32 => {
// zig fmt: off
return (((a & 0xff000000) >> 24)
| ((a & 0x00ff0000) >> 8 )
| ((a & 0x0000ff00) << 8 )
| ((a & 0x000000ff) << 24));
// zig fmt: on
},
64 => {
// zig fmt: off
return (((a & 0xff00000000000000) >> 56)
| ((a & 0x00ff000000000000) >> 40 )
| ((a & 0x0000ff0000000000) >> 24 )
| ((a & 0x000000ff00000000) >> 8 )
| ((a & 0x00000000ff000000) << 8 )
| ((a & 0x0000000000ff0000) << 24 )
| ((a & 0x000000000000ff00) << 40 )
| ((a & 0x00000000000000ff) << 56));
// zig fmt: on
},
128 => {
// zig fmt: off
return (((a & 0xff000000000000000000000000000000) >> 120)
| ((a & 0x00ff0000000000000000000000000000) >> 104)
| ((a & 0x0000ff00000000000000000000000000) >> 88 )
| ((a & 0x000000ff000000000000000000000000) >> 72 )
| ((a & 0x00000000ff0000000000000000000000) >> 56 )
| ((a & 0x0000000000ff00000000000000000000) >> 40 )
| ((a & 0x000000000000ff000000000000000000) >> 24 )
| ((a & 0x00000000000000ff0000000000000000) >> 8 )
| ((a & 0x0000000000000000ff00000000000000) << 8 )
| ((a & 0x000000000000000000ff000000000000) << 24 )
| ((a & 0x00000000000000000000ff0000000000) << 40 )
| ((a & 0x0000000000000000000000ff00000000) << 56 )
| ((a & 0x000000000000000000000000ff000000) << 72 )
| ((a & 0x00000000000000000000000000ff0000) << 88 )
| ((a & 0x0000000000000000000000000000ff00) << 104)
| ((a & 0x000000000000000000000000000000ff) << 120));
// zig fmt: on
},
else => unreachable,
}
}
pub const __bswapsi2 = bswapXi2_generic(u32);
pub fn __bswapsi2(a: u32) callconv(.C) u32 {
return bswapXi2(u32, a);
}
pub const __bswapdi2 = bswapXi2_generic(u64);
pub fn __bswapdi2(a: u64) callconv(.C) u64 {
return bswapXi2(u64, a);
}
pub const __bswapti2 = bswapXi2_generic(u128);
pub fn __bswapti2(a: u128) callconv(.C) u128 {
return bswapXi2(u128, a);
}
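// Illustrative check, not part of this commit: the worked example from the
// comment at the top of this file.
test "byte swap matches the worked example" {
    try std.testing.expectEqual(@as(u32, 0xEFBEADDE), __bswapsi2(0xDEADBEEF));
}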
test {
_ = @import("bswapsi2_test.zig");


@@ -11,28 +11,40 @@ const builtin = @import("builtin");
// a == b => 1
// a > b => 2
fn XcmpXi2_generic(comptime T: type) fn (a: T, b: T) callconv(.C) i32 {
return struct {
fn f(a: T, b: T) callconv(.C) i32 {
@setRuntimeSafety(builtin.is_test);
var cmp1: i32 = 0;
var cmp2: i32 = 0;
if (a > b)
cmp1 = 1;
if (a < b)
cmp2 = 1;
return cmp1 - cmp2 + 1;
}
}.f;
}
inline fn XcmpXi2(comptime T: type, a: T, b: T) i32 {
@setRuntimeSafety(builtin.is_test);
var cmp1: i32 = 0;
var cmp2: i32 = 0;
if (a > b)
cmp1 = 1;
if (a < b)
cmp2 = 1;
return cmp1 - cmp2 + 1;
}
pub const __cmpsi2 = XcmpXi2_generic(i32);
pub const __cmpdi2 = XcmpXi2_generic(i64);
pub const __cmpti2 = XcmpXi2_generic(i128);
pub fn __cmpsi2(a: i32, b: i32) callconv(.C) i32 {
return XcmpXi2(i32, a, b);
}
pub const __ucmpsi2 = XcmpXi2_generic(u32);
pub const __ucmpdi2 = XcmpXi2_generic(u64);
pub const __ucmpti2 = XcmpXi2_generic(u128);
pub fn __cmpdi2(a: i64, b: i64) callconv(.C) i32 {
return XcmpXi2(i64, a, b);
}
pub fn __cmpti2(a: i128, b: i128) callconv(.C) i32 {
return XcmpXi2(i128, a, b);
}
pub fn __ucmpsi2(a: u32, b: u32) callconv(.C) i32 {
return XcmpXi2(u32, a, b);
}
pub fn __ucmpdi2(a: u64, b: u64) callconv(.C) i32 {
return XcmpXi2(u64, a, b);
}
pub fn __ucmpti2(a: u128, b: u128) callconv(.C) i32 {
return XcmpXi2(u128, a, b);
}
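// Illustrative check, not part of this commit: the mapping documented above
// (a < b => 0, a == b => 1, a > b => 2).
test "three-way compare mapping" {
    const testing = @import("std").testing;
    try testing.expectEqual(@as(i32, 0), __cmpsi2(-2, 7));
    try testing.expectEqual(@as(i32, 1), __cmpsi2(7, 7));
    try testing.expectEqual(@as(i32, 2), __cmpsi2(7, -2));
}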
test {
_ = @import("cmpsi2_test.zig");


@@ -2,44 +2,40 @@ const std = @import("std");
const builtin = @import("builtin");
// clz - count leading zeroes
// - clzXi2_generic for unoptimized little and big endian
// - clzXi2 for unoptimized little and big endian
// - __clzsi2_thumb1: assume a != 0
// - __clzsi2_arm32: assume a != 0
// ctz - count trailing zeroes
// - ctzXi2_generic for unoptimized little and big endian
// - ctzXi2 for unoptimized little and big endian
// ffs - find first set
// * ffs = (a == 0) => 0, (a != 0) => ctz + 1
// * don't pay for `if (x == 0) return shift;` inside ctz
// - ffsXi2_generic for unoptimized little and big endian
// - ffsXi2 for unoptimized little and big endian
fn clzXi2_generic(comptime T: type) fn (a: T) callconv(.C) i32 {
return struct {
fn f(a: T) callconv(.C) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
var n: T = @bitSizeOf(T);
// Count first bit set using binary search, from Hacker's Delight
var y: @TypeOf(x) = 0;
comptime var shift: u8 = @bitSizeOf(T);
inline while (shift > 0) {
shift = shift >> 1;
y = x >> shift;
if (y != 0) {
n = n - shift;
x = y;
}
}
return @intCast(i32, n - @bitCast(T, x));
}
}.f;
}
inline fn clzXi2(comptime T: type, a: T) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
var n: T = @bitSizeOf(T);
// Count first bit set using binary search, from Hacker's Delight
var y: @TypeOf(x) = 0;
comptime var shift: u8 = @bitSizeOf(T);
inline while (shift > 0) {
shift = shift >> 1;
y = x >> shift;
if (y != 0) {
n = n - shift;
x = y;
}
}
return @intCast(i32, n - @bitCast(T, x));
}
fn __clzsi2_thumb1() callconv(.Naked) void {
@@ -125,103 +121,113 @@ fn __clzsi2_arm32() callconv(.Naked) void {
unreachable;
}
pub const __clzsi2 = impl: {
switch (builtin.cpu.arch) {
.arm, .armeb, .thumb, .thumbeb => {
const use_thumb1 =
(builtin.cpu.arch.isThumb() or
std.Target.arm.featureSetHas(builtin.cpu.features, .noarm)) and
!std.Target.arm.featureSetHas(builtin.cpu.features, .thumb2);
if (use_thumb1) {
break :impl __clzsi2_thumb1;
}
// From here on we're either targeting Thumb2 or ARM.
else if (!builtin.cpu.arch.isThumb()) {
break :impl __clzsi2_arm32;
}
// Use the generic implementation otherwise.
else break :impl clzXi2_generic(i32);
},
else => break :impl clzXi2_generic(i32),
}
};
fn clzsi2_generic(a: i32) callconv(.C) i32 {
return clzXi2(i32, a);
}
pub const __clzsi2 = switch (builtin.cpu.arch) {
.arm, .armeb, .thumb, .thumbeb => impl: {
const use_thumb1 =
(builtin.cpu.arch.isThumb() or
std.Target.arm.featureSetHas(builtin.cpu.features, .noarm)) and
!std.Target.arm.featureSetHas(builtin.cpu.features, .thumb2);
if (use_thumb1) {
break :impl __clzsi2_thumb1;
}
// From here on we're either targeting Thumb2 or ARM.
else if (!builtin.cpu.arch.isThumb()) {
break :impl __clzsi2_arm32;
}
// Use the generic implementation otherwise.
else break :impl clzsi2_generic;
},
else => clzsi2_generic,
};
pub const __clzdi2 = clzXi2_generic(i64);
pub const __clzti2 = clzXi2_generic(i128);
fn ctzXi2_generic(comptime T: type) fn (a: T) callconv(.C) i32 {
return struct {
fn f(a: T) callconv(.C) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
var n: T = 1;
// Number of trailing zeroes as binary search, from Hacker's Delight
var mask: @TypeOf(x) = std.math.maxInt(@TypeOf(x));
comptime var shift = @bitSizeOf(T);
if (x == 0) return shift;
inline while (shift > 1) {
shift = shift >> 1;
mask = mask >> shift;
if ((x & mask) == 0) {
n = n + shift;
x = x >> shift;
}
}
return @intCast(i32, n - @bitCast(T, (x & 1)));
}
}.f;
}
pub const __ctzsi2 = ctzXi2_generic(i32);
pub const __ctzdi2 = ctzXi2_generic(i64);
pub const __ctzti2 = ctzXi2_generic(i128);
fn ffsXi2_generic(comptime T: type) fn (a: T) callconv(.C) i32 {
return struct {
fn f(a: T) callconv(.C) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
var n: T = 1;
// adapted from Number of trailing zeroes (see ctzXi2_generic)
var mask: @TypeOf(x) = std.math.maxInt(@TypeOf(x));
comptime var shift = @bitSizeOf(T);
// In contrast to ctz return 0
if (x == 0) return 0;
inline while (shift > 1) {
shift = shift >> 1;
mask = mask >> shift;
if ((x & mask) == 0) {
n = n + shift;
x = x >> shift;
}
}
// return ctz + 1
return @intCast(i32, n - @bitCast(T, (x & 1))) + @as(i32, 1);
}
}.f;
}
pub const __ffssi2 = ffsXi2_generic(i32);
pub const __ffsdi2 = ffsXi2_generic(i64);
pub const __ffsti2 = ffsXi2_generic(i128);
pub fn __clzdi2(a: i64) callconv(.C) i32 {
return clzXi2(i64, a);
}
pub fn __clzti2(a: i128) callconv(.C) i32 {
return clzXi2(i128, a);
}
inline fn ctzXi2(comptime T: type, a: T) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
var n: T = 1;
// Number of trailing zeroes as binary search, from Hacker's Delight
var mask: @TypeOf(x) = std.math.maxInt(@TypeOf(x));
comptime var shift = @bitSizeOf(T);
if (x == 0) return shift;
inline while (shift > 1) {
shift = shift >> 1;
mask = mask >> shift;
if ((x & mask) == 0) {
n = n + shift;
x = x >> shift;
}
}
return @intCast(i32, n - @bitCast(T, (x & 1)));
}
pub fn __ctzsi2(a: i32) callconv(.C) i32 {
return ctzXi2(i32, a);
}
pub fn __ctzdi2(a: i64) callconv(.C) i32 {
return ctzXi2(i64, a);
}
pub fn __ctzti2(a: i128) callconv(.C) i32 {
return ctzXi2(i128, a);
}
inline fn ffsXi2(comptime T: type, a: T) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
var n: T = 1;
// adapted from Number of trailing zeroes (see ctzXi2)
var mask: @TypeOf(x) = std.math.maxInt(@TypeOf(x));
comptime var shift = @bitSizeOf(T);
// In contrast to ctz return 0
if (x == 0) return 0;
inline while (shift > 1) {
shift = shift >> 1;
mask = mask >> shift;
if ((x & mask) == 0) {
n = n + shift;
x = x >> shift;
}
}
// return ctz + 1
return @intCast(i32, n - @bitCast(T, (x & 1))) + @as(i32, 1);
}
pub fn __ffssi2(a: i32) callconv(.C) i32 {
return ffsXi2(i32, a);
}
pub fn __ffsdi2(a: i64) callconv(.C) i32 {
return ffsXi2(i64, a);
}
pub fn __ffsti2(a: i128) callconv(.C) i32 {
return ffsXi2(i128, a);
}
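// Illustrative check, not part of this commit, for targets that use the
// generic implementations: a lone bit at position 16 in a 32-bit word has
// 15 leading zeroes, 16 trailing zeroes, and ffs = ctz + 1.
test "clz/ctz/ffs for a single set bit (sketch)" {
    try std.testing.expectEqual(@as(i32, 15), __clzsi2(0x00010000));
    try std.testing.expectEqual(@as(i32, 16), __ctzsi2(0x00010000));
    try std.testing.expectEqual(@as(i32, 17), __ffssi2(0x00010000));
}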
test {
_ = @import("clzsi2_test.zig");


@@ -1,7 +1,7 @@
const is_test = @import("builtin").is_test;
const Log2Int = @import("std").math.Log2Int;
pub fn fixuint(comptime fp_t: type, comptime fixuint_t: type, a: fp_t) fixuint_t {
pub inline fn fixuint(comptime fp_t: type, comptime fixuint_t: type, a: fp_t) fixuint_t {
@setRuntimeSafety(is_test);
const rep_t = switch (fp_t) {


@@ -4,7 +4,7 @@ const maxInt = std.math.maxInt;
const FLT_MANT_DIG = 24;
fn __floatXisf(comptime T: type, arg: T) f32 {
inline fn floatXisf(comptime T: type, arg: T) f32 {
@setRuntimeSafety(builtin.is_test);
const bits = @typeInfo(T).Int.bits;
@@ -71,18 +71,15 @@ fn __floatXisf(comptime T: type, arg: T) f32 {
}
pub fn __floatdisf(arg: i64) callconv(.C) f32 {
@setRuntimeSafety(builtin.is_test);
return @call(.{ .modifier = .always_inline }, __floatXisf, .{ i64, arg });
return floatXisf(i64, arg);
}
pub fn __floattisf(arg: i128) callconv(.C) f32 {
@setRuntimeSafety(builtin.is_test);
return @call(.{ .modifier = .always_inline }, __floatXisf, .{ i128, arg });
return floatXisf(i128, arg);
}
pub fn __aeabi_l2f(arg: i64) callconv(.AAPCS) f32 {
@setRuntimeSafety(false);
return @call(.{ .modifier = .always_inline }, __floatdisf, .{arg});
return floatXisf(i64, arg);
}
test {


@@ -2,7 +2,7 @@ const builtin = @import("builtin");
const std = @import("std");
const maxInt = std.math.maxInt;
fn floatsiXf(comptime T: type, a: i32) T {
inline fn floatsiXf(comptime T: type, a: i32) T {
@setRuntimeSafety(builtin.is_test);
const bits = @typeInfo(T).Float.bits;
@@ -56,27 +56,27 @@ fn floatsiXf(comptime T: type, a: i32) T {
pub fn __floatsisf(arg: i32) callconv(.C) f32 {
@setRuntimeSafety(builtin.is_test);
return @call(.{ .modifier = .always_inline }, floatsiXf, .{ f32, arg });
return floatsiXf(f32, arg);
}
pub fn __floatsidf(arg: i32) callconv(.C) f64 {
@setRuntimeSafety(builtin.is_test);
return @call(.{ .modifier = .always_inline }, floatsiXf, .{ f64, arg });
return floatsiXf(f64, arg);
}
pub fn __floatsitf(arg: i32) callconv(.C) f128 {
@setRuntimeSafety(builtin.is_test);
return @call(.{ .modifier = .always_inline }, floatsiXf, .{ f128, arg });
return floatsiXf(f128, arg);
}
pub fn __aeabi_i2d(arg: i32) callconv(.AAPCS) f64 {
@setRuntimeSafety(false);
return @call(.{ .modifier = .always_inline }, __floatsidf, .{arg});
return floatsiXf(f64, arg);
}
pub fn __aeabi_i2f(arg: i32) callconv(.AAPCS) f32 {
@setRuntimeSafety(false);
return @call(.{ .modifier = .always_inline }, __floatsisf, .{arg});
return floatsiXf(f32, arg);
}
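// Illustrative check, not part of this commit: any i32 whose magnitude fits
// in the significand converts exactly.
test "__floatsidf converts small values exactly" {
    try std.testing.expectEqual(@as(f64, -123.0), __floatsidf(-123));
}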
fn test_one_floatsitf(a: i32, expected: u128) !void {


@@ -4,7 +4,7 @@ const maxInt = std.math.maxInt;
const FLT_MANT_DIG = 24;
pub fn __floatundisf(arg: u64) callconv(.C) f32 {
inline fn floatundisf(arg: u64) f32 {
@setRuntimeSafety(builtin.is_test);
if (arg == 0) return 0;
@@ -56,9 +56,12 @@ pub fn __floatundisf(arg: u64) callconv(.C) f32 {
return @bitCast(f32, result);
}
pub fn __floatundisf(arg: u64) callconv(.C) f32 {
return floatundisf(arg);
}
pub fn __aeabi_ul2f(arg: u64) callconv(.AAPCS) f32 {
@setRuntimeSafety(false);
return @call(.{ .modifier = .always_inline }, __floatundisf, .{arg});
return floatundisf(arg);
}
fn test__floatundisf(a: u64, expected: f32) !void {


@@ -4,7 +4,7 @@ const maxInt = std.math.maxInt;
const implicitBit = @as(u64, 1) << 52;
pub fn __floatunsidf(arg: u32) callconv(.C) f64 {
inline fn floatunsidf(arg: u32) f64 {
@setRuntimeSafety(builtin.is_test);
if (arg == 0) return 0.0;
@@ -18,9 +18,12 @@ pub fn __floatunsidf(arg: u32) callconv(.C) f64 {
return @bitCast(f64, mant | (exp + 1023) << 52);
}
pub fn __floatunsidf(arg: u32) callconv(.C) f64 {
return floatunsidf(arg);
}
pub fn __aeabi_ui2d(arg: u32) callconv(.AAPCS) f64 {
@setRuntimeSafety(false);
return @call(.{ .modifier = .always_inline }, __floatunsidf, .{arg});
return floatunsidf(arg);
}
fn test_one_floatunsidf(a: u32, expected: u64) !void {


@@ -6,7 +6,7 @@ const significandBits = 23;
const exponentBias = 127;
const implicitBit = @as(u32, 1) << significandBits;
pub fn __floatunsisf(arg: u32) callconv(.C) f32 {
inline fn floatunsisf(arg: u32) f32 {
@setRuntimeSafety(builtin.is_test);
if (arg == 0) return 0.0;
@@ -38,9 +38,12 @@ pub fn __floatunsisf(arg: u32) callconv(.C) f32 {
return @bitCast(f32, result);
}
pub fn __floatunsisf(arg: u32) callconv(.C) f32 {
return floatunsisf(arg);
}
pub fn __aeabi_ui2f(arg: u32) callconv(.AAPCS) f32 {
@setRuntimeSafety(false);
return @call(.{ .modifier = .always_inline }, __floatunsisf, .{arg});
return floatunsisf(arg);
}
fn test_one_floatunsisf(a: u32, expected: u32) !void {


@@ -2,7 +2,7 @@ const std = @import("std");
const builtin = @import("builtin");
// neg - negate (the number)
// - negXi2_generic for unoptimized little and big endian
// - negXi2 for unoptimized little and big endian
// 0x7fffffff = 2^31-1
// two's complement (inverting bits and adding 1) would make -INT_MIN wrap back to INT_MIN
@@ -11,20 +11,22 @@ const builtin = @import("builtin");
// * size optimized builds
// * machines that don't support carry operations
fn negXi2_generic(comptime T: type) fn (a: T) callconv(.C) T {
return struct {
fn f(a: T) callconv(.C) T {
@setRuntimeSafety(builtin.is_test);
return -a;
}
}.f;
}
inline fn negXi2(comptime T: type, a: T) T {
@setRuntimeSafety(builtin.is_test);
return -a;
}
pub const __negsi2 = negXi2_generic(i32);
pub fn __negsi2(a: i32) callconv(.C) i32 {
return negXi2(i32, a);
}
pub const __negdi2 = negXi2_generic(i64);
pub fn __negdi2(a: i64) callconv(.C) i64 {
return negXi2(i64, a);
}
pub const __negti2 = negXi2_generic(i128);
pub fn __negti2(a: i128) callconv(.C) i128 {
return negXi2(i128, a);
}
test {
_ = @import("negsi2_test.zig");


@@ -3,26 +3,31 @@
// - negvXi4_generic for unoptimized version
// assume -0 == 0 is gracefully handled by the hardware
fn negvXi_generic(comptime ST: type) fn (a: ST) callconv(.C) ST {
return struct {
fn f(a: ST) callconv(.C) ST {
const UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
const N: UT = @bitSizeOf(ST);
const min: ST = @bitCast(ST, (@as(UT, 1) << (N - 1)));
if (a == min)
@panic("compiler_rt negv: overflow");
return -a;
}
}.f;
}
inline fn negvXi(comptime ST: type, a: ST) ST {
const UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
const N: UT = @bitSizeOf(ST);
const min: ST = @bitCast(ST, (@as(UT, 1) << (N - 1)));
if (a == min)
@panic("compiler_rt negv: overflow");
return -a;
}
pub fn __negvsi2(a: i32) callconv(.C) i32 {
return negvXi(i32, a);
}
pub fn __negvdi2(a: i64) callconv(.C) i64 {
return negvXi(i64, a);
}
pub fn __negvti2(a: i128) callconv(.C) i128 {
return negvXi(i128, a);
}
pub const __negvsi2 = negvXi_generic(i32);
pub const __negvdi2 = negvXi_generic(i64);
pub const __negvti2 = negvXi_generic(i128);
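// Illustrative check, not part of this commit: negation overflows only for
// minInt, the single case the @panic above guards; everything else negates
// cleanly.
test "negv negates every other value (sketch)" {
    const testing = @import("std").testing;
    try testing.expectEqual(@as(i32, -7), __negvsi2(7));
    try testing.expectEqual(@as(i32, 7), __negvsi2(-7));
}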
test {
_ = @import("negvsi2_test.zig");


@@ -4,34 +4,36 @@ const builtin = @import("builtin");
// parity - if number of bits set is even => 0, else => 1
// - parityXi2_generic for big and little endian
fn parityXi2_generic(comptime T: type) fn (a: T) callconv(.C) i32 {
return struct {
fn f(a: T) callconv(.C) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
// Bit Twiddling Hacks: Compute parity in parallel
comptime var shift: u8 = @bitSizeOf(T) / 2;
inline while (shift > 2) {
x ^= x >> shift;
shift = shift >> 1;
}
x &= 0xf;
return (@intCast(u16, 0x6996) >> @intCast(u4, x)) & 1; // optimization for >>2 and >>1
}
}.f;
}
inline fn parityXi2(comptime T: type, a: T) i32 {
@setRuntimeSafety(builtin.is_test);
var x = switch (@bitSizeOf(T)) {
32 => @bitCast(u32, a),
64 => @bitCast(u64, a),
128 => @bitCast(u128, a),
else => unreachable,
};
// Bit Twiddling Hacks: Compute parity in parallel
comptime var shift: u8 = @bitSizeOf(T) / 2;
inline while (shift > 2) {
x ^= x >> shift;
shift = shift >> 1;
}
x &= 0xf;
return (@intCast(u16, 0x6996) >> @intCast(u4, x)) & 1; // optimization for >>2 and >>1
}
pub const __paritysi2 = parityXi2_generic(i32);
pub fn __paritysi2(a: i32) callconv(.C) i32 {
return parityXi2(i32, a);
}
pub const __paritydi2 = parityXi2_generic(i64);
pub fn __paritydi2(a: i64) callconv(.C) i32 {
return parityXi2(i64, a);
}
pub const __parityti2 = parityXi2_generic(i128);
pub fn __parityti2(a: i128) callconv(.C) i32 {
return parityXi2(i128, a);
}
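// Illustrative check, not part of this commit: 0x6996 is a 16-entry lookup
// table whose bit i is the parity of i, which is why the folding above can
// stop once only a 4-bit remainder is left.
test "parity of small values (sketch)" {
    const testing = @import("std").testing;
    try testing.expectEqual(@as(i32, 1), __paritysi2(0b0111));
    try testing.expectEqual(@as(i32, 0), __paritysi2(0b1111));
}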
test {
_ = @import("paritysi2_test.zig");


@@ -10,35 +10,37 @@ const std = @import("std");
// TAOCP: Combinational Algorithms, Bitwise Tricks And Techniques,
// subsubsection "Working with the rightmost bits" and "Sideways addition".
fn popcountXi2_generic(comptime ST: type) fn (a: ST) callconv(.C) i32 {
return struct {
fn f(a: ST) callconv(.C) i32 {
@setRuntimeSafety(builtin.is_test);
const UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
var x = @bitCast(UT, a);
x -= (x >> 1) & (~@as(UT, 0) / 3); // 0x55...55, aggregate duos
x = ((x >> 2) & (~@as(UT, 0) / 5)) // 0x33...33, aggregate nibbles
+ (x & (~@as(UT, 0) / 5));
x += x >> 4;
x &= ~@as(UT, 0) / 17; // 0x0F...0F, aggregate bytes
// 8 most significant bits of x + (x<<8) + (x<<16) + ..
x *%= ~@as(UT, 0) / 255; // 0x01...01
x >>= (@bitSizeOf(ST) - 8);
return @intCast(i32, x);
}
}.f;
}
inline fn popcountXi2(comptime ST: type, a: ST) i32 {
@setRuntimeSafety(builtin.is_test);
const UT = switch (ST) {
i32 => u32,
i64 => u64,
i128 => u128,
else => unreachable,
};
var x = @bitCast(UT, a);
x -= (x >> 1) & (~@as(UT, 0) / 3); // 0x55...55, aggregate duos
x = ((x >> 2) & (~@as(UT, 0) / 5)) // 0x33...33, aggregate nibbles
+ (x & (~@as(UT, 0) / 5));
x += x >> 4;
x &= ~@as(UT, 0) / 17; // 0x0F...0F, aggregate bytes
// 8 most significant bits of x + (x<<8) + (x<<16) + ..
x *%= ~@as(UT, 0) / 255; // 0x01...01
x >>= (@bitSizeOf(ST) - 8);
return @intCast(i32, x);
}
pub const __popcountsi2 = popcountXi2_generic(i32);
pub fn __popcountsi2(a: i32) callconv(.C) i32 {
return popcountXi2(i32, a);
}
pub const __popcountdi2 = popcountXi2_generic(i64);
pub fn __popcountdi2(a: i64) callconv(.C) i32 {
return popcountXi2(i64, a);
}
pub const __popcountti2 = popcountXi2_generic(i128);
pub fn __popcountti2(a: i128) callconv(.C) i32 {
return popcountXi2(i128, a);
}
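// Illustrative check, not part of this commit: the SWAR accumulation above
// agrees with clearing one set bit at a time.
test "popcount matches a naive bit-clearing loop" {
    var expected: i32 = 0;
    var x: u32 = 0xDEADBEEF;
    while (x != 0) : (x &= x - 1) expected += 1;
    try std.testing.expectEqual(expected, __popcountsi2(@bitCast(i32, @as(u32, 0xDEADBEEF))));
}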
test {
_ = @import("popcountsi2_test.zig");


@@ -19,7 +19,7 @@ fn Dwords(comptime T: type, comptime signed_half: bool) type {
// Arithmetic shift left
// Precondition: 0 <= b < bits_in_dword
pub fn ashlXi3(comptime T: type, a: T, b: i32) T {
pub inline fn ashlXi3(comptime T: type, a: T, b: i32) T {
const dwords = Dwords(T, false);
const S = Log2Int(dwords.HalfT);
@@ -42,7 +42,7 @@ pub fn ashlXi3(comptime T: type, a: T, b: i32) T {
// Arithmetic shift right
// Precondition: 0 <= b < T.bit_count
pub fn ashrXi3(comptime T: type, a: T, b: i32) T {
pub inline fn ashrXi3(comptime T: type, a: T, b: i32) T {
const dwords = Dwords(T, true);
const S = Log2Int(dwords.HalfT);
@@ -69,7 +69,7 @@ pub fn ashrXi3(comptime T: type, a: T, b: i32) T {
// Logical shift right
// Precondition: 0 <= b < T.bit_count
pub fn lshrXi3(comptime T: type, a: T, b: i32) T {
pub inline fn lshrXi3(comptime T: type, a: T, b: i32) T {
const dwords = Dwords(T, false);
const S = Log2Int(dwords.HalfT);
@@ -91,32 +91,32 @@ pub fn lshrXi3(comptime T: type, a: T, b: i32) T {
}
pub fn __ashldi3(a: i64, b: i32) callconv(.C) i64 {
return @call(.{ .modifier = .always_inline }, ashlXi3, .{ i64, a, b });
return ashlXi3(i64, a, b);
}
pub fn __ashlti3(a: i128, b: i32) callconv(.C) i128 {
return @call(.{ .modifier = .always_inline }, ashlXi3, .{ i128, a, b });
return ashlXi3(i128, a, b);
}
pub fn __ashrdi3(a: i64, b: i32) callconv(.C) i64 {
return @call(.{ .modifier = .always_inline }, ashrXi3, .{ i64, a, b });
return ashrXi3(i64, a, b);
}
pub fn __ashrti3(a: i128, b: i32) callconv(.C) i128 {
return @call(.{ .modifier = .always_inline }, ashrXi3, .{ i128, a, b });
return ashrXi3(i128, a, b);
}
pub fn __lshrdi3(a: i64, b: i32) callconv(.C) i64 {
return @call(.{ .modifier = .always_inline }, lshrXi3, .{ i64, a, b });
return lshrXi3(i64, a, b);
}
pub fn __lshrti3(a: i128, b: i32) callconv(.C) i128 {
return @call(.{ .modifier = .always_inline }, lshrXi3, .{ i128, a, b });
return lshrXi3(i128, a, b);
}
pub fn __aeabi_llsl(a: i64, b: i32) callconv(.AAPCS) i64 {
return __ashldi3(a, b);
return ashlXi3(i64, a, b);
}
pub fn __aeabi_lasr(a: i64, b: i32) callconv(.AAPCS) i64 {
return __ashrdi3(a, b);
return ashrXi3(i64, a, b);
}
pub fn __aeabi_llsr(a: i64, b: i32) callconv(.AAPCS) i64 {
return __lshrdi3(a, b);
return lshrXi3(i64, a, b);
}
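// Illustrative check, not part of this commit: the double-word helpers agree
// with native shifts on 64-bit values.
test "shift helpers agree with native shifts" {
    const testing = @import("std").testing;
    try testing.expectEqual(@as(i64, 1) << 40, __ashldi3(1, 40));
    try testing.expectEqual(@as(i64, -1024) >> 3, __ashrdi3(-1024, 3));
}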
test {


@@ -4603,7 +4603,18 @@ pub const CType = enum {
.longlong,
.ulonglong,
=> return 64,
.longdouble => @panic("TODO figure out what kind of float `long double` is on this target"),
.longdouble => switch (target.cpu.arch) {
.riscv64,
.aarch64,
.aarch64_be,
.aarch64_32,
.s390x,
.mips64,
.mips64el,
=> return 128,
else => return 80,
},
},
.windows, .uefi => switch (self) {