diff --git a/lib/std/special/compiler_rt/extendXfYf2.zig b/lib/std/special/compiler_rt/extendXfYf2.zig
index 9a2580e9ec..2c3f0c88fc 100644
--- a/lib/std/special/compiler_rt/extendXfYf2.zig
+++ b/lib/std/special/compiler_rt/extendXfYf2.zig
@@ -1,6 +1,7 @@
 const std = @import("std");
 const builtin = @import("builtin");
 const is_test = builtin.is_test;
+const native_arch = builtin.cpu.arch;
 
 pub fn __extendsfdf2(a: f32) callconv(.C) f64 {
     return extendXfYf2(f64, f32, @bitCast(u32, a));
@@ -14,12 +15,16 @@ pub fn __extendsftf2(a: f32) callconv(.C) f128 {
     return extendXfYf2(f128, f32, @bitCast(u32, a));
 }
 
-pub fn __extendhfsf2(a: u16) callconv(.C) f32 {
-    return extendXfYf2(f32, f16, a);
+// AArch64 is the only ABI (at the moment) to support f16 arguments without the
+// need for extending them to wider fp types.
+pub const F16T = if (native_arch.isAARCH64()) f16 else u16;
+
+pub fn __extendhfsf2(a: F16T) callconv(.C) f32 {
+    return extendXfYf2(f32, f16, @bitCast(u16, a));
 }
 
-pub fn __extendhftf2(a: u16) callconv(.C) f128 {
-    return extendXfYf2(f128, f16, a);
+pub fn __extendhftf2(a: F16T) callconv(.C) f128 {
+    return extendXfYf2(f128, f16, @bitCast(u16, a));
 }
 
 pub fn __extendxftf2(a: c_longdouble) callconv(.C) f128 {
@@ -29,16 +34,14 @@ pub fn __extendxftf2(a: c_longdouble) callconv(.C) f128 {
 
 pub fn __aeabi_h2f(arg: u16) callconv(.AAPCS) f32 {
     @setRuntimeSafety(false);
-    return @call(.{ .modifier = .always_inline }, __extendhfsf2, .{arg});
+    return @call(.{ .modifier = .always_inline }, extendXfYf2, .{ f32, f16, arg });
 }
 
 pub fn __aeabi_f2d(arg: f32) callconv(.AAPCS) f64 {
     @setRuntimeSafety(false);
-    return @call(.{ .modifier = .always_inline }, __extendsfdf2, .{arg});
+    return @call(.{ .modifier = .always_inline }, extendXfYf2, .{ f64, f32, @bitCast(u32, arg) });
 }
 
-const CHAR_BIT = 8;
-
 inline fn extendXfYf2(comptime dst_t: type, comptime src_t: type, a: std.meta.Int(.unsigned, @typeInfo(src_t).Float.bits)) dst_t {
     @setRuntimeSafety(builtin.is_test);
 
@@ -50,7 +53,7 @@ inline fn extendXfYf2(comptime dst_t: type, comptime src_t: type, a: std.meta.In
 
     // Various constants whose values follow from the type parameters.
     // Any reasonable optimizer will fold and propagate all of these.
-    const srcBits = @sizeOf(src_t) * CHAR_BIT;
+    const srcBits = @bitSizeOf(src_t);
     const srcExpBits = srcBits - srcSigBits - 1;
     const srcInfExp = (1 << srcExpBits) - 1;
     const srcExpBias = srcInfExp >> 1;
@@ -62,7 +65,7 @@ inline fn extendXfYf2(comptime dst_t: type, comptime src_t: type, a: std.meta.In
     const srcQNaN = 1 << (srcSigBits - 1);
     const srcNaNCode = srcQNaN - 1;
 
-    const dstBits = @sizeOf(dst_t) * CHAR_BIT;
+    const dstBits = @bitSizeOf(dst_t);
     const dstExpBits = dstBits - dstSigBits - 1;
     const dstInfExp = (1 << dstExpBits) - 1;
     const dstExpBias = dstInfExp >> 1;
diff --git a/lib/std/special/compiler_rt/extendXfYf2_test.zig b/lib/std/special/compiler_rt/extendXfYf2_test.zig
index 89545576a2..d0c4f82e97 100644
--- a/lib/std/special/compiler_rt/extendXfYf2_test.zig
+++ b/lib/std/special/compiler_rt/extendXfYf2_test.zig
@@ -3,6 +3,7 @@ const __extendhfsf2 = @import("extendXfYf2.zig").__extendhfsf2;
 const __extendhftf2 = @import("extendXfYf2.zig").__extendhftf2;
 const __extendsftf2 = @import("extendXfYf2.zig").__extendsftf2;
 const __extenddftf2 = @import("extendXfYf2.zig").__extenddftf2;
+const F16T = @import("extendXfYf2.zig").F16T;
 
 fn test__extenddftf2(a: f64, expectedHi: u64, expectedLo: u64) !void {
     const x = __extenddftf2(a);
@@ -27,7 +28,7 @@ fn test__extenddftf2(a: f64, expectedHi: u64, expectedLo: u64) !void {
 }
 
 fn test__extendhfsf2(a: u16, expected: u32) !void {
-    const x = __extendhfsf2(a);
+    const x = __extendhfsf2(@bitCast(F16T, a));
     const rep = @bitCast(u32, x);
 
     if (rep == expected) {
@@ -159,7 +160,7 @@ fn makeInf32() f32 {
 }
 
 fn test__extendhftf2(a: u16, expectedHi: u64, expectedLo: u64) !void {
-    const x = __extendhftf2(a);
+    const x = __extendhftf2(@bitCast(F16T, a));
     const rep = @bitCast(u128, x);
 
     const hi = @intCast(u64, rep >> 64);
diff --git a/lib/std/special/compiler_rt/truncXfYf2.zig b/lib/std/special/compiler_rt/truncXfYf2.zig
index 3cad52426e..559ec0ec4f 100644
--- a/lib/std/special/compiler_rt/truncXfYf2.zig
+++ b/lib/std/special/compiler_rt/truncXfYf2.zig
@@ -1,15 +1,21 @@
 const std = @import("std");
+const builtin = @import("builtin");
+const native_arch = builtin.cpu.arch;
 
-pub fn __truncsfhf2(a: f32) callconv(.C) u16 {
-    return @bitCast(u16, @call(.{ .modifier = .always_inline }, truncXfYf2, .{ f16, f32, a }));
+// AArch64 is the only ABI (at the moment) to support f16 arguments without the
+// need for extending them to wider fp types.
+pub const F16T = if (native_arch.isAARCH64()) f16 else u16;
+
+pub fn __truncsfhf2(a: f32) callconv(.C) F16T {
+    return @bitCast(F16T, @call(.{ .modifier = .always_inline }, truncXfYf2, .{ f16, f32, a }));
 }
 
-pub fn __truncdfhf2(a: f64) callconv(.C) u16 {
-    return @bitCast(u16, @call(.{ .modifier = .always_inline }, truncXfYf2, .{ f16, f64, a }));
+pub fn __truncdfhf2(a: f64) callconv(.C) F16T {
+    return @bitCast(F16T, @call(.{ .modifier = .always_inline }, truncXfYf2, .{ f16, f64, a }));
 }
 
-pub fn __trunctfhf2(a: f128) callconv(.C) u16 {
-    return @bitCast(u16, @call(.{ .modifier = .always_inline }, truncXfYf2, .{ f16, f128, a }));
+pub fn __trunctfhf2(a: f128) callconv(.C) F16T {
+    return @bitCast(F16T, @call(.{ .modifier = .always_inline }, truncXfYf2, .{ f16, f128, a }));
 }
 
 pub fn __trunctfsf2(a: f128) callconv(.C) f32 {
diff --git a/lib/std/special/compiler_rt/truncXfYf2_test.zig b/lib/std/special/compiler_rt/truncXfYf2_test.zig
index 23c83afd9f..83ec8afab0 100644
--- a/lib/std/special/compiler_rt/truncXfYf2_test.zig
+++ b/lib/std/special/compiler_rt/truncXfYf2_test.zig
@@ -1,7 +1,7 @@
 const __truncsfhf2 = @import("truncXfYf2.zig").__truncsfhf2;
 
 fn test__truncsfhf2(a: u32, expected: u16) !void {
-    const actual = __truncsfhf2(@bitCast(f32, a));
+    const actual = @bitCast(u16, __truncsfhf2(@bitCast(f32, a)));
 
     if (actual == expected) {
         return;
@@ -82,7 +82,7 @@ fn test__truncdfhf2(a: f64, expected: u16) void {
 }
 
 fn test__truncdfhf2_raw(a: u64, expected: u16) void {
-    const actual = __truncdfhf2(@bitCast(f64, a));
+    const actual = @bitCast(u16, __truncdfhf2(@bitCast(f64, a)));
 
     if (actual == expected) {
         return;
diff --git a/test/behavior/cast_stage1.zig b/test/behavior/cast_stage1.zig
index 1a5679bc0c..f6bf975011 100644
--- a/test/behavior/cast_stage1.zig
+++ b/test/behavior/cast_stage1.zig
@@ -263,6 +263,32 @@ test "cast *[1][*]const u8 to [*]const ?[*]const u8" {
     try expect(mem.eql(u8, std.mem.spanZ(@ptrCast([*:0]const u8, x[0].?)), "window name"));
 }
 
+test "cast f16 to wider types" {
+    const S = struct {
+        fn doTheTest() !void {
+            var x: f16 = 1234.0;
+            try std.testing.expectEqual(@as(f32, 1234.0), x);
+            try std.testing.expectEqual(@as(f64, 1234.0), x);
+            try std.testing.expectEqual(@as(f128, 1234.0), x);
+        }
+    };
+    try S.doTheTest();
+    comptime try S.doTheTest();
+}
+
+test "cast f128 to narrower types" {
+    const S = struct {
+        fn doTheTest() !void {
+            var x: f128 = 1234.0;
+            try std.testing.expectEqual(@as(f16, 1234.0), @floatCast(f16, x));
+            try std.testing.expectEqual(@as(f32, 1234.0), @floatCast(f32, x));
+            try std.testing.expectEqual(@as(f64, 1234.0), @floatCast(f64, x));
+        }
+    };
+    try S.doTheTest();
+    comptime try S.doTheTest();
+}
+
 test "vector casts" {
     const S = struct {
         fn doTheTest() !void {
diff --git a/test/behavior/muladd.zig b/test/behavior/muladd.zig
index 5129303c92..eaa30324df 100644
--- a/test/behavior/muladd.zig
+++ b/test/behavior/muladd.zig
@@ -24,8 +24,7 @@ fn testMulAdd() !void {
         var c: f64 = 6.25;
         try expect(@mulAdd(f64, a, b, c) == 20);
     }
-    // TODO https://github.com/ziglang/zig/issues/9900
-    if (@import("builtin").cpu.arch != .aarch64) {
+    {
        var a: f16 = 5.5;
        var b: f128 = 2.5;
        var c: f128 = 6.25;
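
For reference, a minimal sketch of the ABI split that F16T encodes, not part of the patch above, assuming the same Zig vintage as the patch (pre-0.10 @bitCast(DestType, value) syntax); the test name and values are illustrative only:

const std = @import("std");
const builtin = @import("builtin");

// On AArch64 the calling convention passes f16 directly in floating-point
// registers, so the compiler-rt entry points can accept and return f16
// itself; on every other target the value crosses the ABI boundary as its
// raw u16 bit pattern. This mirrors the F16T alias defined in the patch.
const F16T = if (builtin.cpu.arch.isAARCH64()) f16 else u16;

test "F16T round-trips the raw binary16 bits" {
    const bits: u16 = 0x3C00; // 1.0 encoded as IEEE-754 binary16
    const abi_value = @bitCast(F16T, bits); // what a caller hands to __extendhfsf2
    try std.testing.expectEqual(bits, @bitCast(u16, abi_value));
}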