stage2: rework @mulAdd

* mul_add AIR instruction: use `pl_op` instead of `ty_pl`. The type is always the same as the operand; no need to waste bytes redundantly storing the type.
* AstGen: use coerced_ty for all the operands except for one which we use to communicate the type.
* Sema: use the correct source location for requireRuntimeBlock in handling of `@mulAdd`.
* native backends: handle liveness even for the functions that are TODO.
* C backend: implement `@mulAdd`. It lowers to libc calls.
* LLVM backend: make `@mulAdd` handle all float types.
  - improved fptrunc and fpext to handle f80 with compiler-rt calls.
* Value.mulAdd: handle all float types and use the `@mulAdd` builtin.
* behavior tests: revert the changes to testing `@mulAdd`. These changes broke the test coverage, making it only tested at compile-time.

Improved f80 support:
* std.math.fma handles f80
* move fma functions from freestanding libc to compiler-rt
  - add __fmax and fmal
  - make __fmax and fmaq only exported when they don't alias fmal.
  - make their linkage weak just like the rest of compiler-rt symbols.
* removed `longDoubleIsF128` and replaced it with `longDoubleIs` which takes a type as a parameter. The implementation is now more accurate and handles more targets. Similarly, in stage2 the function `CType.sizeInBits` is more accurate for long double for more targets.
2026-02-09 19:10:48 +00:00 · 2022-03-06 15:23:21 -07:00 · 2022-03-06 15:23:21 -07:00 · 71b8760d3b
commit 71b8760d3b
parent 6637335981
19 changed files with 403 additions and 217 deletions
--- a/lib/std/math/fma.zig
+++ b/lib/std/math/fma.zig
@ -19,6 +19,8 @@ pub fn fma(comptime T: type, x: T, y: T, z: T) T {
        // TODO this is not correct for some targets
        c_longdouble => @floatCast(c_longdouble, fma128(x, y, z)),

+        f80 => @floatCast(f80, fma128(x, y, z)),
+
        else => @compileError("fma not implemented for " ++ @typeName(T)),
    };
 }
--- a/lib/std/special/c.zig
+++ b/lib/std/special/c.zig
@ -12,7 +12,7 @@ const maxInt = std.math.maxInt;
 const native_os = builtin.os.tag;
 const native_arch = builtin.cpu.arch;
 const native_abi = builtin.abi;
-const long_double_is_f128 = builtin.target.longDoubleIsF128();
+const long_double_is_f128 = builtin.target.longDoubleIs(f128);

 const is_wasm = switch (native_arch) {
    .wasm32, .wasm64 => true,
@ -90,10 +90,6 @@ comptime {
    @export(fmod, .{ .name = "fmod", .linkage = .Strong });
    @export(fmodf, .{ .name = "fmodf", .linkage = .Strong });

-    @export(fma, .{ .name = "fma", .linkage = .Strong });
-    @export(fmaf, .{ .name = "fmaf", .linkage = .Strong });
-    @export(fmal, .{ .name = "fmal", .linkage = .Strong });
-
    @export(sincos, .{ .name = "sincos", .linkage = .Strong });
    @export(sincosf, .{ .name = "sincosf", .linkage = .Strong });

@ -561,20 +557,6 @@ test "fmod, fmodf" {
    }
 }

-fn fmaf(a: f32, b: f32, c: f32) callconv(.C) f32 {
-    return math.fma(f32, a, b, c);
-}
-
-fn fma(a: f64, b: f64, c: f64) callconv(.C) f64 {
-    return math.fma(f64, a, b, c);
-}
-fn fmal(a: c_longdouble, b: c_longdouble, c: c_longdouble) callconv(.C) c_longdouble {
-    if (!long_double_is_f128) {
-        @panic("TODO implement this");
-    }
-    return math.fma(c_longdouble, a, b, c);
-}
-
 fn sincos(a: f64, r_sin: *f64, r_cos: *f64) callconv(.C) void {
    r_sin.* = math.sin(a);
    r_cos.* = math.cos(a);
--- a/lib/std/special/compiler_rt.zig
+++ b/lib/std/special/compiler_rt.zig
@ -19,7 +19,8 @@ const strong_linkage = if (is_test)
 else
    std.builtin.GlobalLinkage.Strong;

-const long_double_is_f128 = builtin.target.longDoubleIsF128();
+const long_double_is_f80 = builtin.target.longDoubleIs(f80);
+const long_double_is_f128 = builtin.target.longDoubleIs(f128);

 comptime {
    // These files do their own comptime exporting logic.
@ -758,14 +759,35 @@ comptime {
    @export(floorf, .{ .name = "floorf", .linkage = linkage });
    @export(floor, .{ .name = "floor", .linkage = linkage });
    @export(floorl, .{ .name = "floorl", .linkage = linkage });
-    @export(fmaq, .{ .name = "fmaq", .linkage = linkage });
+
+    @export(fma, .{ .name = "fma", .linkage = linkage });
+    @export(fmaf, .{ .name = "fmaf", .linkage = linkage });
+    @export(fmal, .{ .name = "fmal", .linkage = linkage });
+    if (!long_double_is_f80) {
+        @export(__fmax, .{ .name = "__fmax", .linkage = linkage });
+    }
+    if (!long_double_is_f128) {
+        @export(fmaq, .{ .name = "fmaq", .linkage = linkage });
+    }
 }

 const math = std.math;

+fn fmaf(a: f32, b: f32, c: f32) callconv(.C) f32 {
+    return math.fma(f32, a, b, c);
+}
+fn fma(a: f64, b: f64, c: f64) callconv(.C) f64 {
+    return math.fma(f64, a, b, c);
+}
+fn __fmax(a: f80, b: f80, c: f80) callconv(.C) f80 {
+    return math.fma(f80, a, b, c);
+}
 fn fmaq(a: f128, b: f128, c: f128) callconv(.C) f128 {
    return math.fma(f128, a, b, c);
 }
+fn fmal(a: c_longdouble, b: c_longdouble, c: c_longdouble) callconv(.C) c_longdouble {
+    return math.fma(c_longdouble, a, b, c);
+}

 // TODO add intrinsics for these (and probably the double version too)
 // and have the math stuff use the intrinsic. same as @mod and @rem
--- a/lib/std/target.zig
+++ b/lib/std/target.zig
@ -1714,9 +1714,55 @@ pub const Target = struct {
        };
    }

-    pub inline fn longDoubleIsF128(target: Target) bool {
-        return switch (target.cpu.arch) {
-            .riscv64, .aarch64, .aarch64_be, .aarch64_32, .s390x, .mips64, .mips64el => true,
+    pub inline fn longDoubleIs(target: Target, comptime F: type) bool {
+        if (target.abi == .msvc) {
+            return F == f64;
+        }
+        return switch (F) {
+            f128 => switch (target.cpu.arch) {
+                .riscv64,
+                .aarch64,
+                .aarch64_be,
+                .aarch64_32,
+                .s390x,
+                .mips64,
+                .mips64el,
+                .sparc,
+                .sparcv9,
+                .sparcel,
+                .powerpc,
+                .powerpcle,
+                .powerpc64,
+                .powerpc64le,
+                => true,
+
+                else => false,
+            },
+            f80 => switch (target.cpu.arch) {
+                .x86_64, .i386 => true,
+                else => false,
+            },
+            f64 => switch (target.cpu.arch) {
+                .x86_64,
+                .i386,
+                .riscv64,
+                .aarch64,
+                .aarch64_be,
+                .aarch64_32,
+                .s390x,
+                .mips64,
+                .mips64el,
+                .sparc,
+                .sparcv9,
+                .sparcel,
+                .powerpc,
+                .powerpcle,
+                .powerpc64,
+                .powerpc64le,
+                => false,
+
+                else => true,
+            },
            else => false,
        };
    }
--- a/src/Air.zig
+++ b/src/Air.zig
@ -580,7 +580,8 @@ pub const Inst = struct {
        prefetch,

        /// Computes `(a * b) + c`, but only rounds once.
-        /// Uses the `ty_pl` field.
+        /// Uses the `pl_op` field with payload `Bin`.
+        /// The operand is the addend. The mulends are lhs and rhs.
        mul_add,

        /// Implements @fieldParentPtr builtin.
@ -728,12 +729,6 @@ pub const Bin = struct {
    rhs: Inst.Ref,
 };

-pub const MulAdd = struct {
-    mulend1: Inst.Ref,
-    mulend2: Inst.Ref,
-    addend: Inst.Ref,
-};
-
 pub const FieldParentPtr = struct {
    field_ptr: Inst.Ref,
    field_index: u32,
@ -899,7 +894,6 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
        .aggregate_init,
        .union_init,
        .field_parent_ptr,
-        .mul_add,
        => return air.getRefType(datas[inst].ty_pl.ty),

        .not,
@ -997,6 +991,8 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
            return ptr_ty.elemType();
        },

+        .mul_add => return air.typeOf(datas[inst].pl_op.operand),
+
        .add_with_overflow,
        .sub_with_overflow,
        .mul_with_overflow,
--- a/src/AstGen.zig
+++ b/src/AstGen.zig
@ -7309,8 +7309,8 @@ fn builtinCall(
        },
        .mul_add => {
            const float_type = try typeExpr(gz, scope, params[0]);
-            const mulend1 = try expr(gz, scope, .{ .ty = float_type }, params[1]);
-            const mulend2 = try expr(gz, scope, .{ .ty = float_type }, params[2]);
+            const mulend1 = try expr(gz, scope, .{ .coerced_ty = float_type }, params[1]);
+            const mulend2 = try expr(gz, scope, .{ .coerced_ty = float_type }, params[2]);
            const addend = try expr(gz, scope, .{ .ty = float_type }, params[3]);
            const result = try gz.addPlNode(.mul_add, node, Zir.Inst.MulAdd{
                .mulend1 = mulend1,
--- a/src/Liveness.zig
+++ b/src/Liveness.zig
@ -465,8 +465,9 @@ fn analyzeInst(
            return trackOperands(a, new_set, inst, main_tomb, .{ extra.ptr, extra.expected_value, extra.new_value });
        },
        .mul_add => {
-            const extra = a.air.extraData(Air.MulAdd, inst_datas[inst].ty_pl.payload).data;
-            return trackOperands(a, new_set, inst, main_tomb, .{ extra.mulend1, extra.mulend2, extra.addend });
+            const pl_op = inst_datas[inst].pl_op;
+            const extra = a.air.extraData(Air.Bin, pl_op.payload).data;
+            return trackOperands(a, new_set, inst, main_tomb, .{ extra.lhs, extra.rhs, pl_op.operand });
        },
        .atomic_load => {
            const ptr = inst_datas[inst].atomic_load.ptr;
--- a/src/Sema.zig
+++ b/src/Sema.zig
@ -13525,48 +13525,26 @@ fn zirMulAdd(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.
    const mulend2_src: LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node };
    const addend_src: LazySrcLoc = .{ .node_offset_builtin_call_arg3 = inst_data.src_node };

-    const mulend1 = sema.resolveInst(extra.mulend1);
-    const mulend2 = sema.resolveInst(extra.mulend2);
    const addend = sema.resolveInst(extra.addend);
-    // All args have the same type
-    const ty = sema.typeOf(mulend1);
-    switch (ty.zigTypeTag()) {
-        .ComptimeFloat, .Float => {},
-        .Vector => {
-            const scalar_ty = ty.scalarType();
-            switch (scalar_ty.zigTypeTag()) {
-                .ComptimeFloat, .Float => {},
-                else => return sema.fail(block, src, "expected vector of floats or float type, found '{}'", .{scalar_ty}),
-            }
-        },
-        else => return sema.fail(block, src, "expected vector of floats or float type, found '{}'", .{ty}),
-    }
+    const ty = sema.typeOf(addend);
+    const mulend1 = try sema.coerce(block, ty, sema.resolveInst(extra.mulend1), mulend1_src);
+    const mulend2 = try sema.coerce(block, ty, sema.resolveInst(extra.mulend2), mulend2_src);

    const target = sema.mod.getTarget();
+
    switch (ty.zigTypeTag()) {
        .ComptimeFloat, .Float => {
            const maybe_mulend1 = try sema.resolveMaybeUndefVal(block, mulend1_src, mulend1);
            const maybe_mulend2 = try sema.resolveMaybeUndefVal(block, mulend2_src, mulend2);
            const maybe_addend = try sema.resolveMaybeUndefVal(block, addend_src, addend);

-            if (maybe_mulend1) |mulend1_val| {
-                if (mulend1_val.isUndef())
-                    return sema.addConstUndef(ty);
-            }
-
-            if (maybe_mulend2) |mulend2_val| {
-                if (mulend2_val.isUndef())
-                    return sema.addConstUndef(ty);
-            }
-
-            if (maybe_addend) |addend_val| {
-                if (addend_val.isUndef())
-                    return sema.addConstUndef(ty);
-            }
-
-            if (maybe_mulend1) |mulend1_val| {
+            const runtime_src = if (maybe_mulend1) |mulend1_val| rs: {
                if (maybe_mulend2) |mulend2_val| {
+                    if (mulend2_val.isUndef()) return sema.addConstUndef(ty);
+
                    if (maybe_addend) |addend_val| {
+                        if (addend_val.isUndef()) return sema.addConstUndef(ty);
+
                        const result_val = try Value.mulAdd(
                            ty,
                            mulend1_val,
@ -13576,25 +13554,46 @@ fn zirMulAdd(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.
                            target,
                        );
                        return sema.addConstant(ty, result_val);
+                    } else {
+                        break :rs addend_src;
                    }
+                } else {
+                    if (maybe_addend) |addend_val| {
+                        if (addend_val.isUndef()) return sema.addConstUndef(ty);
+                    }
+                    break :rs mulend2_src;
                }
-            }
+            } else rs: {
+                if (maybe_mulend2) |mulend2_val| {
+                    if (mulend2_val.isUndef()) return sema.addConstUndef(ty);
+                }
+                if (maybe_addend) |addend_val| {
+                    if (addend_val.isUndef()) return sema.addConstUndef(ty);
+                }
+                break :rs mulend1_src;
+            };

-            try sema.requireRuntimeBlock(block, src);
+            try sema.requireRuntimeBlock(block, runtime_src);
            return block.addInst(.{
                .tag = .mul_add,
-                .data = .{ .ty_pl = .{
-                    .ty = try sema.addType(ty),
-                    .payload = try sema.addExtra(Air.MulAdd{
-                        .mulend1 = mulend1,
-                        .mulend2 = mulend2,
-                        .addend = addend,
+                .data = .{ .pl_op = .{
+                    .operand = addend,
+                    .payload = try sema.addExtra(Air.Bin{
+                        .lhs = mulend1,
+                        .rhs = mulend2,
                    }),
                } },
            });
        },
-        .Vector => return sema.fail(block, src, "TODO: implement @mulAdd for vectors", .{}),
-        else => unreachable,
+        .Vector => {
+            const scalar_ty = ty.scalarType();
+            switch (scalar_ty.zigTypeTag()) {
+                .ComptimeFloat, .Float => {},
+                else => return sema.fail(block, src, "expected vector of floats or float type, found '{}'", .{scalar_ty}),
+            }
+            return sema.fail(block, src, "TODO: implement @mulAdd for vectors", .{});
+        },
+        else => return sema.fail(block, src, "expected vector of floats or float type, found '{}'", .{ty}),
    }
 }

--- a/src/Zir.zig
+++ b/src/Zir.zig
@ -891,6 +891,8 @@ pub const Inst = struct {
        atomic_store,
        /// Implements the `@mulAdd` builtin.
        /// Uses the `pl_node` union field with payload `MulAdd`.
+        /// The addend communicates the type of the builtin.
+        /// The mulends need to be coerced to the same type.
        mul_add,
        /// Implements the `@call` builtin.
        /// Uses the `pl_node` union field with payload `BuiltinCall`.
--- a/src/arch/aarch64/CodeGen.zig
+++ b/src/arch/aarch64/CodeGen.zig
@ -3654,8 +3654,12 @@ fn airPrefetch(self: *Self, inst: Air.Inst.Index) !void {
 }

 fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
-    _ = inst;
-    return self.fail("TODO implement airMulAdd for aarch64", .{});
+    const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+    const extra = self.air.extraData(Air.Bin, pl_op.payload).data;
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else {
+        return self.fail("TODO implement airMulAdd for aarch64", .{});
+    };
+    return self.finishAir(inst, result, .{ extra.lhs, extra.rhs, pl_op.operand });
 }

 fn resolveInst(self: *Self, inst: Air.Inst.Ref) InnerError!MCValue {
--- a/src/arch/arm/CodeGen.zig
+++ b/src/arch/arm/CodeGen.zig
@ -4088,8 +4088,12 @@ fn airPrefetch(self: *Self, inst: Air.Inst.Index) !void {
 }

 fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
-    _ = inst;
-    return self.fail("TODO implement airMulAdd for arm", .{});
+    const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+    const extra = self.air.extraData(Air.Bin, pl_op.payload).data;
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else {
+        return self.fail("TODO implement airMulAdd for arm", .{});
+    };
+    return self.finishAir(inst, result, .{ extra.lhs, extra.rhs, pl_op.operand });
 }

 fn resolveInst(self: *Self, inst: Air.Inst.Ref) InnerError!MCValue {
--- a/src/arch/riscv64/CodeGen.zig
+++ b/src/arch/riscv64/CodeGen.zig
@ -2205,8 +2205,12 @@ fn airPrefetch(self: *Self, inst: Air.Inst.Index) !void {
 }

 fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
-    _ = inst;
-    return self.fail("TODO implement airMulAdd for riscv64", .{});
+    const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+    const extra = self.air.extraData(Air.Bin, pl_op.payload).data;
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else {
+        return self.fail("TODO implement airMulAdd for riscv64", .{});
+    };
+    return self.finishAir(inst, result, .{ extra.lhs, extra.rhs, pl_op.operand });
 }

 fn resolveInst(self: *Self, inst: Air.Inst.Ref) InnerError!MCValue {
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@ -5561,8 +5561,12 @@ fn airPrefetch(self: *Self, inst: Air.Inst.Index) !void {
 }

 fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
-    _ = inst;
-    return self.fail("TODO implement airMulAdd for x86_64", .{});
+    const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+    const extra = self.air.extraData(Air.Bin, pl_op.payload).data;
+    const result: MCValue = if (self.liveness.isUnused(inst)) .dead else {
+        return self.fail("TODO implement airMulAdd for x86_64", .{});
+    };
+    return self.finishAir(inst, result, .{ extra.lhs, extra.rhs, pl_op.operand });
 }

 fn resolveInst(self: *Self, inst: Air.Inst.Ref) InnerError!MCValue {
--- a/src/codegen/c.zig
+++ b/src/codegen/c.zig
@ -16,6 +16,7 @@ const trace = @import("../tracy.zig").trace;
 const LazySrcLoc = Module.LazySrcLoc;
 const Air = @import("../Air.zig");
 const Liveness = @import("../Liveness.zig");
+const CType = @import("../type.zig").CType;

 const Mutability = enum { Const, Mut };
 const BigIntConst = std.math.big.int.Const;
@ -1635,7 +1636,7 @@ fn genBody(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, OutO
            .trunc_float,
            => |tag| return f.fail("TODO: C backend: implement unary op for tag '{s}'", .{@tagName(tag)}),

-            .mul_add => return f.fail("TODO: C backend: implement @mulAdd", .{}),
+            .mul_add => try airMulAdd(f, inst),

            .add_with_overflow => try airAddWithOverflow(f, inst),
            .sub_with_overflow => try airSubWithOverflow(f, inst),
@ -3623,6 +3624,35 @@ fn airWasmMemoryGrow(f: *Function, inst: Air.Inst.Index) !CValue {
    return local;
 }

+fn airMulAdd(f: *Function, inst: Air.Inst.Index) !CValue {
+    if (f.liveness.isUnused(inst)) return CValue.none;
+    const pl_op = f.air.instructions.items(.data)[inst].pl_op;
+    const extra = f.air.extraData(Air.Bin, pl_op.payload).data;
+    const inst_ty = f.air.typeOfIndex(inst);
+    const mulend1 = try f.resolveInst(extra.lhs);
+    const mulend2 = try f.resolveInst(extra.rhs);
+    const addend = try f.resolveInst(pl_op.operand);
+    const writer = f.object.writer();
+    const target = f.object.dg.module.getTarget();
+    const fn_name = switch (inst_ty.floatBits(target)) {
+        16, 32 => "fmaf",
+        64 => "fma",
+        80 => if (CType.longdouble.sizeInBits(target) == 80) "fmal" else "__fmax",
+        128 => if (CType.longdouble.sizeInBits(target) == 128) "fmal" else "fmaq",
+        else => unreachable,
+    };
+    const local = try f.allocLocal(inst_ty, .Const);
+    try writer.writeAll(" = ");
+    try writer.print("{s}(", .{fn_name});
+    try f.writeCValue(writer, mulend1);
+    try writer.writeAll(", ");
+    try f.writeCValue(writer, mulend2);
+    try writer.writeAll(", ");
+    try f.writeCValue(writer, addend);
+    try writer.writeAll(");\n");
+    return local;
+}
+
 fn toMemoryOrder(order: std.builtin.AtomicOrder) [:0]const u8 {
    return switch (order) {
        .Unordered => "memory_order_relaxed",
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@ -18,6 +18,7 @@ const target_util = @import("../target.zig");
 const Value = @import("../value.zig").Value;
 const Type = @import("../type.zig").Type;
 const LazySrcLoc = Module.LazySrcLoc;
+const CType = @import("../type.zig").CType;

 const Error = error{ OutOfMemory, CodegenFail };

@ -2189,12 +2190,12 @@ pub const FuncGen = struct {
                .min       => try self.airMin(inst),
                .max       => try self.airMax(inst),
                .slice     => try self.airSlice(inst),
+                .mul_add   => try self.airMulAdd(inst),

                .add_with_overflow => try self.airOverflow(inst, "llvm.sadd.with.overflow", "llvm.uadd.with.overflow"),
                .sub_with_overflow => try self.airOverflow(inst, "llvm.ssub.with.overflow", "llvm.usub.with.overflow"),
                .mul_with_overflow => try self.airOverflow(inst, "llvm.smul.with.overflow", "llvm.umul.with.overflow"),
                .shl_with_overflow => try self.airShlWithOverflow(inst),
-                .mul_add           => try self.airMulAdd(inst),

                .bit_and, .bool_and => try self.airAnd(inst),
                .bit_or, .bool_or   => try self.airOr(inst),
@ -3844,43 +3845,43 @@ pub const FuncGen = struct {
    }

    fn airMulAdd(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
-        if (self.liveness.isUnused(inst))
-            return null;
+        if (self.liveness.isUnused(inst)) return null;

-        const ty_pl = self.air.instructions.items(.data)[inst].ty_pl;
-        const extra = self.air.extraData(Air.MulAdd, ty_pl.payload).data;
+        const pl_op = self.air.instructions.items(.data)[inst].pl_op;
+        const extra = self.air.extraData(Air.Bin, pl_op.payload).data;

-        const mulend1 = try self.resolveInst(extra.mulend1);
-        const mulend2 = try self.resolveInst(extra.mulend2);
-        const addend = try self.resolveInst(extra.addend);
+        const mulend1 = try self.resolveInst(extra.lhs);
+        const mulend2 = try self.resolveInst(extra.rhs);
+        const addend = try self.resolveInst(pl_op.operand);

        const ty = self.air.typeOfIndex(inst);
        const llvm_ty = try self.dg.llvmType(ty);
        const target = self.dg.module.getTarget();

-        const fn_val = switch (ty.floatBits(target)) {
-            16, 32, 64 => blk: {
-                break :blk self.getIntrinsic("llvm.fma", &.{llvm_ty});
-            },
-            // TODO: using `llvm.fma` for f80 does not seem to work for all targets, needs further investigation.
-            80 => return self.dg.todo("Implement mulAdd for f80", .{}),
-            128 => blk: {
-                // LLVM incorrectly lowers the fma builtin for f128 to fmal, which is for
-                // `long double`. On some targets this will be correct; on others it will be incorrect.
-                if (target.longDoubleIsF128()) {
-                    break :blk self.getIntrinsic("llvm.fma", &.{llvm_ty});
-                } else {
-                    break :blk self.dg.object.llvm_module.getNamedFunction("fmaq") orelse fn_blk: {
-                        const param_types = [_]*const llvm.Type{ llvm_ty, llvm_ty, llvm_ty };
-                        const fn_type = llvm.functionType(llvm_ty, &param_types, param_types.len, .False);
-                        break :fn_blk self.dg.object.llvm_module.addFunction("fmaq", fn_type);
-                    };
-                }
-            },
+        const Strat = union(enum) {
+            intrinsic,
+            libc: [*:0]const u8,
+        };
+        const strat: Strat = switch (ty.floatBits(target)) {
+            16, 32, 64 => Strat.intrinsic,
+            80 => if (CType.longdouble.sizeInBits(target) == 80) Strat{ .intrinsic = {} } else Strat{ .libc = "__fmax" },
+            // LLVM always lowers the fma builtin for f128 to fmal, which is for `long double`.
+            // On some targets this will be correct; on others it will be incorrect.
+            128 => if (CType.longdouble.sizeInBits(target) == 128) Strat{ .intrinsic = {} } else Strat{ .libc = "fmaq" },
            else => unreachable,
        };
+
+        const llvm_fn = switch (strat) {
+            .intrinsic => self.getIntrinsic("llvm.fma", &.{llvm_ty}),
+            .libc => |fn_name| self.dg.object.llvm_module.getNamedFunction(fn_name) orelse b: {
+                const param_types = [_]*const llvm.Type{ llvm_ty, llvm_ty, llvm_ty };
+                const fn_type = llvm.functionType(llvm_ty, &param_types, param_types.len, .False);
+                break :b self.dg.object.llvm_module.addFunction(fn_name, fn_type);
+            },
+        };
+
        const params = [_]*const llvm.Value{ mulend1, mulend2, addend };
-        return self.builder.buildCall(fn_val, &params, params.len, .C, .Auto, "");
+        return self.builder.buildCall(llvm_fn, &params, params.len, .C, .Auto, "");
    }

    fn airShlWithOverflow(self: *FuncGen, inst: Air.Inst.Index) !?*const llvm.Value {
@ -4061,8 +4062,15 @@ pub const FuncGen = struct {

        const ty_op = self.air.instructions.items(.data)[inst].ty_op;
        const operand = try self.resolveInst(ty_op.operand);
-        const dest_llvm_ty = try self.dg.llvmType(self.air.typeOfIndex(inst));
-
+        const operand_ty = self.air.typeOf(ty_op.operand);
+        const dest_ty = self.air.typeOfIndex(inst);
+        const target = self.dg.module.getTarget();
+        const dest_bits = dest_ty.floatBits(target);
+        const src_bits = operand_ty.floatBits(target);
+        if (!backendSupportsF80(target) and (src_bits == 80 or dest_bits == 80)) {
+            return softF80TruncOrExt(self, operand, src_bits, dest_bits);
+        }
+        const dest_llvm_ty = try self.dg.llvmType(dest_ty);
        return self.builder.buildFPTrunc(operand, dest_llvm_ty, "");
    }

@ -4072,8 +4080,15 @@ pub const FuncGen = struct {

        const ty_op = self.air.instructions.items(.data)[inst].ty_op;
        const operand = try self.resolveInst(ty_op.operand);
+        const operand_ty = self.air.typeOf(ty_op.operand);
+        const dest_ty = self.air.typeOfIndex(inst);
+        const target = self.dg.module.getTarget();
+        const dest_bits = dest_ty.floatBits(target);
+        const src_bits = operand_ty.floatBits(target);
+        if (!backendSupportsF80(target) and (src_bits == 80 or dest_bits == 80)) {
+            return softF80TruncOrExt(self, operand, src_bits, dest_bits);
+        }
        const dest_llvm_ty = try self.dg.llvmType(self.air.typeOfIndex(inst));
-
        return self.builder.buildFPExt(operand, dest_llvm_ty, "");
    }

@ -5105,6 +5120,87 @@ pub const FuncGen = struct {
        return null;
    }

+    fn softF80TruncOrExt(
+        self: *FuncGen,
+        operand: *const llvm.Value,
+        src_bits: u16,
+        dest_bits: u16,
+    ) !?*const llvm.Value {
+        const target = self.dg.module.getTarget();
+
+        var param_llvm_ty: *const llvm.Type = self.context.intType(80);
+        var ret_llvm_ty: *const llvm.Type = param_llvm_ty;
+        var fn_name: [*:0]const u8 = undefined;
+        var arg = operand;
+        var final_cast: ?*const llvm.Type = null;
+
+        assert(src_bits == 80 or dest_bits == 80);
+
+        if (src_bits == 80) switch (dest_bits) {
+            16 => {
+                // See corresponding condition at definition of
+                // __truncxfhf2 in compiler-rt.
+                if (target.cpu.arch.isAARCH64()) {
+                    ret_llvm_ty = self.context.halfType();
+                } else {
+                    ret_llvm_ty = self.context.intType(16);
+                    final_cast = self.context.halfType();
+                }
+                fn_name = "__truncxfhf2";
+            },
+            32 => {
+                ret_llvm_ty = self.context.floatType();
+                fn_name = "__truncxfsf2";
+            },
+            64 => {
+                ret_llvm_ty = self.context.doubleType();
+                fn_name = "__truncxfdf2";
+            },
+            80 => return operand,
+            128 => {
+                ret_llvm_ty = self.context.fp128Type();
+                fn_name = "__extendxftf2";
+            },
+            else => unreachable,
+        } else switch (src_bits) {
+            16 => {
+                // See corresponding condition at definition of
+                // __extendhfxf2 in compiler-rt.
+                param_llvm_ty = if (target.cpu.arch.isAARCH64())
+                    self.context.halfType()
+                else
+                    self.context.intType(16);
+                arg = self.builder.buildBitCast(arg, param_llvm_ty, "");
+                fn_name = "__extendhfxf2";
+            },
+            32 => {
+                param_llvm_ty = self.context.floatType();
+                fn_name = "__extendsfxf2";
+            },
+            64 => {
+                param_llvm_ty = self.context.doubleType();
+                fn_name = "__extenddfxf2";
+            },
+            80 => return operand,
+            128 => {
+                param_llvm_ty = self.context.fp128Type();
+                fn_name = "__trunctfxf2";
+            },
+            else => unreachable,
+        }
+
+        const llvm_fn = self.dg.object.llvm_module.getNamedFunction(fn_name) orelse f: {
+            const param_types = [_]*const llvm.Type{param_llvm_ty};
+            const fn_type = llvm.functionType(ret_llvm_ty, &param_types, param_types.len, .False);
+            break :f self.dg.object.llvm_module.addFunction(fn_name, fn_type);
+        };
+
+        var args: [1]*const llvm.Value = .{arg};
+        const result = self.builder.buildCall(llvm_fn, &args, args.len, .C, .Auto, "");
+        const final_cast_llvm_ty = final_cast orelse return result;
+        return self.builder.buildBitCast(result, final_cast_llvm_ty, "");
+    }
+
    fn getErrorNameTable(self: *FuncGen) !*const llvm.Value {
        if (self.dg.object.error_name_table) |table| {
            return table;
--- a/src/print_air.zig
+++ b/src/print_air.zig
@ -360,14 +360,14 @@ const Writer = struct {
    }

    fn writeMulAdd(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
-        const ty_pl = w.air.instructions.items(.data)[inst].ty_pl;
-        const extra = w.air.extraData(Air.MulAdd, ty_pl.payload).data;
+        const pl_op = w.air.instructions.items(.data)[inst].pl_op;
+        const extra = w.air.extraData(Air.Bin, pl_op.payload).data;

-        try w.writeOperand(s, inst, 0, extra.mulend1);
+        try w.writeOperand(s, inst, 0, extra.lhs);
        try s.writeAll(", ");
-        try w.writeOperand(s, inst, 1, extra.mulend2);
+        try w.writeOperand(s, inst, 1, extra.rhs);
        try s.writeAll(", ");
-        try w.writeOperand(s, inst, 2, extra.addend);
+        try w.writeOperand(s, inst, 2, pl_op.operand);
    }

    fn writeFence(w: *Writer, s: anytype, inst: Air.Inst.Index) @TypeOf(s).Error!void {
--- a/src/type.zig
+++ b/src/type.zig
@ -5436,33 +5436,36 @@ pub const CType = enum {
        switch (target.os.tag) {
            .freestanding, .other => switch (target.cpu.arch) {
                .msp430 => switch (self) {
-                    .short,
-                    .ushort,
-                    .int,
-                    .uint,
-                    => return 16,
-                    .long,
-                    .ulong,
-                    => return 32,
-                    .longlong,
-                    .ulonglong,
-                    => return 64,
-                    .longdouble => @panic("TODO figure out what kind of float `long double` is on this target"),
+                    .short, .ushort, .int, .uint => return 16,
+                    .long, .ulong => return 32,
+                    .longlong, .ulonglong, .longdouble => return 64,
                },
                else => switch (self) {
-                    .short,
-                    .ushort,
-                    => return 16,
-                    .int,
-                    .uint,
-                    => return 32,
-                    .long,
-                    .ulong,
-                    => return target.cpu.arch.ptrBitWidth(),
-                    .longlong,
-                    .ulonglong,
-                    => return 64,
-                    .longdouble => @panic("TODO figure out what kind of float `long double` is on this target"),
+                    .short, .ushort => return 16,
+                    .int, .uint => return 32,
+                    .long, .ulong => return target.cpu.arch.ptrBitWidth(),
+                    .longlong, .ulonglong => return 64,
+                    .longdouble => switch (target.cpu.arch) {
+                        .i386, .x86_64 => return 80,
+
+                        .riscv64,
+                        .aarch64,
+                        .aarch64_be,
+                        .aarch64_32,
+                        .s390x,
+                        .mips64,
+                        .mips64el,
+                        .sparc,
+                        .sparcv9,
+                        .sparcel,
+                        .powerpc,
+                        .powerpcle,
+                        .powerpc64,
+                        .powerpc64le,
+                        => return 128,
+
+                        else => return 64,
+                    },
                },
            },

@ -5477,19 +5480,13 @@ pub const CType = enum {
            .plan9,
            .solaris,
            => switch (self) {
-                .short,
-                .ushort,
-                => return 16,
-                .int,
-                .uint,
-                => return 32,
-                .long,
-                .ulong,
-                => return target.cpu.arch.ptrBitWidth(),
-                .longlong,
-                .ulonglong,
-                => return 64,
+                .short, .ushort => return 16,
+                .int, .uint => return 32,
+                .long, .ulong => return target.cpu.arch.ptrBitWidth(),
+                .longlong, .ulonglong => return 64,
                .longdouble => switch (target.cpu.arch) {
+                    .i386, .x86_64 => return 80,
+
                    .riscv64,
                    .aarch64,
                    .aarch64_be,
@ -5497,40 +5494,33 @@ pub const CType = enum {
                    .s390x,
                    .mips64,
                    .mips64el,
+                    .sparc,
+                    .sparcv9,
+                    .sparcel,
+                    .powerpc,
+                    .powerpcle,
+                    .powerpc64,
+                    .powerpc64le,
                    => return 128,

-                    else => return 80,
+                    else => return 64,
                },
            },

            .windows, .uefi => switch (self) {
-                .short,
-                .ushort,
-                => return 16,
-                .int,
-                .uint,
-                .long,
-                .ulong,
-                => return 32,
-                .longlong,
-                .ulonglong,
-                => return 64,
-                .longdouble => @panic("TODO figure out what kind of float `long double` is on this target"),
+                .short, .ushort => return 16,
+                .int, .uint, .long, .ulong => return 32,
+                .longlong, .ulonglong, .longdouble => return 64,
            },

-            .ios => switch (self) {
-                .short,
-                .ushort,
-                => return 16,
-                .int,
-                .uint,
-                => return 32,
-                .long,
-                .ulong,
-                .longlong,
-                .ulonglong,
-                => return 64,
-                .longdouble => @panic("TODO figure out what kind of float `long double` is on this target"),
+            .ios, .tvos, .watchos => switch (self) {
+                .short, .ushort => return 16,
+                .int, .uint => return 32,
+                .long, .ulong, .longlong, .ulonglong => return 64,
+                .longdouble => switch (target.cpu.arch) {
+                    .i386, .x86_64 => return 80,
+                    else => return 64,
+                },
            },

            .ananas,
@ -5549,8 +5539,6 @@ pub const CType = enum {
            .amdhsa,
            .ps4,
            .elfiamcu,
-            .tvos,
-            .watchos,
            .mesa3d,
            .contiki,
            .amdpal,
--- a/src/value.zig
+++ b/src/value.zig
@ -4020,37 +4020,44 @@ pub const Value = extern union {
        }
    }

-    pub fn mulAdd(float_type: Type, mulend1: Value, mulend2: Value, addend: Value, arena: Allocator, target: Target) Allocator.Error!Value {
+    pub fn mulAdd(
+        float_type: Type,
+        mulend1: Value,
+        mulend2: Value,
+        addend: Value,
+        arena: Allocator,
+        target: Target,
+    ) Allocator.Error!Value {
        switch (float_type.floatBits(target)) {
            16 => {
-                if (true) {
-                    // TODO: missing f16 implementation of FMA in `std.math.fma` or compiler-rt
-                    @panic("TODO implement mulAdd for f16");
-                }
+                const m1 = mulend1.toFloat(f16);
+                const m2 = mulend2.toFloat(f16);
+                const a = addend.toFloat(f16);
+                return Value.Tag.float_16.create(arena, @mulAdd(f16, m1, m2, a));
            },
            32 => {
                const m1 = mulend1.toFloat(f32);
                const m2 = mulend2.toFloat(f32);
                const a = addend.toFloat(f32);
-                return Value.Tag.float_32.create(arena, std.math.fma(f32, m1, m2, a));
+                return Value.Tag.float_32.create(arena, @mulAdd(f32, m1, m2, a));
            },
            64 => {
                const m1 = mulend1.toFloat(f64);
                const m2 = mulend2.toFloat(f64);
                const a = addend.toFloat(f64);
-                return Value.Tag.float_64.create(arena, std.math.fma(f64, m1, m2, a));
+                return Value.Tag.float_64.create(arena, @mulAdd(f64, m1, m2, a));
            },
            80 => {
-                if (true) {
-                    // TODO: missing f80 implementation of FMA in `std.math.fma` or compiler-rt
-                    @panic("TODO implement mulAdd for f80");
-                }
+                const m1 = mulend1.toFloat(f80);
+                const m2 = mulend2.toFloat(f80);
+                const a = addend.toFloat(f80);
+                return Value.Tag.float_80.create(arena, @mulAdd(f80, m1, m2, a));
            },
            128 => {
                const m1 = mulend1.toFloat(f128);
                const m2 = mulend2.toFloat(f128);
                const a = addend.toFloat(f128);
-                return Value.Tag.float_128.create(arena, std.math.fma(f128, m1, m2, a));
+                return Value.Tag.float_128.create(arena, @mulAdd(f128, m1, m2, a));
            },
            else => unreachable,
        }
--- a/test/behavior/muladd.zig
+++ b/test/behavior/muladd.zig
@ -2,8 +2,8 @@ const builtin = @import("builtin");
 const expect = @import("std").testing.expect;

 test "@mulAdd" {
-    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
@ -13,22 +13,22 @@ test "@mulAdd" {
 }

 /// Checks `@mulAdd` for f16, f32 and f64 using 5.5 * 2.5 + 6.25, expecting exactly 20.
 /// The added (`+`) lines declare the operands with `var` instead of `const` and drop the
 /// stage1-only guard around the f16 case; per the commit message, the `const` form left
 /// `@mulAdd` tested only at compile time.
 fn testMulAdd() !void {
-    if (builtin.zig_backend == .stage1) {
-        const a: f16 = 5.5;
-        const b: f16 = 2.5;
-        const c: f16 = 6.25;
+    {
+        var a: f16 = 5.5;
+        var b: f16 = 2.5;
+        var c: f16 = 6.25;
        try expect(@mulAdd(f16, a, b, c) == 20);
    }
    {
-        const a: f32 = 5.5;
-        const b: f32 = 2.5;
-        const c: f32 = 6.25;
+        var a: f32 = 5.5;
+        var b: f32 = 2.5;
+        var c: f32 = 6.25;
        try expect(@mulAdd(f32, a, b, c) == 20);
    }
    {
-        const a: f64 = 5.5;
-        const b: f64 = 2.5;
-        const c: f64 = 6.25;
+        var a: f64 = 5.5;
+        var b: f64 = 2.5;
+        var c: f64 = 6.25;
        try expect(@mulAdd(f64, a, b, c) == 20);
    }
 }
@ -39,9 +39,7 @@ test "@mulAdd f80" {
        return error.SkipZigTest;
    }

-    // TODO: missing f80 implementation of FMA in `std.math.fma` or compiler-rt
-    // comptime try testMulAdd80();
-
+    comptime try testMulAdd80();
    try testMulAdd80();
 }

@ -53,11 +51,12 @@ fn testMulAdd80() !void {
 }

 test "@mulAdd f128" {
-    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
+    if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
+
    if (builtin.os.tag == .macos and builtin.cpu.arch == .aarch64) {
        // https://github.com/ziglang/zig/issues/9900
        return error.SkipZigTest;
@ -68,8 +67,8 @@ test "@mulAdd f128" {
 }

 /// Checks `@mulAdd(f128, ...)` using 5.5 * 2.5 + 6.25, expecting exactly 20.
 /// The added (`+`) lines switch the operands from `const` to `var`.
 fn testMulAdd128() !void {
    // NOTE(review): `a` is declared as f16 in both the removed and the added lines, while
    // `b`, `c` and the `@mulAdd` type are f128. Presumably this relies on the operand being
    // coerced from f16 to f128 — confirm whether that is intentional or a leftover typo.
-    const a: f16 = 5.5;
-    const b: f128 = 2.5;
-    const c: f128 = 6.25;
+    var a: f16 = 5.5;
+    var b: f128 = 2.5;
+    var c: f128 = 6.25;
    try expect(@mulAdd(f128, a, b, c) == 20);
 }