diff --git a/src/Air.zig b/src/Air.zig
index 0968d95180..2dae8454cf 100644
--- a/src/Air.zig
+++ b/src/Air.zig
@@ -815,6 +815,8 @@ pub const VectorCmp = struct {
 /// 1. `Inst.Ref` for every inputs_len
 /// 2. for every outputs_len
 ///    - constraint: memory at this position is reinterpreted as a null
+///      terminated string.
+///    - name: memory at this position is reinterpreted as a null
 ///      terminated string. pad to the next u32 after the null byte.
 /// 3. for every inputs_len
 ///    - constraint: memory at this position is reinterpreted as a null
diff --git a/src/Sema.zig b/src/Sema.zig
index 87cb7cc9d2..9750111662 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -10535,7 +10535,11 @@ fn zirAsm(
     var output_type_bits = extra.data.output_type_bits;
     var needed_capacity: usize = @typeInfo(Air.Asm).Struct.fields.len + outputs_len + inputs_len;
 
-    const Output = struct { constraint: []const u8, ty: Type };
+    const Output = struct {
+        constraint: []const u8,
+        name: []const u8,
+        ty: Type,
+    };
     const output: ?Output = if (outputs_len == 0) null else blk: {
         const output = sema.code.extraData(Zir.Inst.Asm.Output, extra_i);
         extra_i = output.end;
@@ -10548,10 +10552,12 @@ fn zirAsm(
         }
 
         const constraint = sema.code.nullTerminatedString(output.data.constraint);
-        needed_capacity += constraint.len / 4 + 1;
+        const name = sema.code.nullTerminatedString(output.data.name);
+        needed_capacity += (constraint.len + name.len + (2 + 3)) / 4;
 
         break :blk Output{
             .constraint = constraint,
+            .name = name,
             .ty = try sema.resolveType(block, ret_ty_src, output.data.operand),
         };
     };
@@ -10573,7 +10579,7 @@ fn zirAsm(
 
         const constraint = sema.code.nullTerminatedString(input.data.constraint);
         const name = sema.code.nullTerminatedString(input.data.name);
-        needed_capacity += (constraint.len + name.len + 1) / 4 + 1;
+        needed_capacity += (constraint.len + name.len + (2 + 3)) / 4;
         inputs[arg_i] = .{ .c = constraint, .n = name };
     }
 
@@ -10611,7 +10617,9 @@ fn zirAsm(
         const buffer = mem.sliceAsBytes(sema.air_extra.unusedCapacitySlice());
         mem.copy(u8, buffer, o.constraint);
         buffer[o.constraint.len] = 0;
-        sema.air_extra.items.len += o.constraint.len / 4 + 1;
+        mem.copy(u8, buffer[o.constraint.len + 1 ..], o.name);
+        buffer[o.constraint.len + 1 + o.name.len] = 0;
+        sema.air_extra.items.len += (o.constraint.len + o.name.len + (2 + 3)) / 4;
     }
     for (inputs) |input| {
         const buffer = mem.sliceAsBytes(sema.air_extra.unusedCapacitySlice());
@@ -10619,7 +10627,7 @@ fn zirAsm(
         buffer[input.c.len] = 0;
         mem.copy(u8, buffer[input.c.len + 1 ..], input.n);
         buffer[input.c.len + 1 + input.n.len] = 0;
-        sema.air_extra.items.len += (input.c.len + input.n.len + 1) / 4 + 1;
+        sema.air_extra.items.len += (input.c.len + input.n.len + (2 + 3)) / 4;
     }
     for (clobbers) |clobber| {
         const buffer = mem.sliceAsBytes(sema.air_extra.unusedCapacitySlice());
diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig
index 5ed7b63db3..3b27633f69 100644
--- a/src/arch/aarch64/CodeGen.zig
+++ b/src/arch/aarch64/CodeGen.zig
@@ -3272,10 +3272,12 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             if (output != .none) {
                 return self.fail("TODO implement codegen for non-expr asm", .{});
             }
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             break constraint;
         } else null;
@@ -3283,10 +3285,10 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         for (inputs) |input| {
             const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(input_bytes, 0);
-            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
+            const name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             if (constraint.len < 3 or constraint[0] != '{' or constraint[constraint.len - 1] != '}') {
                 return self.fail("unrecognized asm input constraint: '{s}'", .{constraint});
diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig
index 73f51f6481..87d51b0276 100644
--- a/src/arch/arm/CodeGen.zig
+++ b/src/arch/arm/CodeGen.zig
@@ -4078,10 +4078,12 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             if (output != .none) {
                 return self.fail("TODO implement codegen for non-expr asm", .{});
             }
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             break constraint;
         } else null;
@@ -4089,10 +4091,10 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         for (inputs) |input| {
             const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(input_bytes, 0);
-            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
+            const name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             if (constraint.len < 3 or constraint[0] != '{' or constraint[constraint.len - 1] != '}') {
                 return self.fail("unrecognized asm input constraint: '{s}'", .{constraint});
diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig
index 61fddee207..96d30c31ce 100644
--- a/src/arch/riscv64/CodeGen.zig
+++ b/src/arch/riscv64/CodeGen.zig
@@ -2098,10 +2098,12 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             if (output != .none) {
                 return self.fail("TODO implement codegen for non-expr asm", .{});
             }
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             break constraint;
         } else null;
@@ -2109,10 +2111,10 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         for (inputs) |input| {
             const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(input_bytes, 0);
-            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
+            const name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             if (constraint.len < 3 or constraint[0] != '{' or constraint[constraint.len - 1] != '}') {
                 return self.fail("unrecognized asm input constraint: '{s}'", .{constraint});
diff --git a/src/arch/sparcv9/CodeGen.zig b/src/arch/sparcv9/CodeGen.zig
index bcd8cf8eeb..7d93916fc1 100644
--- a/src/arch/sparcv9/CodeGen.zig
+++ b/src/arch/sparcv9/CodeGen.zig
@@ -642,10 +642,12 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             if (output != .none) {
                 return self.fail("TODO implement codegen for non-expr asm", .{});
             }
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             break constraint;
         } else null;
@@ -653,10 +655,10 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         for (inputs) |input| {
             const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(input_bytes, 0);
-            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
+            const name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             if (constraint.len < 3 or constraint[0] != '{' or constraint[constraint.len - 1] != '}') {
                 return self.fail("unrecognized asm input constraint: '{s}'", .{constraint});
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 0103f5382f..a4f9c862d9 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -4739,10 +4739,12 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
             if (output != .none) {
                 return self.fail("TODO implement codegen for non-expr asm", .{});
             }
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             break constraint;
         } else null;
@@ -4750,10 +4752,10 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void {
         for (inputs) |input| {
             const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(input_bytes, 0);
-            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
+            const name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             if (constraint.len < 3 or constraint[0] != '{' or constraint[constraint.len - 1] != '}') {
                 return self.fail("unrecognized asm input constraint: '{s}'", .{constraint});
diff --git a/src/codegen/c.zig b/src/codegen/c.zig
index 2cd93d47fc..44b616c493 100644
--- a/src/codegen/c.zig
+++ b/src/codegen/c.zig
@@ -3018,10 +3018,12 @@ fn airAsm(f: *Function, inst: Air.Inst.Index) !CValue {
         if (output != .none) {
             return f.fail("TODO implement codegen for non-expr asm", .{});
         }
+        const extra_bytes = std.mem.sliceAsBytes(f.air.extra[extra_i..]);
         const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(f.air.extra[extra_i..]), 0);
+        const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
         // This equation accounts for the fact that even if we have exactly 4 bytes
         // for the string, we still use the next u32 for the null terminator.
-        extra_i += constraint.len / 4 + 1;
+        extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
         break constraint;
     } else null;
@@ -3031,10 +3033,12 @@ fn airAsm(f: *Function, inst: Air.Inst.Index) !CValue {
 
     const inputs_extra_begin = extra_i;
     for (inputs) |input, i| {
-        const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(f.air.extra[extra_i..]), 0);
+        const input_bytes = std.mem.sliceAsBytes(f.air.extra[extra_i..]);
+        const constraint = std.mem.sliceTo(input_bytes, 0);
+        const name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
         // This equation accounts for the fact that even if we have exactly 4 bytes
         // for the string, we still use the next u32 for the null terminator.
-        extra_i += constraint.len / 4 + 1;
+        extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
         if (constraint[0] == '{' and constraint[constraint.len - 1] == '}') {
             const reg = constraint[1 .. constraint.len - 1];
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index c9ea5bebac..63e7eece99 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -4638,14 +4638,19 @@ pub const FuncGen = struct {
         var llvm_param_i: usize = 0;
         var total_i: usize = 0;
 
+        var name_map: std.StringArrayHashMapUnmanaged(void) = .{};
+        try name_map.ensureUnusedCapacity(arena, outputs.len + inputs.len);
+
         for (outputs) |output| {
             if (output != .none) {
                 return self.todo("implement inline asm with non-returned output", .{});
             }
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
             const constraint = std.mem.sliceTo(std.mem.sliceAsBytes(self.air.extra[extra_i..]), 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             try llvm_constraints.ensureUnusedCapacity(self.gpa, constraint.len + 1);
             if (total_i != 0) {
@@ -4654,17 +4659,17 @@ pub const FuncGen = struct {
             llvm_constraints.appendAssumeCapacity('=');
             llvm_constraints.appendSliceAssumeCapacity(constraint[1..]);
 
+            name_map.putAssumeCapacityNoClobber(name, {});
             total_i += 1;
         }
 
-        const input_start_extra_i = extra_i;
         for (inputs) |input| {
-            const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
-            const constraint = std.mem.sliceTo(input_bytes, 0);
-            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
+            const extra_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
+            const constraint = std.mem.sliceTo(extra_bytes, 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
             // for the string, we still use the next u32 for the null terminator.
-            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             const arg_llvm_value = try self.resolveInst(input);
 
@@ -4677,6 +4682,7 @@ pub const FuncGen = struct {
             }
             llvm_constraints.appendSliceAssumeCapacity(constraint);
 
+            name_map.putAssumeCapacityNoClobber(name, {});
             llvm_param_i += 1;
             total_i += 1;
         }
@@ -4739,20 +4745,11 @@ pub const FuncGen = struct {
                         const name = asm_source[name_start..i];
                         state = .start;
 
-                        extra_i = input_start_extra_i;
-                        for (inputs) |_, input_i| {
-                            const input_bytes = std.mem.sliceAsBytes(self.air.extra[extra_i..]);
-                            const constraint = std.mem.sliceTo(input_bytes, 0);
-                            const input_name = std.mem.sliceTo(input_bytes[constraint.len + 1 ..], 0);
-                            extra_i += (constraint.len + input_name.len + 1) / 4 + 1;
-
-                            if (std.mem.eql(u8, name, input_name)) {
-                                try rendered_template.writer().print("{d}", .{input_i});
-                                break;
-                            }
-                        } else {
-                            return self.todo("TODO validate asm in Sema", .{});
-                        }
+                        const index = name_map.getIndex(name) orelse {
+                            // we should validate the assembly in Sema; by now it is too late
+                            return self.todo("unknown input or output name: '{s}'", .{name});
+                        };
+                        try rendered_template.writer().print("{d}", .{index});
                     },
                     else => {},
                 },
diff --git a/src/print_air.zig b/src/print_air.zig
index 6e336e138b..c01d96ed7f 100644
--- a/src/print_air.zig
+++ b/src/print_air.zig
@@ -542,15 +542,19 @@ const Writer = struct {
         extra_i += inputs.len;
 
         for (outputs) |output| {
-            const constraint = w.air.nullTerminatedString(extra_i);
+            const extra_bytes = std.mem.sliceAsBytes(w.air.extra[extra_i..]);
+            const constraint = std.mem.sliceTo(extra_bytes, 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
+
             // This equation accounts for the fact that even if we have exactly 4 bytes
-            // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            // for the two strings themselves, we still need an extra u32 to hold
+            // their two null terminators.
+            extra_i += (constraint.len + name.len + (2 + 3)) / 4;
 
             if (output == .none) {
-                try s.print(", -> {s}", .{constraint});
+                try s.print(", [{s}] -> {s}", .{ name, constraint });
             } else {
-                try s.print(", out {s} = (", .{constraint});
+                try s.print(", [{s}] out {s} = (", .{ name, constraint });
                 try w.writeOperand(s, inst, op_index, output);
                 op_index += 1;
                 try s.writeByte(')');
@@ -558,12 +562,15 @@ const Writer = struct {
         }
 
         for (inputs) |input| {
-            const constraint = w.air.nullTerminatedString(extra_i);
+            const extra_bytes = std.mem.sliceAsBytes(w.air.extra[extra_i..]);
+            const constraint = std.mem.sliceTo(extra_bytes, 0);
+            const name = std.mem.sliceTo(extra_bytes[constraint.len + 1 ..], 0);
             // This equation accounts for the fact that even if we have exactly 4 bytes
-            // for the string, we still use the next u32 for the null terminator.
-            extra_i += constraint.len / 4 + 1;
+            // for the two strings themselves, we still need an extra u32 to hold
+            // their two null terminators.
+            extra_i += (constraint.len + name.len + 1) / 4 + 1;
 
-            try s.print(", in {s} = (", .{constraint});
+            try s.print(", [{s}] in {s} = (", .{ name, constraint });
             try w.writeOperand(s, inst, op_index, input);
             op_index += 1;
             try s.writeByte(')');
@@ -572,7 +579,8 @@ const Writer = struct {
         {
             var clobber_i: u32 = 0;
             while (clobber_i < clobbers_len) : (clobber_i += 1) {
-                const clobber = w.air.nullTerminatedString(extra_i);
+                const extra_bytes = std.mem.sliceAsBytes(w.air.extra[extra_i..]);
+                const clobber = std.mem.sliceTo(extra_bytes, 0);
                 // This equation accounts for the fact that even if we have exactly 4 bytes
                 // for the string, we still use the next u32 for the null terminator.
                 extra_i += clobber.len / 4 + 1;