From 259f784241fb44e0a1b570daaf31ba2b9f164106 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Tue, 3 May 2022 23:44:32 -0700
Subject: [PATCH 1/9] stage2: improve `@sizeOf` and `@alignOf` integers

Prior to this commit, the logic for ABI size and ABI alignment of
integers was naive and incorrect. This resulted in wasted memory, as
well as undefined behavior in the LLVM backend when we memset an
incorrect number of bytes to 0xaa because we disagreed with LLVM about
the ABI size of integers.

This commit introduces a "max int align" value which differs per
Target. This value is used to derive the ABI size and alignment of all
integers.

This commit makes an interesting change from stage1, which treats
128-bit integers as 16-byte aligned on x86_64-linux. stage1 is
incorrect. The maximum integer alignment on this system is only 8
bytes.

This change breaks the behavior test called "128-bit cmpxchg" because
on that target, 128-bit cmpxchg does require a 16-byte aligned pointer
to a 128-bit integer. However, this alignment property does not belong
on *all* 128-bit integers - only on the pointer type in the `@cmpxchg`
builtin function prototype. The user can then use an alignment override
annotation on a 128-bit integer variable or struct field to obtain such
a pointer.
---
 lib/std/target.zig        |  77 ++++++++++++++++++++++
 src/codegen/llvm.zig      |   2 +-
 src/type.zig              |  38 ++++++-----
 test/behavior/align.zig   | 130 ++++++++++++++++++++++++++++++--------
 test/behavior/bitcast.zig |  15 +----
 test/behavior/struct.zig  |   9 +--
 6 files changed, 211 insertions(+), 60 deletions(-)

diff --git a/lib/std/target.zig b/lib/std/target.zig
index c23a42c963..777b0f0ec0 100644
--- a/lib/std/target.zig
+++ b/lib/std/target.zig
@@ -1773,6 +1773,83 @@ pub const Target = struct {
             else => false,
         };
     }
+
+    pub inline fn maxIntAlignment(target: Target) u16 {
+        return switch (target.cpu.arch) {
+            .avr => 1,
+            .msp430 => 2,
+            .xcore => 4,
+
+            .arm,
+            .armeb,
+            .thumb,
+            .thumbeb,
+            .x86_64,
+            .hexagon,
+            .mips,
+            .mipsel,
+            .mips64,
+            .mips64el,
+            .powerpc,
+            .powerpcle,
+            .powerpc64,
+            .powerpc64le,
+            .r600,
+            .amdgcn,
+            .riscv32,
+            .riscv64,
+            .sparc,
+            .sparcv9,
+            .sparcel,
+            .s390x,
+            .lanai,
+            .wasm32,
+            .wasm64,
+            => 8,
+
+            .i386 => return switch (target.os.tag) {
+                .windows => 8,
+                else => 4,
+            },
+            .aarch64,
+            .aarch64_be,
+            .aarch64_32,
+            .bpfel,
+            .bpfeb,
+            .nvptx,
+            .nvptx64,
+            => 16,
+
+            // Below this comment are unverified and I have chosen a number
+            // based on ptrBitWidth.
+ + .spu_2 => 2, + + .csky, + .arc, + .m68k, + .tce, + .tcele, + .le32, + .amdil, + .hsail, + .spir, + .kalimba, + .renderscript32, + .spirv32, + .shave, + => 4, + + .le64, + .amdil64, + .hsail64, + .spir64, + .renderscript64, + .ve, + .spirv64, + => 8, + }; + } }; test { diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 6c74fc8223..068067d765 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -7583,7 +7583,7 @@ pub const FuncGen = struct { const size_bytes = elem_ty.abiSize(target); _ = self.builder.buildMemCpy( self.builder.buildBitCast(ptr, llvm_ptr_u8, ""), - ptr_ty.ptrAlignment(target), + ptr_alignment, self.builder.buildBitCast(elem, llvm_ptr_u8, ""), elem_ty.abiAlignment(target), self.context.intType(Type.usize.intInfo(target).bits).constInt(size_bytes, .False), diff --git a/src/type.zig b/src/type.zig index c96c512d9a..44432b95f6 100644 --- a/src/type.zig +++ b/src/type.zig @@ -2788,11 +2788,6 @@ pub const Type = extern union { return AbiAlignmentAdvanced{ .scalar = target_util.defaultFunctionAlignment(target) }; }, - .i16, .u16 => return AbiAlignmentAdvanced{ .scalar = 2 }, - .i32, .u32 => return AbiAlignmentAdvanced{ .scalar = 4 }, - .i64, .u64 => return AbiAlignmentAdvanced{ .scalar = 8 }, - .u128, .i128 => return AbiAlignmentAdvanced{ .scalar = 16 }, - .isize, .usize, .single_const_pointer_to_comptime_int, @@ -2865,14 +2860,15 @@ pub const Type = extern union { // ABI alignment of vectors? .vector => return AbiAlignmentAdvanced{ .scalar = 16 }, + .i16, .u16 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(16, target) }, + .i32, .u32 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(32, target) }, + .i64, .u64 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(64, target) }, + .u128, .i128 => return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(128, target) }, + .int_signed, .int_unsigned => { const bits: u16 = ty.cast(Payload.Bits).?.data; if (bits == 0) return AbiAlignmentAdvanced{ .scalar = 0 }; - if (bits <= 8) return AbiAlignmentAdvanced{ .scalar = 1 }; - if (bits <= 16) return AbiAlignmentAdvanced{ .scalar = 2 }; - if (bits <= 32) return AbiAlignmentAdvanced{ .scalar = 4 }; - if (bits <= 64) return AbiAlignmentAdvanced{ .scalar = 8 }; - return AbiAlignmentAdvanced{ .scalar = 16 }; + return AbiAlignmentAdvanced{ .scalar = intAbiAlignment(bits, target) }; }, .optional => { @@ -3113,10 +3109,6 @@ pub const Type = extern union { assert(elem_size >= payload.elem_type.abiAlignment(target)); return (payload.len + 1) * elem_size; }, - .i16, .u16 => return 2, - .i32, .u32 => return 4, - .i64, .u64 => return 8, - .u128, .i128 => return 16, .isize, .usize, @@ -3189,10 +3181,14 @@ pub const Type = extern union { .error_set_merged, => return 2, // TODO revisit this when we have the concept of the error tag type + .i16, .u16 => return intAbiSize(16, target), + .i32, .u32 => return intAbiSize(32, target), + .i64, .u64 => return intAbiSize(64, target), + .u128, .i128 => return intAbiSize(128, target), .int_signed, .int_unsigned => { const bits: u16 = self.cast(Payload.Bits).?.data; if (bits == 0) return 0; - return std.math.ceilPowerOfTwoPromote(u16, (bits + 7) / 8); + return intAbiSize(bits, target); }, .optional => { @@ -3234,6 +3230,18 @@ pub const Type = extern union { }; } + fn intAbiSize(bits: u16, target: Target) u64 { + const alignment = intAbiAlignment(bits, target); + return std.mem.alignForwardGeneric(u64, (bits + 7) / 8, alignment); + } + + fn intAbiAlignment(bits: u16, target: Target) u32 { + return @minimum( + 
std.math.ceilPowerOfTwoPromote(u16, (bits + 7) / 8), + target.maxIntAlignment(), + ); + } + /// Asserts the type has the bit size already resolved. pub fn bitSize(ty: Type, target: Target) u64 { return switch (ty.tag()) { diff --git a/test/behavior/align.zig b/test/behavior/align.zig index 3a35d1fcca..9d7ca9958a 100644 --- a/test/behavior/align.zig +++ b/test/behavior/align.zig @@ -47,41 +47,121 @@ fn expects4(x: *align(4) u32) void { x.* += 1; } -test "alignment of structs" { +test "alignment of struct with pointer has same alignment as usize" { try expect(@alignOf(struct { a: i32, b: *i32, }) == @alignOf(usize)); } -test "alignment of >= 128-bit integer type" { - try expect(@alignOf(u128) == 16); - try expect(@alignOf(u129) == 16); -} - -test "alignment of struct with 128-bit field" { - try expect(@alignOf(struct { +test "alignment and size of structs with 128-bit fields" { + const A = struct { x: u128, - }) == 16); - - comptime { - try expect(@alignOf(struct { - x: u128, - }) == 16); - } -} - -test "size of extern struct with 128-bit field" { - try expect(@sizeOf(extern struct { + }; + const B = extern struct { x: u128, y: u8, - }) == 32); + }; + const expected = switch (builtin.cpu.arch) { + .arm, + .armeb, + .thumb, + .thumbeb, + .x86_64, + .hexagon, + .mips, + .mipsel, + .mips64, + .mips64el, + .powerpc, + .powerpcle, + .powerpc64, + .powerpc64le, + .r600, + .amdgcn, + .riscv32, + .riscv64, + .sparc, + .sparcv9, + .sparcel, + .s390x, + .lanai, + .wasm32, + .wasm64, + => .{ + .a_align = 8, + .a_size = 16, + .b_align = 8, + .b_size = 24, + + .u128_align = 8, + .u128_size = 16, + .u129_align = 8, + .u129_size = 24, + }, + + .i386 => switch (builtin.os.tag) { + .windows => .{ + .a_align = 8, + .a_size = 16, + + .b_align = 8, + .b_size = 24, + + .u128_align = 8, + .u128_size = 16, + .u129_align = 8, + .u129_size = 24, + }, + else => .{ + .a_align = 4, + .a_size = 16, + + .b_align = 4, + .b_size = 20, + + .u128_align = 4, + .u128_size = 16, + .u129_align = 4, + .u129_size = 20, + }, + }, + + .aarch64, + .aarch64_be, + .aarch64_32, + .bpfel, + .bpfeb, + .nvptx, + .nvptx64, + => .{ + .a_align = 16, + .a_size = 16, + + .b_align = 16, + .b_size = 32, + + .u128_align = 16, + .u128_size = 16, + .u129_align = 16, + .u129_size = 32, + }, + + else => return error.SkipZigTest, + }; comptime { - try expect(@sizeOf(extern struct { - x: u128, - y: u8, - }) == 32); + std.debug.assert(@alignOf(A) == expected.a_align); + std.debug.assert(@sizeOf(A) == expected.a_size); + + std.debug.assert(@alignOf(B) == expected.b_align); + std.debug.assert(@sizeOf(B) == expected.b_size); + + std.debug.assert(@alignOf(u128) == expected.u128_align); + std.debug.assert(@sizeOf(u128) == expected.u128_size); + + std.debug.assert(@alignOf(u129) == expected.u129_align); + std.debug.assert(@sizeOf(u129) == expected.u129_size); } } @@ -328,7 +408,6 @@ test "read 128-bit field from default aligned struct in stack memory" { .nevermind = 1, .badguy = 12, }; - try expect((@ptrToInt(&default_aligned.badguy) % 16) == 0); try expect(12 == default_aligned.badguy); } @@ -345,7 +424,6 @@ test "read 128-bit field from default aligned struct in global memory" { if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; - try expect((@ptrToInt(&default_aligned_global.badguy) % 16) == 0); try expect(12 == default_aligned_global.badguy); } diff --git a/test/behavior/bitcast.zig b/test/behavior/bitcast.zig index 6be3cc15c4..e74a4c44f4 100644 --- 
a/test/behavior/bitcast.zig
+++ b/test/behavior/bitcast.zig
@@ -138,18 +138,9 @@ test "@bitCast packed structs at runtime and comptime" {
         fn doTheTest() !void {
             var full = Full{ .number = 0x1234 };
             var two_halves = @bitCast(Divided, full);
-            switch (native_endian) {
-                .Big => {
-                    try expect(two_halves.half1 == 0x12);
-                    try expect(two_halves.quarter3 == 0x3);
-                    try expect(two_halves.quarter4 == 0x4);
-                },
-                .Little => {
-                    try expect(two_halves.half1 == 0x34);
-                    try expect(two_halves.quarter3 == 0x2);
-                    try expect(two_halves.quarter4 == 0x1);
-                },
-            }
+            try expect(two_halves.half1 == 0x34);
+            try expect(two_halves.quarter3 == 0x2);
+            try expect(two_halves.quarter4 == 0x1);
         }
     };
     try S.doTheTest();
diff --git a/test/behavior/struct.zig b/test/behavior/struct.zig
index b05126db62..ed75268f7d 100644
--- a/test/behavior/struct.zig
+++ b/test/behavior/struct.zig
@@ -499,17 +499,14 @@ const Bitfields = packed struct {
     f7: u8,
 };
 
-test "native bit field understands endianness" {
-    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest;
+test "packed struct fields are ordered from LSB to MSB" {
+    if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
 
-    var all: u64 = if (native_endian != .Little)
-        0x1111222233445677
-    else
-        0x7765443322221111;
+    var all: u64 = 0x7765443322221111;
     var bytes: [8]u8 = undefined;
     @memcpy(&bytes, @ptrCast([*]u8, &all), 8);
     var bitfields = @ptrCast(*Bitfields, &bytes).*;

From 5b1c0d922c1061706ae1673333fcfb1d8fdd4602 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Wed, 4 May 2022 14:06:54 -0700
Subject: [PATCH 2/9] stage2: improve semantics of atomic operations

ZIR instructions updated: atomic_load, atomic_rmw, atomic_store,
cmpxchg

These no longer construct a pointer type as the result location. This
solves a TODO that was preventing the pointer from possibly being
volatile, as well as properly handling allowzero and addrspace. It also
allows the pointer to be over-aligned, which may be needed depending on
the target.

As a consequence, the element type needs to be communicated in the ZIR.
This is done by strategically making one of the operands be
ResultLoc.ty instead of ResultLoc.coerced_ty if possible, or otherwise
explicitly adding elem_type into the ZIR encoding, such as in the case
of atomic_load.

The pointer type of atomic operations is now checked in Sema by
coercing it to an expected pointer type that may be over-aligned
according to target requirements.

Together with the previous commit, Zig now has smaller alignment for
large integers, depending on the target, and yet still has type safety
for atomic operations that specifically require higher alignment.
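A rough sketch of what the new semantics permit (a hypothetical test,
not part of this patch; it assumes a compiler with these two commits
applied):

    const std = @import("std");

    test "atomic builtins accept volatile and over-aligned pointers" {
        var flag: bool = false;
        const vp: *volatile bool = &flag;
        // The pointer operand may now be volatile; previously the
        // constructed result-location pointer type could not be.
        @atomicStore(bool, vp, true, .SeqCst);
        try std.testing.expect(@atomicLoad(bool, vp, .SeqCst));

        // The 16-byte alignment required by 128-bit cmpxchg on some
        // targets lives on the pointer type, not on all u128 values,
        // so an alignment annotation on the variable provides it:
        var x: u128 align(16) = 1234;
        _ = @cmpxchgWeak(u128, &x, 99, 5678, .SeqCst, .SeqCst);
    }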
--- src/AstGen.zig | 60 +++----------- src/Sema.zig | 162 ++++++++++++++++++++------------------ src/Zir.zig | 8 +- src/print_zir.zig | 15 +++- src/target.zig | 68 +++++++++++++++- test/behavior/atomics.zig | 2 +- 6 files changed, 183 insertions(+), 132 deletions(-) diff --git a/src/AstGen.zig b/src/AstGen.zig index 1da1cf187e..d4895aa2e4 100644 --- a/src/AstGen.zig +++ b/src/AstGen.zig @@ -7423,42 +7423,22 @@ fn builtinCall( }, .atomic_load => { - const int_type = try typeExpr(gz, scope, params[0]); - // TODO allow this pointer type to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = false, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); - const result = try gz.addPlNode(.atomic_load, node, Zir.Inst.Bin{ + const result = try gz.addPlNode(.atomic_load, node, Zir.Inst.AtomicLoad{ // zig fmt: off - .lhs = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), - .rhs = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[2]), + .elem_type = try typeExpr(gz, scope, params[0]), + .ptr = try expr (gz, scope, .none, params[1]), + .ordering = try expr (gz, scope, .{ .coerced_ty = .atomic_order_type }, params[2]), // zig fmt: on }); return rvalue(gz, rl, result, node); }, .atomic_rmw => { const int_type = try typeExpr(gz, scope, params[0]); - // TODO allow this pointer type to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = true, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); const result = try gz.addPlNode(.atomic_rmw, node, Zir.Inst.AtomicRmw{ // zig fmt: off - .ptr = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), + .ptr = try expr(gz, scope, .none, params[1]), .operation = try expr(gz, scope, .{ .coerced_ty = .atomic_rmw_op_type }, params[2]), - .operand = try expr(gz, scope, .{ .coerced_ty = int_type }, params[3]), + .operand = try expr(gz, scope, .{ .ty = int_type }, params[3]), .ordering = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[4]), // zig fmt: on }); @@ -7466,20 +7446,10 @@ fn builtinCall( }, .atomic_store => { const int_type = try typeExpr(gz, scope, params[0]); - // TODO allow this pointer type to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = true, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); const result = try gz.addPlNode(.atomic_store, node, Zir.Inst.AtomicStore{ // zig fmt: off - .ptr = try expr(gz, scope, .{ .coerced_ty = ptr_type }, params[1]), - .operand = try expr(gz, scope, .{ .coerced_ty = int_type }, params[2]), + .ptr = try expr(gz, scope, .none, params[1]), + .operand = try expr(gz, scope, .{ .ty = int_type }, params[2]), .ordering = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[3]), // zig fmt: on }); @@ -7684,20 +7654,10 @@ fn cmpxchg( tag: Zir.Inst.Tag, ) InnerError!Zir.Inst.Ref { const int_type = try typeExpr(gz, scope, params[0]); - // TODO: allow this to be volatile - const ptr_type = try gz.add(.{ .tag = .ptr_type_simple, .data = .{ - .ptr_type_simple = .{ - .is_allowzero = false, - .is_mutable = true, - .is_volatile = false, - .size = .One, - .elem_type = int_type, - }, - } }); const result = try gz.addPlNode(tag, node, Zir.Inst.Cmpxchg{ // zig fmt: off - .ptr = try expr(gz, scope, .{ .coerced_ty = ptr_type 
}, params[1]), - .expected_value = try expr(gz, scope, .{ .coerced_ty = int_type }, params[2]), + .ptr = try expr(gz, scope, .none, params[1]), + .expected_value = try expr(gz, scope, .{ .ty = int_type }, params[2]), .new_value = try expr(gz, scope, .{ .coerced_ty = int_type }, params[3]), .success_order = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[4]), .failure_order = try expr(gz, scope, .{ .coerced_ty = .atomic_order_type }, params[5]), diff --git a/src/Sema.zig b/src/Sema.zig index 3ad9550175..8125fc0cc1 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -14715,51 +14715,64 @@ fn checkNumericType( } } -fn checkAtomicOperandType( +/// Returns the casted pointer. +fn checkAtomicPtrOperand( sema: *Sema, block: *Block, - ty_src: LazySrcLoc, - ty: Type, -) CompileError!void { - var buffer: Type.Payload.Bits = undefined; + elem_ty: Type, + elem_ty_src: LazySrcLoc, + ptr: Air.Inst.Ref, + ptr_src: LazySrcLoc, + ptr_const: bool, +) CompileError!Air.Inst.Ref { const target = sema.mod.getTarget(); - const max_atomic_bits = target_util.largestAtomicBits(target); - const int_ty = switch (ty.zigTypeTag()) { - .Int => ty, - .Enum => ty.intTagType(&buffer), - .Float => { - const bit_count = ty.floatBits(target); - if (bit_count > max_atomic_bits) { - return sema.fail( - block, - ty_src, - "expected {d}-bit float type or smaller; found {d}-bit float type", - .{ max_atomic_bits, bit_count }, - ); - } - return; - }, - .Bool => return, // Will be treated as `u8`. - else => { - if (ty.isPtrAtRuntime()) return; + var diag: target_util.AtomicPtrAlignmentDiagnostics = .{}; + const alignment = target_util.atomicPtrAlignment(target, elem_ty, &diag) catch |err| switch (err) { + error.FloatTooBig => return sema.fail( + block, + elem_ty_src, + "expected {d}-bit float type or smaller; found {d}-bit float type", + .{ diag.max_bits, diag.bits }, + ), + error.IntTooBig => return sema.fail( + block, + elem_ty_src, + "expected {d}-bit integer type or smaller; found {d}-bit integer type", + .{ diag.max_bits, diag.bits }, + ), + error.BadType => return sema.fail( + block, + elem_ty_src, + "expected bool, integer, float, enum, or pointer type; found {}", + .{elem_ty.fmt(sema.mod)}, + ), + }; - return sema.fail( - block, - ty_src, - "expected bool, integer, float, enum, or pointer type; found {}", - .{ty.fmt(sema.mod)}, - ); + var wanted_ptr_data: Type.Payload.Pointer.Data = .{ + .pointee_type = elem_ty, + .@"align" = alignment, + .@"addrspace" = .generic, + .mutable = !ptr_const, + }; + + const ptr_ty = sema.typeOf(ptr); + const ptr_data = switch (try ptr_ty.zigTypeTagOrPoison()) { + .Pointer => ptr_ty.ptrInfo().data, + else => { + const wanted_ptr_ty = try Type.ptr(sema.arena, sema.mod, wanted_ptr_data); + _ = try sema.coerce(block, wanted_ptr_ty, ptr, ptr_src); + unreachable; }, }; - const bit_count = int_ty.intInfo(target).bits; - if (bit_count > max_atomic_bits) { - return sema.fail( - block, - ty_src, - "expected {d}-bit integer type or smaller; found {d}-bit integer type", - .{ max_atomic_bits, bit_count }, - ); - } + + wanted_ptr_data.@"addrspace" = ptr_data.@"addrspace"; + wanted_ptr_data.@"allowzero" = ptr_data.@"allowzero"; + wanted_ptr_data.@"volatile" = ptr_data.@"volatile"; + + const wanted_ptr_ty = try Type.ptr(sema.arena, sema.mod, wanted_ptr_data); + const casted_ptr = try sema.coerce(block, wanted_ptr_ty, ptr, ptr_src); + + return casted_ptr; } fn checkPtrIsNotComptimeMutable( @@ -15036,10 +15049,8 @@ fn zirCmpxchg( const success_order_src: LazySrcLoc = .{ .node_offset_builtin_call_arg4 
= inst_data.src_node }; const failure_order_src: LazySrcLoc = .{ .node_offset_builtin_call_arg5 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.ptr); - const ptr_ty = sema.typeOf(ptr); - const elem_ty = ptr_ty.elemType(); - try sema.checkAtomicOperandType(block, elem_ty_src, elem_ty); + const expected_value = sema.resolveInst(extra.expected_value); + const elem_ty = sema.typeOf(expected_value); if (elem_ty.zigTypeTag() == .Float) { return sema.fail( block, @@ -15048,7 +15059,8 @@ fn zirCmpxchg( .{elem_ty.fmt(sema.mod)}, ); } - const expected_value = try sema.coerce(block, elem_ty, sema.resolveInst(extra.expected_value), expected_src); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, false); const new_value = try sema.coerce(block, elem_ty, sema.resolveInst(extra.new_value), new_value_src); const success_order = try sema.resolveAtomicOrder(block, success_order_src, extra.success_order); const failure_order = try sema.resolveAtomicOrder(block, failure_order_src, extra.failure_order); @@ -15081,6 +15093,7 @@ fn zirCmpxchg( // to become undef as well return sema.addConstUndef(result_ty); } + const ptr_ty = sema.typeOf(ptr); const stored_val = (try sema.pointerDeref(block, ptr_src, ptr_val, ptr_ty)) orelse break :rs ptr_src; const result_val = if (stored_val.eql(expected_val, elem_ty, sema.mod)) blk: { try sema.storePtr(block, src, ptr, new_value); @@ -15487,17 +15500,16 @@ fn zirSelect(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air. fn zirAtomicLoad(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { const inst_data = sema.code.instructions.items(.data)[inst].pl_node; - const extra = sema.code.extraData(Zir.Inst.Bin, inst_data.payload_index).data; + const extra = sema.code.extraData(Zir.Inst.AtomicLoad, inst_data.payload_index).data; // zig fmt: off const elem_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; const ptr_src : LazySrcLoc = .{ .node_offset_builtin_call_arg1 = inst_data.src_node }; const order_src : LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.lhs); - const ptr_ty = sema.typeOf(ptr); - const elem_ty = ptr_ty.elemType(); - try sema.checkAtomicOperandType(block, elem_ty_src, elem_ty); - const order = try sema.resolveAtomicOrder(block, order_src, extra.rhs); + const elem_ty = try sema.resolveType(block, elem_ty_src, extra.elem_type); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, true); + const order = try sema.resolveAtomicOrder(block, order_src, extra.ordering); switch (order) { .Release, .AcqRel => { @@ -15516,7 +15528,7 @@ fn zirAtomicLoad(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError! 
} if (try sema.resolveDefinedValue(block, ptr_src, ptr)) |ptr_val| { - if (try sema.pointerDeref(block, ptr_src, ptr_val, ptr_ty)) |elem_val| { + if (try sema.pointerDeref(block, ptr_src, ptr_val, sema.typeOf(ptr))) |elem_val| { return sema.addConstant(elem_ty, elem_val); } } @@ -15536,19 +15548,19 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A const extra = sema.code.extraData(Zir.Inst.AtomicRmw, inst_data.payload_index).data; const src = inst_data.src(); // zig fmt: off - const operand_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; + const elem_ty_src : LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; const ptr_src : LazySrcLoc = .{ .node_offset_builtin_call_arg1 = inst_data.src_node }; const op_src : LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node }; const operand_src : LazySrcLoc = .{ .node_offset_builtin_call_arg3 = inst_data.src_node }; const order_src : LazySrcLoc = .{ .node_offset_builtin_call_arg4 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.ptr); - const ptr_ty = sema.typeOf(ptr); - const operand_ty = ptr_ty.elemType(); - try sema.checkAtomicOperandType(block, operand_ty_src, operand_ty); + const operand = sema.resolveInst(extra.operand); + const elem_ty = sema.typeOf(operand); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, false); const op = try sema.resolveAtomicRmwOp(block, op_src, extra.operation); - switch (operand_ty.zigTypeTag()) { + switch (elem_ty.zigTypeTag()) { .Enum => if (op != .Xchg) { return sema.fail(block, op_src, "@atomicRmw with enum only allowed with .Xchg", .{}); }, @@ -15561,7 +15573,6 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A }, else => {}, } - const operand = try sema.coerce(block, operand_ty, sema.resolveInst(extra.operand), operand_src); const order = try sema.resolveAtomicOrder(block, order_src, extra.ordering); if (order == .Unordered) { @@ -15569,8 +15580,8 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A } // special case zero bit types - if (try sema.typeHasOnePossibleValue(block, operand_ty_src, operand_ty)) |val| { - return sema.addConstant(operand_ty, val); + if (try sema.typeHasOnePossibleValue(block, elem_ty_src, elem_ty)) |val| { + return sema.addConstant(elem_ty, val); } const runtime_src = if (try sema.resolveDefinedValue(block, ptr_src, ptr)) |ptr_val| rs: { @@ -15581,22 +15592,23 @@ fn zirAtomicRmw(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!A }; if (ptr_val.isComptimeMutablePtr()) { const target = sema.mod.getTarget(); + const ptr_ty = sema.typeOf(ptr); const stored_val = (try sema.pointerDeref(block, ptr_src, ptr_val, ptr_ty)) orelse break :rs ptr_src; const new_val = switch (op) { // zig fmt: off .Xchg => operand_val, - .Add => try stored_val.numberAddWrap(operand_val, operand_ty, sema.arena, target), - .Sub => try stored_val.numberSubWrap(operand_val, operand_ty, sema.arena, target), - .And => try stored_val.bitwiseAnd (operand_val, operand_ty, sema.arena, target), - .Nand => try stored_val.bitwiseNand (operand_val, operand_ty, sema.arena, target), - .Or => try stored_val.bitwiseOr (operand_val, operand_ty, sema.arena, target), - .Xor => try stored_val.bitwiseXor (operand_val, operand_ty, sema.arena, target), - .Max => stored_val.numberMax (operand_val, target), - .Min => stored_val.numberMin 
(operand_val, target), + .Add => try stored_val.numberAddWrap(operand_val, elem_ty, sema.arena, target), + .Sub => try stored_val.numberSubWrap(operand_val, elem_ty, sema.arena, target), + .And => try stored_val.bitwiseAnd (operand_val, elem_ty, sema.arena, target), + .Nand => try stored_val.bitwiseNand (operand_val, elem_ty, sema.arena, target), + .Or => try stored_val.bitwiseOr (operand_val, elem_ty, sema.arena, target), + .Xor => try stored_val.bitwiseXor (operand_val, elem_ty, sema.arena, target), + .Max => stored_val.numberMax (operand_val, target), + .Min => stored_val.numberMin (operand_val, target), // zig fmt: on }; - try sema.storePtrVal(block, src, ptr_val, new_val, operand_ty); - return sema.addConstant(operand_ty, stored_val); + try sema.storePtrVal(block, src, ptr_val, new_val, elem_ty); + return sema.addConstant(elem_ty, stored_val); } else break :rs ptr_src; } else ptr_src; @@ -15620,15 +15632,15 @@ fn zirAtomicStore(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError const extra = sema.code.extraData(Zir.Inst.AtomicStore, inst_data.payload_index).data; const src = inst_data.src(); // zig fmt: off - const operand_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; + const elem_ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node }; const ptr_src : LazySrcLoc = .{ .node_offset_builtin_call_arg1 = inst_data.src_node }; const operand_src : LazySrcLoc = .{ .node_offset_builtin_call_arg2 = inst_data.src_node }; const order_src : LazySrcLoc = .{ .node_offset_builtin_call_arg3 = inst_data.src_node }; // zig fmt: on - const ptr = sema.resolveInst(extra.ptr); - const operand_ty = sema.typeOf(ptr).elemType(); - try sema.checkAtomicOperandType(block, operand_ty_src, operand_ty); - const operand = try sema.coerce(block, operand_ty, sema.resolveInst(extra.operand), operand_src); + const operand = sema.resolveInst(extra.operand); + const elem_ty = sema.typeOf(operand); + const uncasted_ptr = sema.resolveInst(extra.ptr); + const ptr = try sema.checkAtomicPtrOperand(block, elem_ty, elem_ty_src, uncasted_ptr, ptr_src, false); const order = try sema.resolveAtomicOrder(block, order_src, extra.ordering); const air_tag: Air.Inst.Tag = switch (order) { diff --git a/src/Zir.zig b/src/Zir.zig index 9ef6d33b86..30edd6d0e1 100644 --- a/src/Zir.zig +++ b/src/Zir.zig @@ -903,7 +903,7 @@ pub const Inst = struct { /// Uses the `pl_node` union field with payload `Select`. select, /// Implements the `@atomicLoad` builtin. - /// Uses the `pl_node` union field with payload `Bin`. + /// Uses the `pl_node` union field with payload `AtomicLoad`. atomic_load, /// Implements the `@atomicRmw` builtin. /// Uses the `pl_node` union field with payload `AtomicRmw`. 
@@ -3293,6 +3293,12 @@ pub const Inst = struct { ordering: Ref, }; + pub const AtomicLoad = struct { + elem_type: Ref, + ptr: Ref, + ordering: Ref, + }; + pub const MulAdd = struct { mulend1: Ref, mulend2: Ref, diff --git a/src/print_zir.zig b/src/print_zir.zig index ac90c26cf6..a9d0f4690f 100644 --- a/src/print_zir.zig +++ b/src/print_zir.zig @@ -283,6 +283,7 @@ const Writer = struct { => try self.writeStructInit(stream, inst), .cmpxchg_strong, .cmpxchg_weak => try self.writeCmpxchg(stream, inst), + .atomic_load => try self.writeAtomicLoad(stream, inst), .atomic_store => try self.writeAtomicStore(stream, inst), .atomic_rmw => try self.writeAtomicRmw(stream, inst), .memcpy => try self.writeMemcpy(stream, inst), @@ -351,7 +352,6 @@ const Writer = struct { .offset_of, .splat, .reduce, - .atomic_load, .bitcast, .vector_type, .maximum, @@ -929,6 +929,19 @@ const Writer = struct { try self.writeSrc(stream, inst_data.src()); } + fn writeAtomicLoad(self: *Writer, stream: anytype, inst: Zir.Inst.Index) !void { + const inst_data = self.code.instructions.items(.data)[inst].pl_node; + const extra = self.code.extraData(Zir.Inst.AtomicLoad, inst_data.payload_index).data; + + try self.writeInstRef(stream, extra.elem_type); + try stream.writeAll(", "); + try self.writeInstRef(stream, extra.ptr); + try stream.writeAll(", "); + try self.writeInstRef(stream, extra.ordering); + try stream.writeAll(") "); + try self.writeSrc(stream, inst_data.src()); + } + fn writeAtomicStore(self: *Writer, stream: anytype, inst: Zir.Inst.Index) !void { const inst_data = self.code.instructions.items(.data)[inst].pl_node; const extra = self.code.extraData(Zir.Inst.AtomicStore, inst_data.payload_index).data; diff --git a/src/target.zig b/src/target.zig index 27ed1118db..7818d496eb 100644 --- a/src/target.zig +++ b/src/target.zig @@ -1,5 +1,6 @@ const std = @import("std"); const llvm = @import("codegen/llvm/bindings.zig"); +const Type = @import("type.zig").Type; pub const ArchOsAbi = struct { arch: std.Target.Cpu.Arch, @@ -543,10 +544,28 @@ pub fn needUnwindTables(target: std.Target) bool { return target.os.tag == .windows; } -/// TODO this was ported from stage1 but it does not take into account CPU features, -/// which can affect this value. Audit this! -pub fn largestAtomicBits(target: std.Target) u32 { - return switch (target.cpu.arch) { +pub const AtomicPtrAlignmentError = error{ + FloatTooBig, + IntTooBig, + BadType, +}; + +pub const AtomicPtrAlignmentDiagnostics = struct { + bits: u16 = undefined, + max_bits: u16 = undefined, +}; + +/// If ABI alignment of `ty` is OK for atomic operations, returs 0. +/// Otherwise returns the alignment required on a pointer for the target +/// to perform atomic operations. +pub fn atomicPtrAlignment( + target: std.Target, + ty: Type, + diags: *AtomicPtrAlignmentDiagnostics, +) AtomicPtrAlignmentError!u32 { + // TODO this was ported from stage1 but it does not take into account CPU features, + // which can affect this value. Audit this! 
+ const max_atomic_bits: u16 = switch (target.cpu.arch) { .avr, .msp430, .spu_2, @@ -611,6 +630,47 @@ pub fn largestAtomicBits(target: std.Target) u32 { .x86_64 => 128, }; + + var buffer: Type.Payload.Bits = undefined; + + const int_ty = switch (ty.zigTypeTag()) { + .Int => ty, + .Enum => ty.intTagType(&buffer), + .Float => { + const bit_count = ty.floatBits(target); + if (bit_count > max_atomic_bits) { + diags.* = .{ + .bits = bit_count, + .max_bits = max_atomic_bits, + }; + return error.FloatTooBig; + } + if (target.cpu.arch == .x86_64 and bit_count > 64) { + return 16; + } + return 0; + }, + .Bool => return 0, + else => { + if (ty.isPtrAtRuntime()) return 0; + return error.BadType; + }, + }; + + const bit_count = int_ty.intInfo(target).bits; + if (bit_count > max_atomic_bits) { + diags.* = .{ + .bits = bit_count, + .max_bits = max_atomic_bits, + }; + return error.IntTooBig; + } + + if (target.cpu.arch == .x86_64 and bit_count > 64) { + return 16; + } + + return 0; } pub fn defaultAddressSpace( diff --git a/test/behavior/atomics.zig b/test/behavior/atomics.zig index 71e17d9b4c..62471c5ea0 100644 --- a/test/behavior/atomics.zig +++ b/test/behavior/atomics.zig @@ -127,7 +127,7 @@ test "128-bit cmpxchg" { } fn test_u128_cmpxchg() !void { - var x: u128 = 1234; + var x: u128 align(16) = 1234; if (@cmpxchgWeak(u128, &x, 99, 5678, .SeqCst, .SeqCst)) |x1| { try expect(x1 == 1234); } else { From f21c11a7f7fb84db4289a6f735dea012387606d6 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 4 May 2022 15:22:41 -0700 Subject: [PATCH 3/9] stage2: change x86_64 max int alignment from 8 to 16 For x86_64, LLVMABIAlignmentOfType(i128) reports 8. However I think 16 is a better number for two reasons: 1. Better machine code when loading into SIMD register. 2. The C ABI wants 16 for extern structs. --- lib/std/target.zig | 7 ++++++- src/target.zig | 9 +-------- test/behavior/align.zig | 5 ++++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/std/target.zig b/lib/std/target.zig index 777b0f0ec0..aab9d5accc 100644 --- a/lib/std/target.zig +++ b/lib/std/target.zig @@ -1784,7 +1784,6 @@ pub const Target = struct { .armeb, .thumb, .thumbeb, - .x86_64, .hexagon, .mips, .mipsel, @@ -1811,6 +1810,12 @@ pub const Target = struct { .windows => 8, else => 4, }, + + // For x86_64, LLVMABIAlignmentOfType(i128) reports 8. However I think 16 + // is a better number because of two reasons: + // 1. Better machine code when loading into SIMD register. + // 2. The C ABI wants 16 for extern structs. + .x86_64, .aarch64, .aarch64_be, .aarch64_32, diff --git a/src/target.zig b/src/target.zig index 7818d496eb..9249ed1b60 100644 --- a/src/target.zig +++ b/src/target.zig @@ -555,7 +555,7 @@ pub const AtomicPtrAlignmentDiagnostics = struct { max_bits: u16 = undefined, }; -/// If ABI alignment of `ty` is OK for atomic operations, returs 0. +/// If ABI alignment of `ty` is OK for atomic operations, returns 0. /// Otherwise returns the alignment required on a pointer for the target /// to perform atomic operations. 
pub fn atomicPtrAlignment( @@ -645,9 +645,6 @@ pub fn atomicPtrAlignment( }; return error.FloatTooBig; } - if (target.cpu.arch == .x86_64 and bit_count > 64) { - return 16; - } return 0; }, .Bool => return 0, @@ -666,10 +663,6 @@ pub fn atomicPtrAlignment( return error.IntTooBig; } - if (target.cpu.arch == .x86_64 and bit_count > 64) { - return 16; - } - return 0; } diff --git a/test/behavior/align.zig b/test/behavior/align.zig index 9d7ca9958a..393908d5bd 100644 --- a/test/behavior/align.zig +++ b/test/behavior/align.zig @@ -55,6 +55,9 @@ test "alignment of struct with pointer has same alignment as usize" { } test "alignment and size of structs with 128-bit fields" { + if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO + const A = struct { x: u128, }; @@ -67,7 +70,6 @@ test "alignment and size of structs with 128-bit fields" { .armeb, .thumb, .thumbeb, - .x86_64, .hexagon, .mips, .mipsel, @@ -128,6 +130,7 @@ test "alignment and size of structs with 128-bit fields" { }, }, + .x86_64, .aarch64, .aarch64_be, .aarch64_32, From 2f6a01d0c39542e7d88c9af14e869b820fd156cc Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 4 May 2022 17:31:12 -0700 Subject: [PATCH 4/9] stage1: fix `@sizeOf` for 128-bit integer types --- src/stage1/analyze.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/stage1/analyze.cpp b/src/stage1/analyze.cpp index aef4966ee7..34dff556e0 100644 --- a/src/stage1/analyze.cpp +++ b/src/stage1/analyze.cpp @@ -7686,6 +7686,7 @@ ZigType *make_int_type(CodeGen *g, bool is_signed, uint32_t size_in_bits) { // However for some targets, LLVM incorrectly reports this as 8. // See: https://github.com/ziglang/zig/issues/2987 entry->abi_align = 16; + entry->abi_size = align_forward(entry->abi_size, entry->abi_align); } } From af7e945a7dc00a2a5055d9770b9ecda253d64a8e Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 4 May 2022 18:45:59 -0700 Subject: [PATCH 5/9] stage2: fix `@sizeOf` for structs with comptime fields --- src/type.zig | 2 +- test/behavior/align.zig | 4 ++++ test/behavior/bitcast.zig | 4 ++++ test/behavior/struct.zig | 6 ++++++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/type.zig b/src/type.zig index 44432b95f6..ddeec596e1 100644 --- a/src/type.zig +++ b/src/type.zig @@ -5177,7 +5177,7 @@ pub const Type = extern union { const field = it.struct_obj.fields.values()[it.field]; defer it.field += 1; - if (!field.ty.hasRuntimeBits()) + if (!field.ty.hasRuntimeBits() or field.is_comptime) return FieldOffset{ .field = it.field, .offset = it.offset }; const field_align = field.normalAlignment(it.target); diff --git a/test/behavior/align.zig b/test/behavior/align.zig index 393908d5bd..5a3a76beb2 100644 --- a/test/behavior/align.zig +++ b/test/behavior/align.zig @@ -55,6 +55,10 @@ test "alignment of struct with pointer has same alignment as usize" { } test "alignment and size of structs with 128-bit fields" { + if (builtin.zig_backend == .stage1) { + // stage1 gets the wrong answer for a lot of targets + return error.SkipZigTest; + } if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO diff --git a/test/behavior/bitcast.zig b/test/behavior/bitcast.zig index e74a4c44f4..4922b9473e 100644 --- a/test/behavior/bitcast.zig +++ b/test/behavior/bitcast.zig @@ -120,6 +120,10 @@ test "bitcast generates a temporary value" { } test "@bitCast packed structs at 
runtime and comptime" { + if (builtin.zig_backend == .stage1) { + // stage1 gets the wrong answer for a lot of targets + return error.SkipZigTest; + } if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; diff --git a/test/behavior/struct.zig b/test/behavior/struct.zig index ed75268f7d..24365d49b7 100644 --- a/test/behavior/struct.zig +++ b/test/behavior/struct.zig @@ -500,6 +500,10 @@ const Bitfields = packed struct { }; test "packed struct fields are ordered from LSB to MSB" { + if (builtin.zig_backend == .stage1) { + // stage1 gets the wrong answer for a lot of targets + return error.SkipZigTest; + } if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_c) return error.SkipZigTest; // TODO @@ -971,6 +975,8 @@ test "comptime struct field" { comptime b: i32 = 1234, }; + comptime std.debug.assert(@sizeOf(T) == 4); + var foo: T = undefined; comptime try expect(foo.b == 1234); } From 0bebb688fbc4c6ffb88a2b0aefcf536143bb5f2e Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 4 May 2022 19:11:02 -0700 Subject: [PATCH 6/9] stage2: change max int align from 8 to 16 for more ISAs These targets now have a similar disagreement with LLVM about the alignment of 128-bit integers as x86_64: * riscv64 * powerpc64 * powerpc64le * mips64 * mips64el * sparcv9 See #2987 --- lib/std/target.zig | 30 +++++++++++++++--------------- test/behavior/align.zig | 12 ++++++------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/lib/std/target.zig b/lib/std/target.zig index aab9d5accc..2db1415cd3 100644 --- a/lib/std/target.zig +++ b/lib/std/target.zig @@ -1787,18 +1787,12 @@ pub const Target = struct { .hexagon, .mips, .mipsel, - .mips64, - .mips64el, .powerpc, .powerpcle, - .powerpc64, - .powerpc64le, .r600, .amdgcn, .riscv32, - .riscv64, .sparc, - .sparcv9, .sparcel, .s390x, .lanai, @@ -1812,10 +1806,20 @@ pub const Target = struct { }, // For x86_64, LLVMABIAlignmentOfType(i128) reports 8. However I think 16 - // is a better number because of two reasons: + // is a better number for two reasons: // 1. Better machine code when loading into SIMD register. // 2. The C ABI wants 16 for extern structs. + // 3. 16-byte cmpxchg needs 16-byte alignment. + // Same logic for riscv64, powerpc64, mips64, sparcv9. .x86_64, + .riscv64, + .powerpc64, + .powerpc64le, + .mips64, + .mips64el, + .sparcv9, + + // Even LLVMABIAlignmentOfType(i128) agrees on these targets. .aarch64, .aarch64_be, .aarch64_32, @@ -1825,11 +1829,9 @@ pub const Target = struct { .nvptx64, => 16, - // Below this comment are unverified and I have chosen a number - // based on ptrBitWidth. - - .spu_2 => 2, - + // Below this comment are unverified but based on the fact that C requires + // int128_t to be 16 bytes aligned, it's a safe default. 
+ .spu_2, .csky, .arc, .m68k, @@ -1843,8 +1845,6 @@ pub const Target = struct { .renderscript32, .spirv32, .shave, - => 4, - .le64, .amdil64, .hsail64, @@ -1852,7 +1852,7 @@ pub const Target = struct { .renderscript64, .ve, .spirv64, - => 8, + => 16, }; } }; diff --git a/test/behavior/align.zig b/test/behavior/align.zig index 5a3a76beb2..18a55f6653 100644 --- a/test/behavior/align.zig +++ b/test/behavior/align.zig @@ -77,18 +77,12 @@ test "alignment and size of structs with 128-bit fields" { .hexagon, .mips, .mipsel, - .mips64, - .mips64el, .powerpc, .powerpcle, - .powerpc64, - .powerpc64le, .r600, .amdgcn, .riscv32, - .riscv64, .sparc, - .sparcv9, .sparcel, .s390x, .lanai, @@ -134,6 +128,12 @@ test "alignment and size of structs with 128-bit fields" { }, }, + .mips64, + .mips64el, + .powerpc64, + .powerpc64le, + .riscv64, + .sparcv9, .x86_64, .aarch64, .aarch64_be, From 1b432b557608bd047afaad71ffb7bd2ddf23c444 Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 4 May 2022 20:38:53 -0700 Subject: [PATCH 7/9] stage2: implement global assembly So far it's supported by the LLVM backend only. I recommend for the other backends to wait for the resolution of #10761 before adding support for this feature. --- src/Module.zig | 15 +++++++++++++++ src/Sema.zig | 28 +++++++++++++++++++++++----- src/codegen/llvm.zig | 14 ++++++++++++++ src/codegen/llvm/bindings.zig | 3 +++ test/behavior/asm.zig | 1 - 5 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/Module.zig b/src/Module.zig index 55ec1fdd2c..864663ada2 100644 --- a/src/Module.zig +++ b/src/Module.zig @@ -151,6 +151,8 @@ allocated_decls: std.SegmentedList(Decl, 0) = .{}, /// When a Decl object is freed from `allocated_decls`, it is pushed into this stack. decls_free_list: std.ArrayListUnmanaged(Decl.Index) = .{}, +global_assembly: std.AutoHashMapUnmanaged(Decl.Index, []u8) = .{}, + const MonomorphedFuncsSet = std.HashMapUnmanaged( *Fn, void, @@ -2831,6 +2833,7 @@ pub fn deinit(mod: *Module) void { mod.decls_free_list.deinit(gpa); mod.allocated_decls.deinit(gpa); + mod.global_assembly.deinit(gpa); } pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { @@ -2842,6 +2845,9 @@ pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { if (decl.deletion_flag) { assert(mod.deletion_set.swapRemove(decl_index)); } + if (mod.global_assembly.fetchRemove(decl_index)) |kv| { + gpa.free(kv.value); + } if (decl.has_tv) { if (decl.getInnerNamespace()) |namespace| { namespace.destroyDecls(mod); @@ -5714,3 +5720,12 @@ pub fn markDeclAlive(mod: *Module, decl: *Decl) void { fn markDeclIndexAlive(mod: *Module, decl_index: Decl.Index) void { return mod.markDeclAlive(mod.declPtr(decl_index)); } + +pub fn addGlobalAssembly(mod: *Module, decl_index: Decl.Index, source: []const u8) !void { + try mod.global_assembly.ensureUnusedCapacity(mod.gpa, 1); + + const duped_source = try mod.gpa.dupe(u8, source); + errdefer mod.gpa.free(duped_source); + + mod.global_assembly.putAssumeCapacityNoClobber(decl_index, duped_source); +} diff --git a/src/Sema.zig b/src/Sema.zig index 8125fc0cc1..9b4977c616 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -10517,16 +10517,35 @@ fn zirAsm( const is_volatile = @truncate(u1, extended.small >> 15) != 0; const is_global_assembly = sema.func == null; - if (block.is_comptime and !is_global_assembly) { - try sema.requireRuntimeBlock(block, src); - } - if (extra.data.asm_source == 0) { // This can move to become an AstGen error after inline assembly improvements land // and stage1 code matches stage2 code. 
return sema.fail(block, src, "assembly code must use string literal syntax", .{}); } + const asm_source = sema.code.nullTerminatedString(extra.data.asm_source); + + if (is_global_assembly) { + if (outputs_len != 0) { + return sema.fail(block, src, "module-level assembly does not support outputs", .{}); + } + if (inputs_len != 0) { + return sema.fail(block, src, "module-level assembly does not support inputs", .{}); + } + if (clobbers_len != 0) { + return sema.fail(block, src, "module-level assembly does not support clobbers", .{}); + } + if (is_volatile) { + return sema.fail(block, src, "volatile keyword is redundant on module-level assembly", .{}); + } + try sema.mod.addGlobalAssembly(sema.owner_decl_index, asm_source); + return Air.Inst.Ref.void_value; + } + + if (block.is_comptime) { + try sema.requireRuntimeBlock(block, src); + } + if (outputs_len > 1) { return sema.fail(block, src, "TODO implement Sema for asm with more than 1 output", .{}); } @@ -10591,7 +10610,6 @@ fn zirAsm( needed_capacity += name.*.len / 4 + 1; } - const asm_source = sema.code.nullTerminatedString(extra.data.asm_source); needed_capacity += (asm_source.len + 3) / 4; const gpa = sema.gpa; diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 068067d765..65289b1a8c 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -476,6 +476,19 @@ pub const Object = struct { _ = builder.buildRet(is_lt); } + fn genModuleLevelAssembly(object: *Object, comp: *Compilation) !void { + const mod = comp.bin_file.options.module.?; + if (mod.global_assembly.count() == 0) return; + var buffer = std.ArrayList(u8).init(comp.gpa); + defer buffer.deinit(); + var it = mod.global_assembly.iterator(); + while (it.next()) |kv| { + try buffer.appendSlice(kv.value_ptr.*); + try buffer.append('\n'); + } + object.llvm_module.setModuleInlineAsm2(buffer.items.ptr, buffer.items.len - 1); + } + pub fn flushModule(self: *Object, comp: *Compilation, prog_node: *std.Progress.Node) !void { var sub_prog_node = prog_node.start("LLVM Emit Object", 0); sub_prog_node.activate(); @@ -484,6 +497,7 @@ pub const Object = struct { try self.genErrorNameTable(comp); try self.genCmpLtErrorsLenFunction(comp); + try self.genModuleLevelAssembly(comp); if (self.di_builder) |dib| { // When lowering debug info for pointers, we emitted the element types as diff --git a/src/codegen/llvm/bindings.zig b/src/codegen/llvm/bindings.zig index 560b9544dd..4732e0dd49 100644 --- a/src/codegen/llvm/bindings.zig +++ b/src/codegen/llvm/bindings.zig @@ -381,6 +381,9 @@ pub const Module = opaque { pub const createDIBuilder = ZigLLVMCreateDIBuilder; extern fn ZigLLVMCreateDIBuilder(module: *const Module, allow_unresolved: bool) *DIBuilder; + + pub const setModuleInlineAsm2 = LLVMSetModuleInlineAsm2; + extern fn LLVMSetModuleInlineAsm2(M: *const Module, Asm: [*]const u8, Len: usize) void; }; pub const lookupIntrinsicID = LLVMLookupIntrinsicID; diff --git a/test/behavior/asm.zig b/test/behavior/asm.zig index dab6f12127..9aa95a3a0b 100644 --- a/test/behavior/asm.zig +++ b/test/behavior/asm.zig @@ -23,7 +23,6 @@ test "module level assembly" { if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_llvm) return error.SkipZigTest; // TODO if (is_x86_64_linux) { try expect(this_is_my_alias() == 1234); From 17fc44dd1287163c25832c40f7e81cd5532e52bd Mon Sep 17 00:00:00 2001 
From: Andrew Kelley Date: Wed, 4 May 2022 21:10:03 -0700 Subject: [PATCH 8/9] LLVM: fix x86_64 sysv C ABI for extern structs with sub-64 bit integers --- src/codegen/llvm.zig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 65289b1a8c..c0c576cdf1 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -8030,7 +8030,8 @@ fn lowerFnRetTy(dg: *DeclGen, fn_info: Type.Payload.Function.Data) !*const llvm. } } if (classes[0] == .integer and classes[1] == .none) { - return llvm_types_buffer[0]; + const abi_size = fn_info.return_type.abiSize(target); + return dg.context.intType(@intCast(c_uint, abi_size * 8)); } return dg.context.structType(&llvm_types_buffer, llvm_types_index, .False); }, @@ -8124,7 +8125,8 @@ fn lowerFnParamTy(dg: *DeclGen, cc: std.builtin.CallingConvention, ty: Type) !*c } } if (classes[0] == .integer and classes[1] == .none) { - return llvm_types_buffer[0]; + const abi_size = ty.abiSize(target); + return dg.context.intType(@intCast(c_uint, abi_size * 8)); } return dg.context.structType(&llvm_types_buffer, llvm_types_index, .False); }, From 44252f4d352d53afd86d678c0b0a40b3f681c7eb Mon Sep 17 00:00:00 2001 From: Andrew Kelley Date: Wed, 4 May 2022 22:57:57 -0700 Subject: [PATCH 9/9] LLVM: fix C ABI for windows * sret logic needed a check for hasRuntimeBits() * lower f128 on windows targets with the "sse" class rather than "memory". For reference, clang emits a compile error when __float128 is used with the MSVC ABI, saying that this type is not supported. The docs for the x64 calling convention have both of these sentences: - "Any argument that doesn't fit in 8 bytes, or isn't 1, 2, 4, or 8 bytes, must be passed by reference." - "All floating point operations are done using the 16 XMM registers." * For i128, however, it is clear that the Windows calling convention wants such an object to be passed by reference. I fixed the LLVM lowering for function parameters to make this work. --- src/arch/x86_64/abi.zig | 17 +++++++++-------- src/codegen/llvm.zig | 27 ++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/arch/x86_64/abi.zig b/src/arch/x86_64/abi.zig index f388c5c93a..da2e3da394 100644 --- a/src/arch/x86_64/abi.zig +++ b/src/arch/x86_64/abi.zig @@ -12,13 +12,10 @@ pub fn classifyWindows(ty: Type, target: Target) Class { // and the registers used for those arguments. Any argument that doesn't fit in 8 // bytes, or isn't 1, 2, 4, or 8 bytes, must be passed by reference. A single argument // is never spread across multiple registers." + // "All floating point operations are done using the 16 XMM registers." // "Structs and unions of size 8, 16, 32, or 64 bits, and __m64 types, are passed // as if they were integers of the same size." - switch (ty.abiSize(target)) { - 1, 2, 4, 8 => {}, - else => return .memory, - } - return switch (ty.zigTypeTag()) { + switch (ty.zigTypeTag()) { .Pointer, .Int, .Bool, @@ -33,9 +30,13 @@ pub fn classifyWindows(ty: Type, target: Target) Class { .ErrorUnion, .AnyFrame, .Frame, - => .integer, + => switch (ty.abiSize(target)) { + 0 => unreachable, + 1, 2, 4, 8 => return .integer, + else => return .memory, + }, - .Float, .Vector => .sse, + .Float, .Vector => return .sse, .Type, .ComptimeFloat, @@ -47,7 +48,7 @@ pub fn classifyWindows(ty: Type, target: Target) Class { .Opaque, .EnumLiteral, => unreachable, - }; + } } /// There are a maximum of 8 possible return slots. 
Returned values are in diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index c0c576cdf1..426ef7c378 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -644,7 +644,17 @@ pub const Object = struct { if (!param_ty.hasRuntimeBitsIgnoreComptime()) continue; const llvm_arg_i = @intCast(c_uint, args.items.len) + param_offset; - try args.append(llvm_func.getParam(llvm_arg_i)); + const param = llvm_func.getParam(llvm_arg_i); + // It is possible for the calling convention to make the argument's by-reference nature + // disagree with our canonical value for it, in which case we must dereference here. + const need_deref = !param_ty.isPtrAtRuntime() and !isByRef(param_ty) and + (param.typeOf().getTypeKind() == .Pointer); + const loaded_param = if (!need_deref) param else l: { + const load_inst = builder.buildLoad(param, ""); + load_inst.setAlignment(param_ty.abiAlignment(target)); + break :l load_inst; + }; + try args.append(loaded_param); } var di_file: ?*llvm.DIFile = null; @@ -3743,6 +3753,19 @@ pub const FuncGen = struct { arg_ptr.setAlignment(alignment); const store_inst = self.builder.buildStore(llvm_arg, arg_ptr); store_inst.setAlignment(alignment); + + if (abi_llvm_ty.getTypeKind() == .Pointer) { + // In this case, the calling convention wants a pointer, but + // we have a value. + if (arg_ptr.typeOf() == abi_llvm_ty) { + try llvm_args.append(arg_ptr); + continue; + } + const casted_ptr = self.builder.buildBitCast(arg_ptr, abi_llvm_ty, ""); + try llvm_args.append(casted_ptr); + continue; + } + break :p self.builder.buildBitCast(arg_ptr, ptr_abi_ty, ""); }; @@ -7931,6 +7954,8 @@ fn llvmFieldIndex( } fn firstParamSRet(fn_info: Type.Payload.Function.Data, target: std.Target) bool { + if (!fn_info.return_type.hasRuntimeBitsIgnoreComptime()) return false; + switch (fn_info.cc) { .Unspecified, .Inline => return isByRef(fn_info.return_type), .C => switch (target.cpu.arch) {
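As a sanity check on the Windows classification rules quoted above,
here is a simplified, hypothetical mirror of classifyWindows
(illustrative only; the real function handles more cases, and the
helper name is invented):

    const std = @import("std");

    const Class = enum { integer, sse, memory };

    fn classifyWindowsSimple(comptime T: type) Class {
        return switch (@typeInfo(T)) {
            // Floating point operations use the 16 XMM registers.
            .Float, .Vector => .sse,
            // Anything else travels in a register only if it is
            // exactly 1, 2, 4, or 8 bytes; otherwise by reference.
            else => switch (@sizeOf(T)) {
                1, 2, 4, 8 => .integer,
                else => .memory,
            },
        };
    }

    comptime {
        std.debug.assert(classifyWindowsSimple(u64) == .integer);
        std.debug.assert(classifyWindowsSimple(u128) == .memory); // passed by reference
        std.debug.assert(classifyWindowsSimple(f128) == .sse); // per this patch
    }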