diff --git a/lib/compiler_rt/atomics.zig b/lib/compiler_rt/atomics.zig
index 6935a858aa..2a16fa51d5 100644
--- a/lib/compiler_rt/atomics.zig
+++ b/lib/compiler_rt/atomics.zig
@@ -35,6 +35,17 @@ const largest_atomic_size = switch (arch) {
     else => @sizeOf(usize),
 };
 
+// The size (in bytes) of the smallest object on which the architecture can
+// perform an atomic fetch/exchange. Note that this does not apply to load and
+// store. Objects smaller than this threshold are implemented in terms of a
+// compare-and-exchange on a larger value.
+const smallest_atomic_fetch_exch_size = switch (arch) {
+    // On AMDGPU, there are no instructions for atomic operations other than load and
+    // store (as of LLVM 15), so these need to be implemented in terms of an atomic CAS.
+    .amdgcn => @sizeOf(u32),
+    else => @sizeOf(u8),
+};
+
 const cache_line_size = 64;
 
 const SpinlockTable = struct {
@@ -214,6 +225,31 @@ inline fn atomic_exchange_N(comptime T: type, ptr: *T, val: T, model: i32) T {
         const value = ptr.*;
         ptr.* = val;
         return value;
+    } else if (@sizeOf(T) < smallest_atomic_fetch_exch_size) {
+        // Machine does not support this type, but it does support a larger type.
+        const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
+
+        const addr = @ptrToInt(ptr);
+        const wide_addr = addr & ~(@as(usize, smallest_atomic_fetch_exch_size) - 1);
+        const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
+
+        const inner_offset = addr & (@as(usize, smallest_atomic_fetch_exch_size) - 1);
+        const inner_shift = @intCast(std.math.Log2Int(WideAtomic), inner_offset * 8);
+
+        // Shift the new value into position; the shift amount is only known at runtime.
+        const shifted_value = @as(WideAtomic, val) << inner_shift;
+        // Mask selecting the bits we care about within the wide value.
+        const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
+        while (true) {
+            const wide_old = @atomicLoad(WideAtomic, wide_ptr, .Acquire);
+            // Splice the new bits into the old wide value.
+            const wide_new = wide_old & ~mask | shifted_value;
+            // Retry the compare-exchange until it succeeds.
+            if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
+                // Mask and shift the old wide value back down to recover the old value.
+                return @truncate(T, (wide_old & mask) >> inner_shift);
+            }
+        }
     } else {
         return @atomicRmw(T, ptr, .Xchg, val, .SeqCst);
     }
@@ -298,6 +334,38 @@ inline fn fetch_op_N(comptime T: type, comptime op: std.builtin.AtomicRmwOp, ptr
         };
 
         return value;
+    } else if (@sizeOf(T) < smallest_atomic_fetch_exch_size) {
+        // Machine does not support this type, but it does support a larger type.
+        const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
+
+        const addr = @ptrToInt(ptr);
+        const wide_addr = addr & ~(@as(usize, smallest_atomic_fetch_exch_size) - 1);
+        const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
+
+        const inner_offset = addr & (@as(usize, smallest_atomic_fetch_exch_size) - 1);
+        const inner_shift = @intCast(std.math.Log2Int(WideAtomic), inner_offset * 8);
+
+        const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
+
+        while (true) {
+            // Load the wide value, apply the operation to the narrow bits, then CAS it back.
+            const wide_old = @atomicLoad(WideAtomic, wide_ptr, .Acquire);
+            const old = @truncate(T, (wide_old & mask) >> inner_shift);
+            const new = switch (op) {
+                .Add => old +% val,
+                .Sub => old -% val,
+                .And => old & val,
+                .Nand => ~(old & val),
+                .Or => old | val,
+                .Xor => old ^ val,
+                else => @compileError("unsupported atomic op"),
+            };
+            const wide_new = wide_old & ~mask | (@as(WideAtomic, new) << inner_shift);
+            // Retry the compare-exchange until it succeeds.
+            if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
+                return old;
+            }
+        }
     }
 
     return @atomicRmw(T, ptr, op, val, .SeqCst);
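
Side note, for illustration only (not part of the patch): the snippet below is a minimal
standalone sketch of the same emulation strategy, specialized to exchanging one byte
through a 32-bit CAS. The function name exchangeU8ViaU32Cas is made up, the code uses the
same pre-0.11 builtins as the patch (@ptrToInt, @intToPtr, two-argument @intCast and
@truncate), and it assumes a little-endian target, where byte i of a 32-bit word occupies
bits i*8..i*8+7.

    const std = @import("std");
    const builtin = @import("builtin");

    // Hypothetical helper mirroring the emulation above: exchange a single byte by
    // CAS-ing the aligned 32-bit word that contains it.
    fn exchangeU8ViaU32Cas(ptr: *u8, val: u8) u8 {
        const addr = @ptrToInt(ptr);
        // Round the address down to the containing 4-byte word.
        const wide_addr = addr & ~(@as(usize, @sizeOf(u32)) - 1);
        const wide_ptr = @intToPtr(*u32, wide_addr);

        // Bit position of our byte within the word (little-endian assumption).
        const inner_offset = addr & (@as(usize, @sizeOf(u32)) - 1);
        const inner_shift = @intCast(std.math.Log2Int(u32), inner_offset * 8);

        const mask = @as(u32, std.math.maxInt(u8)) << inner_shift;
        const shifted_value = @as(u32, val) << inner_shift;

        while (true) {
            const wide_old = @atomicLoad(u32, wide_ptr, .SeqCst);
            const wide_new = (wide_old & ~mask) | shifted_value;
            if (@cmpxchgWeak(u32, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
                return @truncate(u8, (wide_old & mask) >> inner_shift);
            }
        }
    }

    test "exchange one byte through a 32-bit CAS" {
        // The bit-offset arithmetic above assumes little-endian byte order.
        if (builtin.cpu.arch.endian() != .Little) return error.SkipZigTest;

        var buf: [4]u8 align(4) = .{ 1, 2, 3, 4 };
        const old = exchangeU8ViaU32Cas(&buf[2], 42);
        try std.testing.expectEqual(@as(u8, 3), old);
        try std.testing.expectEqualSlices(u8, &[_]u8{ 1, 2, 42, 4 }, &buf);
    }

The test exercises the key property of the approach: only the targeted byte changes and its
previous value is returned, while the neighbouring bytes of the word are preserved because
the compare-exchange retries whenever they were modified concurrently.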