From 5dd3d17c201f83b853fa1f1bf5e459fb1582c3cf Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 6 Aug 2022 18:22:30 +0200
Subject: [PATCH 01/17] amdgpu: add amdhsa/amdpal ctype abi sizes
---
src/type.zig | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/src/type.zig b/src/type.zig
index ea2b6c30c2..5ac9726727 100644
--- a/src/type.zig
+++ b/src/type.zig
@@ -6728,6 +6728,13 @@ pub const CType = enum {
},
},
+ .amdhsa, .amdpal => switch (self) {
+ .short, .ushort => return 16,
+ .int, .uint => return 32,
+ .long, .ulong, .longlong, .ulonglong => return 64,
+ .longdouble => return 128,
+ },
+
.cloudabi,
.kfreebsd,
.lv2,
@@ -6737,13 +6744,11 @@ pub const CType = enum {
.aix,
.cuda,
.nvcl,
- .amdhsa,
.ps4,
.ps5,
.elfiamcu,
.mesa3d,
.contiki,
- .amdpal,
.hermit,
.hurd,
.opencl,
From 6b69dcfdd28145791da43979474fec29a13e24d0 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 6 Aug 2022 18:22:56 +0200
Subject: [PATCH 02/17] amdgpu: add AmdgpuKernel calling convention
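A minimal usage sketch (illustrative only; the function name below is hypothetical and not part of this diff):

    // Hypothetical kernel entry point. The new callconv maps to LLVM's
    // AMDGPU_KERNEL calling convention; Sema rejects it with a compile
    // error on any architecture other than amdgcn.
    export fn kernelMain() callconv(.AmdgpuKernel) void {}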
---
lib/std/builtin.zig | 1 +
src/Sema.zig | 4 ++++
src/codegen/llvm.zig | 6 +++++-
src/stage1/all_types.hpp | 3 ++-
src/stage1/analyze.cpp | 12 +++++++++---
src/stage1/codegen.cpp | 6 +++++-
src/stage1/ir.cpp | 1 +
7 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/lib/std/builtin.zig b/lib/std/builtin.zig
index a2d3bcd870..87e8e90df8 100644
--- a/lib/std/builtin.zig
+++ b/lib/std/builtin.zig
@@ -157,6 +157,7 @@ pub const CallingConvention = enum {
SysV,
Win64,
PtxKernel,
+ AmdgpuKernel,
};
/// This data structure is used by the Zig language code generation and
diff --git a/src/Sema.zig b/src/Sema.zig
index e8a64c6e4f..e28d80e9f1 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -8141,6 +8141,10 @@ fn funcCommon(
.nvptx, .nvptx64 => null,
else => @as([]const u8, "nvptx and nvptx64"),
},
+ .AmdgpuKernel => switch (arch) {
+ .amdgcn => null,
+ else => @as([]const u8, "amdgcn"),
+ },
}) |allowed_platform| {
return sema.fail(block, cc_src, "callconv '{s}' is only available on {s}, not {s}", .{
@tagName(cc_workaround),
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 7038606611..7f411c8373 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -4425,7 +4425,7 @@ pub const FuncGen = struct {
.cmp_lt => try self.airCmp(inst, .lt, false),
.cmp_lte => try self.airCmp(inst, .lte, false),
.cmp_neq => try self.airCmp(inst, .neq, false),
-
+
.cmp_eq_optimized => try self.airCmp(inst, .eq, true),
.cmp_gt_optimized => try self.airCmp(inst, .gt, true),
.cmp_gte_optimized => try self.airCmp(inst, .gte, true),
@@ -9807,6 +9807,10 @@ fn toLlvmCallConv(cc: std.builtin.CallingConvention, target: std.Target) llvm.Ca
.nvptx, .nvptx64 => .PTX_Kernel,
else => unreachable,
},
+ .AmdgpuKernel => return switch (target.cpu.arch) {
+ .amdgcn => .AMDGPU_KERNEL,
+ else => unreachable,
+ },
};
}
diff --git a/src/stage1/all_types.hpp b/src/stage1/all_types.hpp
index d4a2abece9..f427548fdb 100644
--- a/src/stage1/all_types.hpp
+++ b/src/stage1/all_types.hpp
@@ -85,7 +85,8 @@ enum CallingConvention {
CallingConventionAAPCSVFP,
CallingConventionSysV,
CallingConventionWin64,
- CallingConventionPtxKernel
+ CallingConventionPtxKernel,
+ CallingConventionAmdgpuKernel
};
// Stage 1 supports only the generic address space
diff --git a/src/stage1/analyze.cpp b/src/stage1/analyze.cpp
index f0cad841be..15b8789997 100644
--- a/src/stage1/analyze.cpp
+++ b/src/stage1/analyze.cpp
@@ -993,6 +993,7 @@ const char *calling_convention_name(CallingConvention cc) {
case CallingConventionSysV: return "SysV";
case CallingConventionWin64: return "Win64";
case CallingConventionPtxKernel: return "PtxKernel";
+ case CallingConventionAmdgpuKernel: return "AmdgpuKernel";
}
zig_unreachable();
}
@@ -1017,6 +1018,7 @@ bool calling_convention_allows_zig_types(CallingConvention cc) {
case CallingConventionAAPCSVFP:
case CallingConventionSysV:
case CallingConventionWin64:
+ case CallingConventionAmdgpuKernel:
return false;
}
zig_unreachable();
@@ -2019,6 +2021,9 @@ Error emit_error_unless_callconv_allowed_for_target(CodeGen *g, AstNode *source_
allowed_platforms = "nvptx and nvptx64";
}
break;
+ case CallingConventionAmdgpuKernel:
+ if (g->zig_target->arch != ZigLLVM_amdgcn)
+ allowed_platforms = "amdgcn";
}
if (allowed_platforms != nullptr) {
@@ -3857,6 +3862,7 @@ static void resolve_decl_fn(CodeGen *g, TldFn *tld_fn) {
case CallingConventionSysV:
case CallingConventionWin64:
case CallingConventionPtxKernel:
+ case CallingConventionAmdgpuKernel:
add_fn_export(g, fn_table_entry, buf_ptr(&fn_table_entry->symbol_name),
GlobalLinkageIdStrong, fn_cc);
break;
@@ -6012,7 +6018,7 @@ Error type_has_bits2(CodeGen *g, ZigType *type_entry, bool *result) {
bool fn_returns_c_abi_small_struct(FnTypeId *fn_type_id) {
ZigType *type = fn_type_id->return_type;
- return !calling_convention_allows_zig_types(fn_type_id->cc) &&
+ return !calling_convention_allows_zig_types(fn_type_id->cc) &&
type->id == ZigTypeIdStruct && type->abi_size <= 16;
}
@@ -8698,7 +8704,7 @@ static LLVMTypeRef llvm_int_for_size(size_t size) {
static LLVMTypeRef llvm_sse_for_size(size_t size) {
if (size > 4)
return LLVMDoubleType();
- else
+ else
return LLVMFloatType();
}
@@ -8756,7 +8762,7 @@ static Error resolve_llvm_c_abi_type(CodeGen *g, ZigType *ty) {
LLVMTypeRef return_elem_types[] = {
LLVMVoidType(),
- LLVMVoidType(),
+ LLVMVoidType(),
};
for (uint32_t i = 0; i <= eightbyte_index; i += 1) {
if (type_classes[i] == X64CABIClass_INTEGER) {
diff --git a/src/stage1/codegen.cpp b/src/stage1/codegen.cpp
index bd572bb96c..6e1593ccca 100644
--- a/src/stage1/codegen.cpp
+++ b/src/stage1/codegen.cpp
@@ -216,6 +216,9 @@ static ZigLLVM_CallingConv get_llvm_cc(CodeGen *g, CallingConvention cc) {
assert(g->zig_target->arch == ZigLLVM_nvptx ||
g->zig_target->arch == ZigLLVM_nvptx64);
return ZigLLVM_PTX_Kernel;
+ case CallingConventionAmdgpuKernel:
+ assert(g->zig_target->arch == ZigLLVM_amdgcn);
+ return ZigLLVM_AMDGPU_KERNEL;
}
zig_unreachable();
@@ -364,6 +367,7 @@ static bool cc_want_sret_attr(CallingConvention cc) {
case CallingConventionSysV:
case CallingConventionWin64:
case CallingConventionPtxKernel:
+ case CallingConventionAmdgpuKernel:
return true;
case CallingConventionAsync:
case CallingConventionUnspecified:
@@ -3515,7 +3519,7 @@ static LLVMValueRef gen_soft_float_to_int_op(CodeGen *g, LLVMValueRef value_ref,
// Handle integers of non-pot bitsize by shortening them on the output
if (result_type != wider_type) {
- result = gen_widen_or_shorten(g, false, wider_type, result_type, result);
+ result = gen_widen_or_shorten(g, false, wider_type, result_type, result);
}
return result;
diff --git a/src/stage1/ir.cpp b/src/stage1/ir.cpp
index a5428945a9..c5f15c5cc9 100644
--- a/src/stage1/ir.cpp
+++ b/src/stage1/ir.cpp
@@ -11753,6 +11753,7 @@ static Stage1AirInst *ir_analyze_instruction_export(IrAnalyze *ira, Stage1ZirIns
case CallingConventionSysV:
case CallingConventionWin64:
case CallingConventionPtxKernel:
+ case CallingConventionAmdgpuKernel:
add_fn_export(ira->codegen, fn_entry, buf_ptr(symbol_name), global_linkage_id, cc);
fn_entry->section_name = section_name;
break;
From d638b2e29f8799889b53f559bdde40fe03f2b665 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 6 Aug 2022 19:04:35 +0200
Subject: [PATCH 03/17] stage2: fix grammar in error message
---
src/Sema.zig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Sema.zig b/src/Sema.zig
index e28d80e9f1..505764da8a 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -5897,7 +5897,7 @@ fn analyzeCall(
},
else => {},
}
- return sema.fail(block, func_src, "type '{}' not a function", .{callee_ty.fmt(sema.mod)});
+ return sema.fail(block, func_src, "type '{}' is not a function", .{callee_ty.fmt(sema.mod)});
};
const func_ty_info = func_ty.fnInfo();
From 5859d8458f919676be804835aaeb3ed844a67542 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Wed, 10 Aug 2022 13:33:02 +0200
Subject: [PATCH 04/17] big int: make Mutable.normalize public
---
lib/std/math/big/int.zig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig
index e7c8ed72a2..889f0928e0 100644
--- a/lib/std/math/big/int.zig
+++ b/lib/std/math/big/int.zig
@@ -1845,7 +1845,7 @@ pub const Mutable = struct {
/// [1, 2, 3, 4, 0] -> [1, 2, 3, 4]
/// [1, 2, 0, 0, 0] -> [1, 2]
/// [0, 0, 0, 0, 0] -> [0]
- fn normalize(r: *Mutable, length: usize) void {
+ pub fn normalize(r: *Mutable, length: usize) void {
r.len = llnormalize(r.limbs[0..length]);
}
};
From 3392de87dcf516ed2e3fd1904372195bdc75c0eb Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 20 Aug 2022 12:42:27 +0200
Subject: [PATCH 05/17] allow global/local/shared address spaces on amdgcn
---
src/Sema.zig | 6 ++++--
src/codegen/llvm.zig | 8 ++++++++
2 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/src/Sema.zig b/src/Sema.zig
index 505764da8a..b357beafdf 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -30303,13 +30303,15 @@ pub fn analyzeAddrspace(
const address_space = addrspace_tv.val.toEnum(std.builtin.AddressSpace);
const target = sema.mod.getTarget();
const arch = target.cpu.arch;
- const is_gpu = arch == .nvptx or arch == .nvptx64;
+ const is_nv = arch == .nvptx or arch == .nvptx64;
+ const is_gpu = is_nv or arch == .amdgcn;
const supported = switch (address_space) {
.generic => true,
.gs, .fs, .ss => (arch == .i386 or arch == .x86_64) and ctx == .pointer,
// TODO: check that .shared and .local are left uninitialized
- .global, .param, .shared, .local => is_gpu,
+ .param => is_nv,
+ .global, .shared, .local => is_gpu,
.constant => is_gpu and (ctx == .constant),
};
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 7f411c8373..d0433be2b9 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -2659,6 +2659,14 @@ pub const DeclGen = struct {
.local => llvm.address_space.nvptx.local,
else => unreachable,
},
+ .amdgcn => switch (address_space) {
+ .generic => llvm.address_space.amdgpu.flat,
+ .global => llvm.address_space.amdgpu.global,
+ .constant => llvm.address_space.amdgpu.constant,
+ .shared => llvm.address_space.amdgpu.local,
+ .local => llvm.address_space.amdgpu.private,
+ else => unreachable,
+ },
else => switch (address_space) {
.generic => llvm.address_space.default,
else => unreachable,
From 5f3b91437f5cd23bcae66227932555b7abe32669 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sun, 21 Aug 2022 01:32:19 +0200
Subject: [PATCH 06/17] stage2: improve addrspace handling
This commit changes the way Zig deals with variable declarations for
exotic targets. Where previously the idea was to enforce that
local/global variables be placed into their respective address spaces,
depending on the target, they are now fixed to the generic address
space.
To facilitate this for targets where local variables _must_ be
generated into a specific address space (e.g. amdgcn, where locals
must be generated into the private address space), the variable
allocations (allocas) are generated into the required address space
and then addrspace-cast back to the generic address space. While this
could be less efficient in theory, LLVM will hopefully figure out the
actual correct address space for a pointer for us. HIP appears to do
the same thing.
Global variables are handled in a similar way.
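A rough source-level sketch of the resulting codegen behaviour (the
kernel below is hypothetical and only for illustration):

    // For a local such as `tmp`, the LLVM backend now emits the alloca in the
    // amdgcn private address space and immediately addrspacecasts the result
    // back to the generic (flat) address space, so the rest of codegen keeps
    // working with ordinary generic pointers.
    export fn scale(out: *addrspace(.global) f32, factor: f32) callconv(.AmdgpuKernel) void {
        var tmp: f32 = out.*;
        tmp *= factor;
        out.* = tmp;
    }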
---
src/codegen/llvm.zig | 307 ++++++++++++++++++++--------------
src/codegen/llvm/bindings.zig | 9 +
src/zig_llvm.cpp | 17 +-
src/zig_llvm.h | 2 +
4 files changed, 203 insertions(+), 132 deletions(-)
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index d0433be2b9..f400e841ea 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -929,8 +929,7 @@ pub const Object = struct {
if (isByRef(param_ty)) {
const alignment = param_ty.abiAlignment(target);
const param_llvm_ty = param.typeOf();
- const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty);
- arg_ptr.setAlignment(alignment);
+ const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty, alignment, target);
const store_inst = builder.buildStore(param, arg_ptr);
store_inst.setAlignment(alignment);
args.appendAssumeCapacity(arg_ptr);
@@ -974,8 +973,7 @@ pub const Object = struct {
param_ty.abiAlignment(target),
dg.object.target_data.abiAlignmentOfType(int_llvm_ty),
);
- const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty);
- arg_ptr.setAlignment(alignment);
+ const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty, alignment, target);
const casted_ptr = builder.buildBitCast(arg_ptr, int_ptr_llvm_ty, "");
const store_inst = builder.buildStore(param, casted_ptr);
store_inst.setAlignment(alignment);
@@ -1026,8 +1024,7 @@ pub const Object = struct {
const param_ty = fn_info.param_types[it.zig_index - 1];
const param_llvm_ty = try dg.lowerType(param_ty);
const param_alignment = param_ty.abiAlignment(target);
- const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty);
- arg_ptr.setAlignment(param_alignment);
+ const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty, param_alignment, target);
var field_types_buf: [8]*llvm.Type = undefined;
const field_types = field_types_buf[0..llvm_ints.len];
for (llvm_ints) |int_bits, i| {
@@ -1058,8 +1055,7 @@ pub const Object = struct {
const param_ty = fn_info.param_types[it.zig_index - 1];
const param_llvm_ty = try dg.lowerType(param_ty);
const param_alignment = param_ty.abiAlignment(target);
- const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty);
- arg_ptr.setAlignment(param_alignment);
+ const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty, param_alignment, target);
var field_types_buf: [8]*llvm.Type = undefined;
const field_types = field_types_buf[0..llvm_floats.len];
for (llvm_floats) |float_bits, i| {
@@ -1103,8 +1099,7 @@ pub const Object = struct {
llvm_arg_i += 1;
const alignment = param_ty.abiAlignment(target);
- const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty);
- arg_ptr.setAlignment(alignment);
+ const arg_ptr = buildAllocaInner(builder, llvm_func, false, param_llvm_ty, alignment, target);
const casted_ptr = builder.buildBitCast(arg_ptr, param.typeOf().pointerType(0), "");
_ = builder.buildStore(param, casted_ptr);
@@ -2404,21 +2399,27 @@ pub const DeclGen = struct {
// mismatch, because we don't have the LLVM type until the *value* is created,
// whereas the global needs to be created based on the type alone, because
// lowering the value may reference the global as a pointer.
+ const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_addrspace, target);
const new_global = dg.object.llvm_module.addGlobalInAddressSpace(
llvm_init.typeOf(),
"",
- dg.llvmAddressSpace(decl.@"addrspace"),
+ llvm_global_addrspace,
);
new_global.setLinkage(global.getLinkage());
new_global.setUnnamedAddr(global.getUnnamedAddress());
new_global.setAlignment(global.getAlignment());
if (decl.@"linksection") |section| new_global.setSection(section);
new_global.setInitializer(llvm_init);
- // replaceAllUsesWith requires the type to be unchanged. So we bitcast
+ // replaceAllUsesWith requires the type to be unchanged. So we convert
// the new global to the old type and use that as the thing to replace
// old uses.
- const new_global_ptr = new_global.constBitCast(global.typeOf());
- global.replaceAllUsesWith(new_global_ptr);
+ const new_global_ptr = if (llvm_addrspace != llvm_global_addrspace)
+ new_global.constAddrSpaceCast(llvm_init.typeOf().pointerType(llvm_addrspace))
+ else
+ new_global;
+ const new_global_casted_ptr = new_global_ptr.constBitCast(global.typeOf());
+ global.replaceAllUsesWith(new_global_casted_ptr);
dg.object.decl_map.putAssumeCapacity(decl_index, new_global);
new_global.takeName(global);
global.deleteGlobal();
@@ -2465,7 +2466,7 @@ pub const DeclGen = struct {
const fqn = try decl.getFullyQualifiedName(dg.module);
defer dg.gpa.free(fqn);
- const llvm_addrspace = dg.llvmAddressSpace(decl.@"addrspace");
+ const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
const llvm_fn = dg.llvmModule().addFunctionInAddressSpace(fqn, fn_type, llvm_addrspace);
gop.value_ptr.* = llvm_fn;
@@ -2613,9 +2614,15 @@ pub const DeclGen = struct {
const fqn = try decl.getFullyQualifiedName(dg.module);
defer dg.gpa.free(fqn);
+ const target = dg.module.getTarget();
+
const llvm_type = try dg.lowerType(decl.ty);
- const llvm_addrspace = dg.llvmAddressSpace(decl.@"addrspace");
- const llvm_global = dg.object.llvm_module.addGlobalInAddressSpace(llvm_type, fqn, llvm_addrspace);
+ const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_global = dg.object.llvm_module.addGlobalInAddressSpace(
+ llvm_type,
+ fqn,
+ toLlvmGlobalAddressSpace(llvm_addrspace, target),
+ );
gop.value_ptr.* = llvm_global;
// This is needed for declarations created by `@extern`.
@@ -2640,40 +2647,6 @@ pub const DeclGen = struct {
return llvm_global;
}
- fn llvmAddressSpace(self: DeclGen, address_space: std.builtin.AddressSpace) c_uint {
- const target = self.module.getTarget();
- return switch (target.cpu.arch) {
- .i386, .x86_64 => switch (address_space) {
- .generic => llvm.address_space.default,
- .gs => llvm.address_space.x86.gs,
- .fs => llvm.address_space.x86.fs,
- .ss => llvm.address_space.x86.ss,
- else => unreachable,
- },
- .nvptx, .nvptx64 => switch (address_space) {
- .generic => llvm.address_space.default,
- .global => llvm.address_space.nvptx.global,
- .constant => llvm.address_space.nvptx.constant,
- .param => llvm.address_space.nvptx.param,
- .shared => llvm.address_space.nvptx.shared,
- .local => llvm.address_space.nvptx.local,
- else => unreachable,
- },
- .amdgcn => switch (address_space) {
- .generic => llvm.address_space.amdgpu.flat,
- .global => llvm.address_space.amdgpu.global,
- .constant => llvm.address_space.amdgpu.constant,
- .shared => llvm.address_space.amdgpu.local,
- .local => llvm.address_space.amdgpu.private,
- else => unreachable,
- },
- else => switch (address_space) {
- .generic => llvm.address_space.default,
- else => unreachable,
- },
- };
- }
-
fn isUnnamedType(dg: *DeclGen, ty: Type, val: *llvm.Value) bool {
// Once `lowerType` succeeds, successive calls to it with the same Zig type
// are guaranteed to succeed. So if a call to `lowerType` fails here it means
@@ -2739,7 +2712,7 @@ pub const DeclGen = struct {
return dg.context.structType(&fields, fields.len, .False);
}
const ptr_info = t.ptrInfo().data;
- const llvm_addrspace = dg.llvmAddressSpace(ptr_info.@"addrspace");
+ const llvm_addrspace = toLlvmAddressSpace(ptr_info.@"addrspace", target);
if (ptr_info.host_size != 0) {
return dg.context.intType(ptr_info.host_size * 8).pointerType(llvm_addrspace);
}
@@ -3268,11 +3241,18 @@ pub const DeclGen = struct {
const decl_index = tv.val.castTag(.variable).?.data.owner_decl;
const decl = dg.module.declPtr(decl_index);
dg.module.markDeclAlive(decl);
- const val = try dg.resolveGlobalDecl(decl_index);
const llvm_var_type = try dg.lowerType(tv.ty);
- const llvm_addrspace = dg.llvmAddressSpace(decl.@"addrspace");
- const llvm_type = llvm_var_type.pointerType(llvm_addrspace);
- return val.constBitCast(llvm_type);
+ const llvm_var_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_var_addrspace, target);
+ const llvm_var_ptr_type = llvm_var_type.pointerType(llvm_global_addrspace);
+
+ const val = try dg.resolveGlobalDecl(decl_index);
+ const val_ptr = val.constBitCast(llvm_var_ptr_type);
+ if (llvm_global_addrspace != llvm_var_addrspace) {
+ const llvm_ptr_type = llvm_var_type.pointerType(llvm_var_addrspace);
+ return val_ptr.constAddrSpaceCast(llvm_ptr_type);
+ }
+ return val_ptr;
},
.slice => {
const slice = tv.val.castTag(.slice).?.data;
@@ -4069,11 +4049,20 @@ pub const DeclGen = struct {
self.module.markDeclAlive(decl);
- const llvm_val = if (is_fn_body)
+ const llvm_decl_val = if (is_fn_body)
try self.resolveLlvmFunction(decl_index)
else
try self.resolveGlobalDecl(decl_index);
+ const target = self.module.getTarget();
+ const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_addrspace, target);
+ const llvm_val = if (llvm_addrspace != llvm_global_addrspace) blk: {
+ const llvm_decl_ty = try self.lowerType(decl.ty);
+ const llvm_decl_ptr_ty = llvm_decl_ty.pointerType(llvm_addrspace);
+ break :blk llvm_decl_val.constAddrSpaceCast(llvm_decl_ptr_ty);
+ } else llvm_decl_val;
+
const llvm_type = try self.lowerType(tv.ty);
if (tv.ty.zigTypeTag() == .Int) {
return llvm_val.constPtrToInt(llvm_type);
@@ -4339,7 +4328,9 @@ pub const FuncGen = struct {
// We have an LLVM value but we need to create a global constant and
// set the value as its initializer, and then return a pointer to the global.
const target = self.dg.module.getTarget();
- const global = self.dg.object.llvm_module.addGlobal(llvm_val.typeOf(), "");
+ const llvm_addrspace = toLlvmAddressSpace(.generic, target);
+ const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_addrspace, target);
+ const global = self.dg.object.llvm_module.addGlobalInAddressSpace(llvm_val.typeOf(), "", llvm_global_addrspace);
global.setInitializer(llvm_val);
global.setLinkage(.Private);
global.setGlobalConstant(.True);
@@ -4349,8 +4340,10 @@ pub const FuncGen = struct {
// the type of global constants might not match the type it is supposed to
// be, and so we must bitcast the pointer at the usage sites.
const wanted_llvm_ty = try self.dg.lowerType(ty);
- const wanted_llvm_ptr_ty = wanted_llvm_ty.pointerType(0);
- const casted_ptr = global.constBitCast(wanted_llvm_ptr_ty);
+ const wanted_bitcasted_llvm_ptr_ty = wanted_llvm_ty.pointerType(llvm_global_addrspace);
+ const bitcasted_ptr = global.constBitCast(wanted_bitcasted_llvm_ptr_ty);
+ const wanted_llvm_ptr_ty = wanted_llvm_ty.pointerType(llvm_addrspace);
+ const casted_ptr = bitcasted_ptr.constAddrSpaceCast(wanted_llvm_ptr_ty);
gop.value_ptr.* = casted_ptr;
return casted_ptr;
}
@@ -4606,8 +4599,7 @@ pub const FuncGen = struct {
const ret_ptr = if (!sret) null else blk: {
const llvm_ret_ty = try self.dg.lowerType(return_type);
- const ret_ptr = self.buildAlloca(llvm_ret_ty);
- ret_ptr.setAlignment(return_type.abiAlignment(target));
+ const ret_ptr = self.buildAlloca(llvm_ret_ty, return_type.abiAlignment(target));
try llvm_args.append(ret_ptr);
break :blk ret_ptr;
};
@@ -4654,8 +4646,7 @@ pub const FuncGen = struct {
} else {
const alignment = param_ty.abiAlignment(target);
const param_llvm_ty = llvm_arg.typeOf();
- const arg_ptr = self.buildAlloca(param_llvm_ty);
- arg_ptr.setAlignment(alignment);
+ const arg_ptr = self.buildAlloca(param_llvm_ty, alignment);
const store_inst = self.builder.buildStore(llvm_arg, arg_ptr);
store_inst.setAlignment(alignment);
try llvm_args.append(arg_ptr);
@@ -4682,8 +4673,7 @@ pub const FuncGen = struct {
param_ty.abiAlignment(target),
self.dg.object.target_data.abiAlignmentOfType(int_llvm_ty),
);
- const int_ptr = self.buildAlloca(int_llvm_ty);
- int_ptr.setAlignment(alignment);
+ const int_ptr = self.buildAlloca(int_llvm_ty, alignment);
const param_llvm_ty = try self.dg.lowerType(param_ty);
const casted_ptr = self.builder.buildBitCast(int_ptr, param_llvm_ty.pointerType(0), "");
const store_inst = self.builder.buildStore(llvm_arg, casted_ptr);
@@ -4709,7 +4699,7 @@ pub const FuncGen = struct {
const llvm_arg = try self.resolveInst(arg);
const is_by_ref = isByRef(param_ty);
const arg_ptr = if (is_by_ref) llvm_arg else p: {
- const p = self.buildAlloca(llvm_arg.typeOf());
+ const p = self.buildAlloca(llvm_arg.typeOf(), null);
const store_inst = self.builder.buildStore(llvm_arg, p);
store_inst.setAlignment(param_ty.abiAlignment(target));
break :p p;
@@ -4738,7 +4728,7 @@ pub const FuncGen = struct {
const llvm_arg = try self.resolveInst(arg);
const is_by_ref = isByRef(param_ty);
const arg_ptr = if (is_by_ref) llvm_arg else p: {
- const p = self.buildAlloca(llvm_arg.typeOf());
+ const p = self.buildAlloca(llvm_arg.typeOf(), null);
const store_inst = self.builder.buildStore(llvm_arg, p);
store_inst.setAlignment(param_ty.abiAlignment(target));
break :p p;
@@ -4775,7 +4765,7 @@ pub const FuncGen = struct {
const arg_ty = self.air.typeOf(arg);
var llvm_arg = try self.resolveInst(arg);
if (!isByRef(arg_ty)) {
- const p = self.buildAlloca(llvm_arg.typeOf());
+ const p = self.buildAlloca(llvm_arg.typeOf(), null);
const store_inst = self.builder.buildStore(llvm_arg, p);
store_inst.setAlignment(arg_ty.abiAlignment(target));
llvm_arg = store_inst;
@@ -4832,9 +4822,8 @@ pub const FuncGen = struct {
// In this case the function return type is honoring the calling convention by having
// a different LLVM type than the usual one. We solve this here at the callsite
// by bitcasting a pointer to our canonical type, then loading it if necessary.
- const rp = self.buildAlloca(llvm_ret_ty);
const alignment = return_type.abiAlignment(target);
- rp.setAlignment(alignment);
+ const rp = self.buildAlloca(llvm_ret_ty, alignment);
const ptr_abi_ty = abi_ret_ty.pointerType(0);
const casted_ptr = self.builder.buildBitCast(rp, ptr_abi_ty, "");
const store_inst = self.builder.buildStore(call, casted_ptr);
@@ -4851,9 +4840,8 @@ pub const FuncGen = struct {
if (isByRef(return_type)) {
// our by-ref status disagrees with sret so we must allocate, store,
// and return the allocation pointer.
- const rp = self.buildAlloca(llvm_ret_ty);
const alignment = return_type.abiAlignment(target);
- rp.setAlignment(alignment);
+ const rp = self.buildAlloca(llvm_ret_ty, alignment);
const store_inst = self.builder.buildStore(call, rp);
store_inst.setAlignment(alignment);
return rp;
@@ -4912,8 +4900,7 @@ pub const FuncGen = struct {
return null;
}
- const rp = self.buildAlloca(llvm_ret_ty);
- rp.setAlignment(alignment);
+ const rp = self.buildAlloca(llvm_ret_ty, alignment);
const store_inst = self.builder.buildStore(operand, rp);
store_inst.setAlignment(alignment);
const casted_ptr = self.builder.buildBitCast(rp, ptr_abi_ty, "");
@@ -6031,8 +6018,7 @@ pub const FuncGen = struct {
llvm_param_types[llvm_param_i] = arg_llvm_value.typeOf();
} else {
const alignment = arg_ty.abiAlignment(target);
- const arg_ptr = self.buildAlloca(arg_llvm_value.typeOf());
- arg_ptr.setAlignment(alignment);
+ const arg_ptr = self.buildAlloca(arg_llvm_value.typeOf(), alignment);
const store_inst = self.builder.buildStore(arg_llvm_value, arg_ptr);
store_inst.setAlignment(alignment);
llvm_param_values[llvm_param_i] = arg_ptr;
@@ -6533,8 +6519,7 @@ pub const FuncGen = struct {
const llvm_optional_ty = try self.dg.lowerType(optional_ty);
if (isByRef(optional_ty)) {
const target = self.dg.module.getTarget();
- const optional_ptr = self.buildAlloca(llvm_optional_ty);
- optional_ptr.setAlignment(optional_ty.abiAlignment(target));
+ const optional_ptr = self.buildAlloca(llvm_optional_ty, optional_ty.abiAlignment(target));
const payload_ptr = self.builder.buildStructGEP(llvm_optional_ty, optional_ptr, 0, "");
var ptr_ty_payload: Type.Payload.ElemType = .{
.base = .{ .tag = .single_mut_pointer },
@@ -6567,8 +6552,7 @@ pub const FuncGen = struct {
const payload_offset = errUnionPayloadOffset(payload_ty, target);
const error_offset = errUnionErrorOffset(payload_ty, target);
if (isByRef(err_un_ty)) {
- const result_ptr = self.buildAlloca(err_un_llvm_ty);
- result_ptr.setAlignment(err_un_ty.abiAlignment(target));
+ const result_ptr = self.buildAlloca(err_un_llvm_ty, err_un_ty.abiAlignment(target));
const err_ptr = self.builder.buildStructGEP(err_un_llvm_ty, result_ptr, error_offset, "");
const store_inst = self.builder.buildStore(ok_err_code, err_ptr);
store_inst.setAlignment(Type.anyerror.abiAlignment(target));
@@ -6602,8 +6586,7 @@ pub const FuncGen = struct {
const payload_offset = errUnionPayloadOffset(payload_ty, target);
const error_offset = errUnionErrorOffset(payload_ty, target);
if (isByRef(err_un_ty)) {
- const result_ptr = self.buildAlloca(err_un_llvm_ty);
- result_ptr.setAlignment(err_un_ty.abiAlignment(target));
+ const result_ptr = self.buildAlloca(err_un_llvm_ty, err_un_ty.abiAlignment(target));
const err_ptr = self.builder.buildStructGEP(err_un_llvm_ty, result_ptr, error_offset, "");
const store_inst = self.builder.buildStore(operand, err_ptr);
store_inst.setAlignment(Type.anyerror.abiAlignment(target));
@@ -7021,9 +7004,8 @@ pub const FuncGen = struct {
if (isByRef(dest_ty)) {
const target = self.dg.module.getTarget();
- const alloca_inst = self.buildAlloca(llvm_dest_ty);
const result_alignment = dest_ty.abiAlignment(target);
- alloca_inst.setAlignment(result_alignment);
+ const alloca_inst = self.buildAlloca(llvm_dest_ty, result_alignment);
{
const field_ptr = self.builder.buildStructGEP(llvm_dest_ty, alloca_inst, result_index, "");
const store_inst = self.builder.buildStore(result, field_ptr);
@@ -7373,9 +7355,8 @@ pub const FuncGen = struct {
if (isByRef(dest_ty)) {
const target = self.dg.module.getTarget();
- const alloca_inst = self.buildAlloca(llvm_dest_ty);
const result_alignment = dest_ty.abiAlignment(target);
- alloca_inst.setAlignment(result_alignment);
+ const alloca_inst = self.buildAlloca(llvm_dest_ty, result_alignment);
{
const field_ptr = self.builder.buildStructGEP(llvm_dest_ty, alloca_inst, result_index, "");
const store_inst = self.builder.buildStore(result, field_ptr);
@@ -7653,7 +7634,7 @@ pub const FuncGen = struct {
if (!result_is_ref) {
return self.dg.todo("implement bitcast vector to non-ref array", .{});
}
- const array_ptr = self.buildAlloca(llvm_dest_ty);
+ const array_ptr = self.buildAlloca(llvm_dest_ty, null);
const bitcast_ok = elem_ty.bitSize(target) == elem_ty.abiSize(target) * 8;
if (bitcast_ok) {
const llvm_vector_ty = try self.dg.lowerType(operand_ty);
@@ -7729,8 +7710,7 @@ pub const FuncGen = struct {
if (result_is_ref) {
// Bitcast the result pointer, then store.
const alignment = @maximum(operand_ty.abiAlignment(target), inst_ty.abiAlignment(target));
- const result_ptr = self.buildAlloca(llvm_dest_ty);
- result_ptr.setAlignment(alignment);
+ const result_ptr = self.buildAlloca(llvm_dest_ty, alignment);
const operand_llvm_ty = try self.dg.lowerType(operand_ty);
const casted_ptr = self.builder.buildBitCast(result_ptr, operand_llvm_ty.pointerType(0), "");
const store_inst = self.builder.buildStore(operand, casted_ptr);
@@ -7743,8 +7723,7 @@ pub const FuncGen = struct {
// but LLVM won't let us bitcast struct values.
// Therefore, we store operand to bitcasted alloca, then load for result.
const alignment = @maximum(operand_ty.abiAlignment(target), inst_ty.abiAlignment(target));
- const result_ptr = self.buildAlloca(llvm_dest_ty);
- result_ptr.setAlignment(alignment);
+ const result_ptr = self.buildAlloca(llvm_dest_ty, alignment);
const operand_llvm_ty = try self.dg.lowerType(operand_ty);
const casted_ptr = self.builder.buildBitCast(result_ptr, operand_llvm_ty.pointerType(0), "");
const store_inst = self.builder.buildStore(operand, casted_ptr);
@@ -7820,11 +7799,9 @@ pub const FuncGen = struct {
if (!pointee_type.isFnOrHasRuntimeBitsIgnoreComptime()) return self.dg.lowerPtrToVoid(ptr_ty);
const pointee_llvm_ty = try self.dg.lowerType(pointee_type);
- const alloca_inst = self.buildAlloca(pointee_llvm_ty);
const target = self.dg.module.getTarget();
const alignment = ptr_ty.ptrAlignment(target);
- alloca_inst.setAlignment(alignment);
- return alloca_inst;
+ return self.buildAlloca(pointee_llvm_ty, alignment);
}
fn airRetPtr(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
@@ -7835,15 +7812,13 @@ pub const FuncGen = struct {
if (self.ret_ptr) |ret_ptr| return ret_ptr;
const ret_llvm_ty = try self.dg.lowerType(ret_ty);
const target = self.dg.module.getTarget();
- const alloca_inst = self.buildAlloca(ret_llvm_ty);
- alloca_inst.setAlignment(ptr_ty.ptrAlignment(target));
- return alloca_inst;
+ return self.buildAlloca(ret_llvm_ty, ptr_ty.ptrAlignment(target));
}
/// Use this instead of builder.buildAlloca, because this function makes sure to
/// put the alloca instruction at the top of the function!
- fn buildAlloca(self: *FuncGen, llvm_ty: *llvm.Type) *llvm.Value {
- return buildAllocaInner(self.builder, self.llvm_func, self.di_scope != null, llvm_ty);
+ fn buildAlloca(self: *FuncGen, llvm_ty: *llvm.Type, alignment: ?c_uint) *llvm.Value {
+ return buildAllocaInner(self.builder, self.llvm_func, self.di_scope != null, llvm_ty, alignment, self.dg.module.getTarget());
}
fn airStore(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
@@ -8801,10 +8776,9 @@ pub const FuncGen = struct {
if (isByRef(result_ty)) {
const llvm_u32 = self.context.intType(32);
- const alloca_inst = self.buildAlloca(llvm_result_ty);
// TODO in debug builds init to undef so that the padding will be 0xaa
// even if we fully populate the fields.
- alloca_inst.setAlignment(result_ty.abiAlignment(target));
+ const alloca_inst = self.buildAlloca(llvm_result_ty, result_ty.abiAlignment(target));
var indices: [2]*llvm.Value = .{ llvm_u32.constNull(), undefined };
for (elements) |elem, i| {
@@ -8842,8 +8816,7 @@ pub const FuncGen = struct {
assert(isByRef(result_ty));
const llvm_usize = try self.dg.lowerType(Type.usize);
- const alloca_inst = self.buildAlloca(llvm_result_ty);
- alloca_inst.setAlignment(result_ty.abiAlignment(target));
+ const alloca_inst = self.buildAlloca(llvm_result_ty, result_ty.abiAlignment(target));
const array_info = result_ty.arrayInfo();
var elem_ptr_payload: Type.Payload.Pointer = .{
@@ -8918,7 +8891,7 @@ pub const FuncGen = struct {
// necessarily match the format that we need, depending on which tag is active. We
// must construct the correct unnamed struct type here and bitcast, in order to
// then set the fields appropriately.
- const result_ptr = self.buildAlloca(union_llvm_ty);
+ const result_ptr = self.buildAlloca(union_llvm_ty, null);
const llvm_payload = try self.resolveInst(extra.init);
assert(union_obj.haveFieldTypes());
const field = union_obj.fields.values()[extra.field_index];
@@ -9234,9 +9207,8 @@ pub const FuncGen = struct {
if (isByRef(optional_ty)) {
const target = self.dg.module.getTarget();
- const alloca_inst = self.buildAlloca(optional_llvm_ty);
const payload_alignment = optional_ty.abiAlignment(target);
- alloca_inst.setAlignment(payload_alignment);
+ const alloca_inst = self.buildAlloca(optional_llvm_ty, payload_alignment);
{
const field_ptr = self.builder.buildStructGEP(optional_llvm_ty, alloca_inst, 0, "");
@@ -9360,8 +9332,7 @@ pub const FuncGen = struct {
if (isByRef(info.pointee_type)) {
const result_align = info.pointee_type.abiAlignment(target);
const max_align = @maximum(result_align, ptr_alignment);
- const result_ptr = self.buildAlloca(elem_llvm_ty);
- result_ptr.setAlignment(max_align);
+ const result_ptr = self.buildAlloca(elem_llvm_ty, max_align);
const llvm_ptr_u8 = self.context.intType(8).pointerType(0);
const llvm_usize = self.context.intType(Type.usize.intInfo(target).bits);
const size_bytes = info.pointee_type.abiSize(target);
@@ -9394,8 +9365,7 @@ pub const FuncGen = struct {
if (isByRef(info.pointee_type)) {
const result_align = info.pointee_type.abiAlignment(target);
- const result_ptr = self.buildAlloca(elem_llvm_ty);
- result_ptr.setAlignment(result_align);
+ const result_ptr = self.buildAlloca(elem_llvm_ty, result_align);
const same_size_int = self.context.intType(elem_bits);
const truncated_int = self.builder.buildTrunc(shifted_value, same_size_int, "");
@@ -9519,8 +9489,7 @@ pub const FuncGen = struct {
.x86_64 => {
const array_llvm_ty = usize_llvm_ty.arrayType(6);
const array_ptr = fg.valgrind_client_request_array orelse a: {
- const array_ptr = fg.buildAlloca(array_llvm_ty);
- array_ptr.setAlignment(usize_alignment);
+ const array_ptr = fg.buildAlloca(array_llvm_ty, usize_alignment);
fg.valgrind_client_request_array = array_ptr;
break :a array_ptr;
};
@@ -9822,6 +9791,74 @@ fn toLlvmCallConv(cc: std.builtin.CallingConvention, target: std.Target) llvm.Ca
};
}
+/// Convert a zig-address space to an llvm address space.
+fn toLlvmAddressSpace(address_space: std.builtin.AddressSpace, target: std.Target) c_uint {
+ return switch (target.cpu.arch) {
+ .i386, .x86_64 => switch (address_space) {
+ .generic => llvm.address_space.default,
+ .gs => llvm.address_space.x86.gs,
+ .fs => llvm.address_space.x86.fs,
+ .ss => llvm.address_space.x86.ss,
+ else => unreachable,
+ },
+ .nvptx, .nvptx64 => switch (address_space) {
+ .generic => llvm.address_space.default,
+ .global => llvm.address_space.nvptx.global,
+ .constant => llvm.address_space.nvptx.constant,
+ .param => llvm.address_space.nvptx.param,
+ .shared => llvm.address_space.nvptx.shared,
+ .local => llvm.address_space.nvptx.local,
+ else => unreachable,
+ },
+ .amdgcn => switch (address_space) {
+ .generic => llvm.address_space.amdgpu.flat,
+ .global => llvm.address_space.amdgpu.global,
+ .constant => llvm.address_space.amdgpu.constant,
+ .shared => llvm.address_space.amdgpu.local,
+ .local => llvm.address_space.amdgpu.private,
+ else => unreachable,
+ },
+ else => switch (address_space) {
+ .generic => llvm.address_space.default,
+ else => unreachable,
+ },
+ };
+}
+
+/// On some targets, local values that are in the generic address space must be generated into a
+/// different address space and then cast back to the generic address space.
+/// For example, on amdgcn, local variable allocations must be generated into the private address space.
+/// This function returns the address space local values should be generated into.
+fn llvmAllocaAddressSpace(target: std.Target) c_uint {
+ return switch (target.cpu.arch) {
+ // On amdgcn, locals should be generated into the private address space.
+ // To make Zig not impossible to use, these are then converted to addresses in the
+ // generic address space and treated as regular pointers. This is the way that HIP also does it.
+ .amdgcn => llvm.address_space.amdgpu.private,
+ else => llvm.address_space.default,
+ };
+}
+
+/// On some targets, global values that are in the generic address space must be generated into a
+/// different address space, and then cast back to the generic address space.
+fn llvmDefaultGlobalAddressSpace(target: std.Target) c_uint {
+ return switch (target.cpu.arch) {
+ // On amdgcn, globals must be explicitly allocated and uploaded so that the program can access
+ // them.
+ .amdgcn => llvm.address_space.amdgpu.global,
+ else => llvm.address_space.default,
+ };
+}
+
+/// If `llvm_addrspace` is generic, convert it to the actual address space that globals
+/// should be stored in by default.
+fn toLlvmGlobalAddressSpace(llvm_addrspace: c_uint, target: std.Target) c_uint {
+ return if (llvm_addrspace == llvm.address_space.default)
+ llvmDefaultGlobalAddressSpace(target)
+ else
+ llvm_addrspace;
+}
+
/// Take into account 0 bit fields and padding. Returns null if an llvm
/// field could not be found.
/// This only happens if you want the field index of a zero sized field at
@@ -10523,25 +10560,43 @@ fn buildAllocaInner(
llvm_func: *llvm.Value,
di_scope_non_null: bool,
llvm_ty: *llvm.Type,
+ maybe_alignment: ?c_uint,
+ target: std.Target,
) *llvm.Value {
- const prev_block = builder.getInsertBlock();
- const prev_debug_location = builder.getCurrentDebugLocation2();
- defer {
- builder.positionBuilderAtEnd(prev_block);
- if (di_scope_non_null) {
- builder.setCurrentDebugLocation2(prev_debug_location);
+ const address_space = llvmAllocaAddressSpace(target);
+
+ const alloca = blk: {
+ const prev_block = builder.getInsertBlock();
+ const prev_debug_location = builder.getCurrentDebugLocation2();
+ defer {
+ builder.positionBuilderAtEnd(prev_block);
+ if (di_scope_non_null) {
+ builder.setCurrentDebugLocation2(prev_debug_location);
+ }
}
+
+ const entry_block = llvm_func.getFirstBasicBlock().?;
+ if (entry_block.getFirstInstruction()) |first_inst| {
+ builder.positionBuilder(entry_block, first_inst);
+ } else {
+ builder.positionBuilderAtEnd(entry_block);
+ }
+ builder.clearCurrentDebugLocation();
+
+ break :blk builder.buildAllocaInAddressSpace(llvm_ty, address_space, "");
+ };
+
+ if (maybe_alignment) |alignment| {
+ alloca.setAlignment(alignment);
}
- const entry_block = llvm_func.getFirstBasicBlock().?;
- if (entry_block.getFirstInstruction()) |first_inst| {
- builder.positionBuilder(entry_block, first_inst);
- } else {
- builder.positionBuilderAtEnd(entry_block);
+ // The pointer returned from this function should be in the generic address space;
+ // if that is not the case, cast it back to the generic address space.
+ if (address_space != llvm.address_space.default) {
+ return builder.buildAddrSpaceCast(alloca, llvm_ty.pointerType(llvm.address_space.default), "");
}
- builder.clearCurrentDebugLocation();
- return builder.buildAlloca(llvm_ty, "");
+ return alloca;
}
fn errUnionPayloadOffset(payload_ty: Type, target: std.Target) u1 {
diff --git a/src/codegen/llvm/bindings.zig b/src/codegen/llvm/bindings.zig
index 96f4477daa..a5b01d6ddf 100644
--- a/src/codegen/llvm/bindings.zig
+++ b/src/codegen/llvm/bindings.zig
@@ -171,6 +171,9 @@ pub const Value = opaque {
pub const constAdd = LLVMConstAdd;
extern fn LLVMConstAdd(LHSConstant: *Value, RHSConstant: *Value) *Value;
+ pub const constAddrSpaceCast = LLVMConstAddrSpaceCast;
+ extern fn LLVMConstAddrSpaceCast(ConstantVal: *Value, ToType: *Type) *Value;
+
pub const setWeak = LLVMSetWeak;
extern fn LLVMSetWeak(CmpXchgInst: *Value, IsWeak: Bool) void;
@@ -956,6 +959,12 @@ pub const Builder = opaque {
pub const setFastMath = ZigLLVMSetFastMath;
extern fn ZigLLVMSetFastMath(B: *Builder, on_state: bool) void;
+
+ pub const buildAddrSpaceCast = LLVMBuildAddrSpaceCast;
+ extern fn LLVMBuildAddrSpaceCast(B: *Builder, Val: *Value, DestTy: *Type, Name: [*:0]const u8) *Value;
+
+ pub const buildAllocaInAddressSpace = ZigLLVMBuildAllocaInAddressSpace;
+ extern fn ZigLLVMBuildAllocaInAddressSpace(B: *Builder, Ty: *Type, AddressSpace: c_uint, Name: [*:0]const u8) *Value;
};
pub const MDString = opaque {
diff --git a/src/zig_llvm.cpp b/src/zig_llvm.cpp
index 4f9cd76c6a..b5edb336a5 100644
--- a/src/zig_llvm.cpp
+++ b/src/zig_llvm.cpp
@@ -512,22 +512,22 @@ LLVMValueRef ZigLLVMBuildUSubSat(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRe
LLVMValueRef ZigLLVMBuildSMulFixSat(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *name) {
llvm::Type* types[1] = {
- unwrap(LHS)->getType(),
+ unwrap(LHS)->getType(),
};
// pass scale = 0 as third argument
llvm::Value* values[3] = {unwrap(LHS), unwrap(RHS), unwrap(B)->getInt32(0)};
-
+
CallInst *call_inst = unwrap(B)->CreateIntrinsic(Intrinsic::smul_fix_sat, types, values, nullptr, name);
return wrap(call_inst);
}
LLVMValueRef ZigLLVMBuildUMulFixSat(LLVMBuilderRef B, LLVMValueRef LHS, LLVMValueRef RHS, const char *name) {
llvm::Type* types[1] = {
- unwrap(LHS)->getType(),
+ unwrap(LHS)->getType(),
};
// pass scale = 0 as third argument
llvm::Value* values[3] = {unwrap(LHS), unwrap(RHS), unwrap(B)->getInt32(0)};
-
+
CallInst *call_inst = unwrap(B)->CreateIntrinsic(Intrinsic::umul_fix_sat, types, values, nullptr, name);
return wrap(call_inst);
}
@@ -808,7 +808,7 @@ void ZigLLVMSetCurrentDebugLocation2(LLVMBuilderRef builder, unsigned int line,
unsigned int column, ZigLLVMDIScope *scope, ZigLLVMDILocation *inlined_at)
{
DIScope* di_scope = reinterpret_cast(scope);
- DebugLoc debug_loc = DILocation::get(di_scope->getContext(), line, column, di_scope,
+ DebugLoc debug_loc = DILocation::get(di_scope->getContext(), line, column, di_scope,
reinterpret_cast(inlined_at), false);
unwrap(builder)->SetCurrentDebugLocation(debug_loc);
}
@@ -1177,9 +1177,14 @@ LLVMValueRef ZigLLVMBuildAShrExact(LLVMBuilderRef builder, LLVMValueRef LHS, LLV
return wrap(unwrap(builder)->CreateAShr(unwrap(LHS), unwrap(RHS), name, true));
}
+LLVMValueRef ZigLLVMBuildAllocaInAddressSpace(LLVMBuilderRef builder, LLVMTypeRef Ty,
+ unsigned AddressSpace, const char *Name) {
+ return wrap(unwrap(builder)->CreateAlloca(unwrap(Ty), AddressSpace, nullptr, Name));
+}
+
void ZigLLVMSetTailCall(LLVMValueRef Call) {
unwrap(Call)->setTailCallKind(CallInst::TCK_MustTail);
-}
+}
void ZigLLVMSetCallSret(LLVMValueRef Call, LLVMTypeRef return_type) {
CallInst *call_inst = unwrap(Call);
diff --git a/src/zig_llvm.h b/src/zig_llvm.h
index 7fdddda6a4..1a4d5481b6 100644
--- a/src/zig_llvm.h
+++ b/src/zig_llvm.h
@@ -162,6 +162,8 @@ ZIG_EXTERN_C LLVMValueRef ZigLLVMBuildLShrExact(LLVMBuilderRef builder, LLVMValu
ZIG_EXTERN_C LLVMValueRef ZigLLVMBuildAShrExact(LLVMBuilderRef builder, LLVMValueRef LHS, LLVMValueRef RHS,
const char *name);
+ZIG_EXTERN_C LLVMValueRef ZigLLVMBuildAllocaInAddressSpace(LLVMBuilderRef builder, LLVMTypeRef Ty, unsigned AddressSpace,
+ const char *Name);
ZIG_EXTERN_C struct ZigLLVMDIType *ZigLLVMCreateDebugPointerType(struct ZigLLVMDIBuilder *dibuilder,
struct ZigLLVMDIType *pointee_type, uint64_t size_in_bits, uint64_t align_in_bits, const char *name);
From 5d429b03e3d43e937e2b517d594275034a873959 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Mon, 22 Aug 2022 00:21:31 +0200
Subject: [PATCH 07/17] stage2: add @addrSpaceCast builtin
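A minimal usage sketch (illustrative only; the helper below is
hypothetical). The builtin takes the destination address space first
and the pointer second:

    // Cast a pointer out of the .shared address space into the .generic
    // address space. Sema lowers this to the new addrspace_cast AIR
    // instruction; the LLVM backend emits an addrspacecast.
    fn toGeneric(ptr: *addrspace(.shared) u32) *u32 {
        return @addrSpaceCast(.generic, ptr);
    }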
---
src/Air.zig | 5 ++++
src/AstGen.zig | 8 ++++++
src/BuiltinFn.zig | 8 ++++++
src/Liveness.zig | 2 ++
src/Module.zig | 2 +-
src/Sema.zig | 54 ++++++++++++++++++++++++++++++++++--
src/Zir.zig | 3 ++
src/arch/aarch64/CodeGen.zig | 1 +
src/arch/arm/CodeGen.zig | 1 +
src/arch/riscv64/CodeGen.zig | 1 +
src/arch/sparc64/CodeGen.zig | 1 +
src/arch/wasm/CodeGen.zig | 1 +
src/arch/x86_64/CodeGen.zig | 1 +
src/codegen/c.zig | 1 +
src/codegen/llvm.zig | 12 ++++++++
src/print_air.zig | 1 +
src/print_zir.zig | 1 +
17 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/src/Air.zig b/src/Air.zig
index 46ba297003..57479af590 100644
--- a/src/Air.zig
+++ b/src/Air.zig
@@ -729,6 +729,10 @@ pub const Inst = struct {
/// Sets the operand as the current error return trace,
set_err_return_trace,
+ /// Convert the address space of a pointer.
+ /// Uses the `ty_op` field.
+ addrspace_cast,
+
pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag {
switch (op) {
.lt => return if (optimized) .cmp_lt_optimized else .cmp_lt,
@@ -1138,6 +1142,7 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index) Type {
.popcount,
.byte_swap,
.bit_reverse,
+ .addrspace_cast,
=> return air.getRefType(datas[inst].ty_op.ty),
.loop,
diff --git a/src/AstGen.zig b/src/AstGen.zig
index 7534a0d2cc..7bb2ef765c 100644
--- a/src/AstGen.zig
+++ b/src/AstGen.zig
@@ -7789,6 +7789,14 @@ fn builtinCall(
});
return rvalue(gz, rl, result, node);
},
+ .addrspace_cast => {
+ const result = try gz.addExtendedPayload(.addrspace_cast, Zir.Inst.BinNode{
+ .lhs = try comptimeExpr(gz, scope, .{ .ty = .address_space_type }, params[0]),
+ .rhs = try expr(gz, scope, .none, params[1]),
+ .node = gz.nodeIndexToRelative(node),
+ });
+ return rvalue(gz, rl, result, node);
+ },
// zig fmt: off
.has_decl => return hasDeclOrField(gz, scope, rl, node, params[0], params[1], .has_decl),
diff --git a/src/BuiltinFn.zig b/src/BuiltinFn.zig
index 3a13dde1ab..eb878873a0 100644
--- a/src/BuiltinFn.zig
+++ b/src/BuiltinFn.zig
@@ -2,6 +2,7 @@ const std = @import("std");
pub const Tag = enum {
add_with_overflow,
+ addrspace_cast,
align_cast,
align_of,
as,
@@ -152,6 +153,13 @@ pub const list = list: {
.param_count = 4,
},
},
+ .{
+ "@addrSpaceCast",
+ .{
+ .tag = .addrspace_cast,
+ .param_count = 2,
+ },
+ },
.{
"@alignCast",
.{
diff --git a/src/Liveness.zig b/src/Liveness.zig
index 5a4bd2265e..54a5041e8b 100644
--- a/src/Liveness.zig
+++ b/src/Liveness.zig
@@ -268,6 +268,7 @@ pub fn categorizeOperand(
.bit_reverse,
.splat,
.error_set_has_value,
+ .addrspace_cast,
=> {
const o = air_datas[inst].ty_op;
if (o.operand == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none);
@@ -844,6 +845,7 @@ fn analyzeInst(
.bit_reverse,
.splat,
.error_set_has_value,
+ .addrspace_cast,
=> {
const o = inst_datas[inst].ty_op;
return trackOperands(a, new_set, inst, main_tomb, .{ o.operand, .none, .none });
diff --git a/src/Module.zig b/src/Module.zig
index 44502ab564..7d87bdba53 100644
--- a/src/Module.zig
+++ b/src/Module.zig
@@ -4617,7 +4617,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool {
.constant => target_util.defaultAddressSpace(target, .global_constant),
else => unreachable,
},
- else => |addrspace_ref| try sema.analyzeAddrspace(&block_scope, address_space_src, addrspace_ref, addrspace_ctx),
+ else => |addrspace_ref| try sema.analyzeAddressSpace(&block_scope, address_space_src, addrspace_ref, addrspace_ctx),
};
};
diff --git a/src/Sema.zig b/src/Sema.zig
index b357beafdf..9a6c2acb14 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -975,8 +975,9 @@ fn analyzeBodyInner(
.reify => try sema.zirReify( block, extended, inst),
.builtin_async_call => try sema.zirBuiltinAsyncCall( block, extended),
.cmpxchg => try sema.zirCmpxchg( block, extended),
-
+ .addrspace_cast => try sema.zirAddrSpaceCast( block, extended),
// zig fmt: on
+
.fence => {
try sema.zirFence(block, extended);
i += 1;
@@ -16250,7 +16251,7 @@ fn zirPtrType(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air
const address_space = if (inst_data.flags.has_addrspace) blk: {
const ref = @intToEnum(Zir.Inst.Ref, sema.code.extra[extra_i]);
extra_i += 1;
- break :blk try sema.analyzeAddrspace(block, addrspace_src, ref, .pointer);
+ break :blk try sema.analyzeAddressSpace(block, addrspace_src, ref, .pointer);
} else .generic;
const bit_offset = if (inst_data.flags.has_bit_range) blk: {
@@ -18170,6 +18171,53 @@ fn reifyStruct(
return sema.analyzeDeclVal(block, src, new_decl_index);
}
+fn zirAddrSpaceCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.InstData) CompileError!Air.Inst.Ref {
+ const extra = sema.code.extraData(Zir.Inst.BinNode, extended.operand).data;
+ const src = LazySrcLoc.nodeOffset(extra.node);
+ const addrspace_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = extra.node };
+ const ptr_src: LazySrcLoc = .{ .node_offset_builtin_call_arg1 = extra.node };
+
+ const dest_addrspace = try sema.analyzeAddressSpace(block, addrspace_src, extra.lhs, .pointer);
+ const ptr = try sema.resolveInst(extra.rhs);
+ const ptr_ty = sema.typeOf(ptr);
+
+ // TODO in addition to pointers, this instruction is supposed to work for
+ // pointer-like optionals and slices.
+ try sema.checkPtrOperand(block, ptr_src, ptr_ty);
+
+ // TODO check address space cast validity.
+ const src_addrspace = ptr_ty.ptrAddressSpace();
+ _ = src_addrspace;
+
+ const ptr_info = ptr_ty.ptrInfo().data;
+ const dest_ty = try Type.ptr(sema.arena, sema.mod, .{
+ .pointee_type = ptr_info.pointee_type,
+ .@"align" = ptr_info.@"align",
+ .@"addrspace" = dest_addrspace,
+ .mutable = ptr_info.mutable,
+ .@"allowzero" = ptr_info.@"allowzero",
+ .@"volatile" = ptr_info.@"volatile",
+ .size = ptr_info.size,
+ });
+
+ if (try sema.resolveMaybeUndefVal(block, ptr_src, ptr)) |val| {
+ // The pointer value should be compatible with both address spaces.
+ // TODO: Figure out why this generates an invalid bitcast.
+ return sema.addConstant(dest_ty, val);
+ }
+
+ try sema.requireRuntimeBlock(block, src, ptr_src);
+ // TODO: Address space cast safety?
+
+ return block.addInst(.{
+ .tag = .addrspace_cast,
+ .data = .{ .ty_op = .{
+ .ty = try sema.addType(dest_ty),
+ .operand = ptr,
+ } },
+ });
+}
+
fn zirTypeName(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref {
const inst_data = sema.code.instructions.items(.data)[inst].un_node;
const ty_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = inst_data.src_node };
@@ -30292,7 +30340,7 @@ pub const AddressSpaceContext = enum {
pointer,
};
-pub fn analyzeAddrspace(
+pub fn analyzeAddressSpace(
sema: *Sema,
block: *Block,
src: LazySrcLoc,
diff --git a/src/Zir.zig b/src/Zir.zig
index 890109fcb0..351330b7c4 100644
--- a/src/Zir.zig
+++ b/src/Zir.zig
@@ -1969,6 +1969,9 @@ pub const Inst = struct {
/// `small` 0=>weak 1=>strong
/// `operand` is payload index to `Cmpxchg`.
cmpxchg,
+ /// Implements the builtin `@addrSpaceCast`.
+ /// `operand` is payload index to `BinNode`. `lhs` is the destination address space, `rhs` is the pointer operand.
+ addrspace_cast,
pub const InstData = struct {
opcode: Extended,
diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig
index ed3a281b80..2758fd36df 100644
--- a/src/arch/aarch64/CodeGen.zig
+++ b/src/arch/aarch64/CodeGen.zig
@@ -677,6 +677,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.union_init => try self.airUnionInit(inst),
.prefetch => try self.airPrefetch(inst),
.mul_add => try self.airMulAdd(inst),
+ .addrspace_cast => return self.fail("TODO implement addrspace_cast", .{}),
.@"try" => try self.airTry(inst),
.try_ptr => try self.airTryPtr(inst),
diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig
index 95dfb2eea3..855951f5fa 100644
--- a/src/arch/arm/CodeGen.zig
+++ b/src/arch/arm/CodeGen.zig
@@ -690,6 +690,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.union_init => try self.airUnionInit(inst),
.prefetch => try self.airPrefetch(inst),
.mul_add => try self.airMulAdd(inst),
+ .addrspace_cast => return self.fail("TODO implement addrspace_cast", .{}),
.@"try" => try self.airTry(inst),
.try_ptr => try self.airTryPtr(inst),
diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig
index 87e81748f9..dd31bfb6f7 100644
--- a/src/arch/riscv64/CodeGen.zig
+++ b/src/arch/riscv64/CodeGen.zig
@@ -604,6 +604,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.union_init => try self.airUnionInit(inst),
.prefetch => try self.airPrefetch(inst),
.mul_add => try self.airMulAdd(inst),
+ .addrspace_cast => @panic("TODO"),
.@"try" => @panic("TODO"),
.try_ptr => @panic("TODO"),
diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig
index cfcfedf7cc..6217119f34 100644
--- a/src/arch/sparc64/CodeGen.zig
+++ b/src/arch/sparc64/CodeGen.zig
@@ -618,6 +618,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.union_init => @panic("TODO try self.airUnionInit(inst)"),
.prefetch => try self.airPrefetch(inst),
.mul_add => @panic("TODO try self.airMulAdd(inst)"),
+ .addrspace_cast => @panic("TODO try self.airAddrSpaceCast(inst)"),
.@"try" => try self.airTry(inst),
.try_ptr => @panic("TODO try self.airTryPtr(inst)"),
diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig
index f27957d3f5..619addfba1 100644
--- a/src/arch/wasm/CodeGen.zig
+++ b/src/arch/wasm/CodeGen.zig
@@ -1699,6 +1699,7 @@ fn genInst(self: *Self, inst: Air.Inst.Index) !WValue {
.set_err_return_trace,
.is_named_enum_value,
.error_set_has_value,
+ .addrspace_cast,
=> |tag| return self.fail("TODO: Implement wasm inst: {s}", .{@tagName(tag)}),
.add_optimized,
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 27eb11c649..abba07b0e8 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -695,6 +695,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void {
.union_init => try self.airUnionInit(inst),
.prefetch => try self.airPrefetch(inst),
.mul_add => try self.airMulAdd(inst),
+ .addrspace_cast => return self.fail("TODO implement addrspace_cast", .{}),
.@"try" => try self.airTry(inst),
.try_ptr => try self.airTryPtr(inst),
diff --git a/src/codegen/c.zig b/src/codegen/c.zig
index b25e05e118..072091d9b2 100644
--- a/src/codegen/c.zig
+++ b/src/codegen/c.zig
@@ -1871,6 +1871,7 @@ fn genBody(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, OutO
.aggregate_init => try airAggregateInit(f, inst),
.union_init => try airUnionInit(f, inst),
.prefetch => try airPrefetch(f, inst),
+ .addrspace_cast => return f.fail("TODO: C backend: implement addrspace_cast", .{}),
.@"try" => try airTry(f, inst),
.try_ptr => try airTryPtr(f, inst),
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index f400e841ea..aebd7a7dd0 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -4512,6 +4512,7 @@ pub const FuncGen = struct {
.aggregate_init => try self.airAggregateInit(inst),
.union_init => try self.airUnionInit(inst),
.prefetch => try self.airPrefetch(inst),
+ .addrspace_cast => try self.airAddrSpaceCast(inst),
.is_named_enum_value => try self.airIsNamedEnumValue(inst),
.error_set_has_value => try self.airErrorSetHasValue(inst),
@@ -9045,6 +9046,17 @@ pub const FuncGen = struct {
return null;
}
+ fn airAddrSpaceCast(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value {
+ if (self.liveness.isUnused(inst)) return null;
+
+ const ty_op = self.air.instructions.items(.data)[inst].ty_op;
+ const inst_ty = self.air.typeOfIndex(inst);
+ const operand = try self.resolveInst(ty_op.operand);
+
+ const llvm_dest_ty = try self.dg.lowerType(inst_ty);
+ return self.builder.buildAddrSpaceCast(operand, llvm_dest_ty, "");
+ }
+
fn softF80TruncOrExt(
self: *FuncGen,
operand: *llvm.Value,
diff --git a/src/print_air.zig b/src/print_air.zig
index fb6f7e6cf2..d3523c0fc6 100644
--- a/src/print_air.zig
+++ b/src/print_air.zig
@@ -244,6 +244,7 @@ const Writer = struct {
.byte_swap,
.bit_reverse,
.error_set_has_value,
+ .addrspace_cast,
=> try w.writeTyOp(s, inst),
.block,
diff --git a/src/print_zir.zig b/src/print_zir.zig
index 8f055e9ddd..f2a79d53a4 100644
--- a/src/print_zir.zig
+++ b/src/print_zir.zig
@@ -512,6 +512,7 @@ const Writer = struct {
.err_set_cast,
.wasm_memory_grow,
.prefetch,
+ .addrspace_cast,
=> {
const inst_data = self.code.extraData(Zir.Inst.BinNode, extended.operand).data;
const src = LazySrcLoc.nodeOffset(inst_data.node);
From 9f14681473140cd79e6d38cb2bb46a90c1be1259 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 27 Aug 2022 12:55:28 +0200
Subject: [PATCH 08/17] stage2: check address space cast validity
---
lib/std/target.zig | 11 +++++++++++
src/Sema.zig | 12 ++++++++++--
src/target.zig | 21 ++++++++++++++++++++-
3 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/lib/std/target.zig b/lib/std/target.zig
index b6a8a8b9c0..139df629c5 100644
--- a/lib/std/target.zig
+++ b/lib/std/target.zig
@@ -1157,6 +1157,17 @@ pub const Target = struct {
};
}
+ /// Returns whether this architecture supporst the address space
+ pub fn supportsAddressSpace(arch: Arch, address_space: std.builtin.AddressSpace) bool {
+ const is_nvptx = arch == .nvptx or arch == .nvptx64;
+ return switch (address_space) {
+ .generic => true,
+ .fs, .gs, .ss => arch == .x86_64 or arch == .i386,
+ .global, .constant, .local, .shared => arch == .amdgcn or is_nvptx,
+ .param => is_nvptx,
+ };
+ }
+
pub fn ptrBitWidth(arch: Arch) u16 {
switch (arch) {
.avr,
diff --git a/src/Sema.zig b/src/Sema.zig
index 9a6c2acb14..9d05d53536 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -18181,13 +18181,21 @@ fn zirAddrSpaceCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.Inst
const ptr = try sema.resolveInst(extra.rhs);
const ptr_ty = sema.typeOf(ptr);
+
// TODO in addition to pointers, this instruction is supposed to work for
// pointer-like optionals and slices.
try sema.checkPtrOperand(block, ptr_src, ptr_ty);
- // TODO check address space cast validity.
const src_addrspace = ptr_ty.ptrAddressSpace();
- _ = src_addrspace;
+ if (!target_util.addrSpaceCastIsValid(sema.mod.getTarget(), src_addrspace, dest_addrspace)) {
+ const msg = msg: {
+ const msg = try sema.errMsg(block, src, "invalid address space cast", .{});
+ errdefer msg.destroy(sema.gpa);
+ try sema.errNote(block, src, msg, "address space '{s}' is not compatible with address space '{s}'", .{ @tagName(src_addrspace), @tagName(dest_addrspace) });
+ break :msg msg;
+ };
+ return sema.failWithOwnedErrorMsg(msg);
+ }
const ptr_info = ptr_ty.ptrInfo().data;
const dest_ty = try Type.ptr(sema.arena, sema.mod, .{
diff --git a/src/target.zig b/src/target.zig
index b7da04e548..3fbaf6abc4 100644
--- a/src/target.zig
+++ b/src/target.zig
@@ -1,5 +1,6 @@
const std = @import("std");
const Type = @import("type.zig").Type;
+const AddressSpace = std.builtin.AddressSpace;
pub const ArchOsAbi = struct {
arch: std.Target.Cpu.Arch,
@@ -635,12 +636,30 @@ pub fn defaultAddressSpace(
/// Query the default address space for functions themselves.
function,
},
-) std.builtin.AddressSpace {
+) AddressSpace {
_ = target;
_ = context;
return .generic;
}
+/// Returns true if pointers in `from` can be converted to a pointer in `to`.
+pub fn addrSpaceCastIsValid(
+ target: std.Target,
+ from: AddressSpace,
+ to: AddressSpace,
+) bool {
+ const arch = target.cpu.arch;
+ switch (arch) {
+ .x86_64, .i386 => return arch.supportsAddressSpace(from) and arch.supportsAddressSpace(to),
+ .amdgcn => {
+ const to_generic = arch.supportsAddressSpace(from) and to == .generic;
+ const from_generic = arch.supportsAddressSpace(to) and from == .generic;
+ return to_generic or from_generic;
+ },
+ else => return from == .generic and to == .generic,
+ }
+}
+
pub fn llvmMachineAbi(target: std.Target) ?[:0]const u8 {
const have_float = switch (target.abi) {
.gnuilp32 => return "ilp32",
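The amdgcn branch above is easy to misread, so here is a distilled restatement as a
self-contained Zig test (illustration only; the real check delegates to
Arch.supportsAddressSpace rather than hard-coding the supported set):

    const std = @import("std");
    const AddressSpace = std.builtin.AddressSpace;

    // Casts on amdgcn must go through the generic (flat) address space: a
    // supported address space may be widened to .generic, and .generic may be
    // narrowed to a supported address space. Direct casts between two
    // non-generic address spaces are rejected.
    fn amdgcnCastIsValid(from: AddressSpace, to: AddressSpace) bool {
        return (supportedOnAmdgcn(from) and to == .generic) or
            (from == .generic and supportedOnAmdgcn(to));
    }

    fn supportedOnAmdgcn(as: AddressSpace) bool {
        return switch (as) {
            .generic, .global, .constant, .local, .shared => true,
            else => false,
        };
    }

    test "amdgcn address space cast validity" {
        try std.testing.expect(amdgcnCastIsValid(.global, .generic));
        try std.testing.expect(amdgcnCastIsValid(.generic, .shared));
        try std.testing.expect(!amdgcnCastIsValid(.global, .shared));
    }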
From fb9a7dad178398326033dfcddedf41b59227bc70 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 17 Sep 2022 00:24:51 +0200
Subject: [PATCH 09/17] add @addrSpaceCast to langref
---
doc/langref.html.in | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/doc/langref.html.in b/doc/langref.html.in
index effa974f22..872c305252 100644
--- a/doc/langref.html.in
+++ b/doc/langref.html.in
@@ -7956,6 +7956,15 @@ fn readFile(allocator: Allocator, filename: []const u8) ![]u8 {
The {#syntax#}comptime{#endsyntax#} keyword on a parameter means that the parameter must be known
at compile time.
+ {#header_open|@addrSpaceCast#}
+ {#syntax#}@addrSpaceCast(comptime addrspace: std.builtin.AddressSpace, ptr: anytype) anytype{#endsyntax#}
+
+ Converts a pointer from one address space to another. Depending on the current target and
+ address spaces, this cast may be a no-op, a complex operation, or illegal. If the cast is
+ legal, then the resulting pointer points to the same memory location as the pointer operand.
+ It is always valid to cast a pointer between the same address spaces.
+
+ {#header_close#}
{#header_open|@addWithOverflow#}
{#syntax#}@addWithOverflow(comptime T: type, a: T, b: T, result: *T) bool{#endsyntax#}
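To make the langref description concrete, here is a hypothetical usage sketch. The
function name, parameter, and address spaces are assumptions rather than part of the
patch, and it only compiles when targeting amdgcn (the AmdgpuKernel calling convention
comes from an earlier patch in this series):

    // The parameter arrives as a pointer into the GPU's global memory. Widening it
    // into the generic (flat) address space lets it be handed to code that does not
    // care where the memory lives. On amdgcn the cast is valid because one side of
    // the cast is .generic.
    export fn bump(counter: *addrspace(.global) u32) callconv(.AmdgpuKernel) void {
        const generic_ptr: *u32 = @addrSpaceCast(.generic, counter);
        generic_ptr.* += 1;
    }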
From 76ad9cb10eeae5257eb51838953d0e3bcb993fa3 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 17 Sep 2022 01:24:13 +0200
Subject: [PATCH 10/17] backport @addrSpaceCast to stage 1
---
src/Sema.zig | 1 -
src/stage1/all_types.hpp | 14 ++++++++++++
src/stage1/analyze.cpp | 5 +++++
src/stage1/astgen.cpp | 34 +++++++++++++++++++++++++++++
src/stage1/codegen.cpp | 3 ++-
src/stage1/ir.cpp | 47 ++++++++++++++++++++++++++++++++++++++++
src/stage1/ir_print.cpp | 13 +++++++++++
7 files changed, 115 insertions(+), 2 deletions(-)
diff --git a/src/Sema.zig b/src/Sema.zig
index 9d05d53536..d597075df2 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -18181,7 +18181,6 @@ fn zirAddrSpaceCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.Inst
const ptr = try sema.resolveInst(extra.rhs);
const ptr_ty = sema.typeOf(ptr);
-
// TODO in addition to pointers, this instruction is supposed to work for
// pointer-like optionals and slices.
try sema.checkPtrOperand(block, ptr_src, ptr_ty);
diff --git a/src/stage1/all_types.hpp b/src/stage1/all_types.hpp
index f427548fdb..88dac9107f 100644
--- a/src/stage1/all_types.hpp
+++ b/src/stage1/all_types.hpp
@@ -95,6 +95,11 @@ enum AddressSpace {
AddressSpaceGS,
AddressSpaceFS,
AddressSpaceSS,
+ AddressSpaceGlobal,
+ AddressSpaceConstant,
+ AddressSpaceParam,
+ AddressSpaceShared,
+ AddressSpaceLocal
};
// This one corresponds to the builtin.zig enum.
@@ -1842,6 +1847,7 @@ enum BuiltinFnId {
BuiltinFnIdMaximum,
BuiltinFnIdMinimum,
BuiltinFnIdPrefetch,
+ BuiltinFnIdAddrSpaceCast,
};
struct BuiltinFnEntry {
@@ -2673,6 +2679,7 @@ enum Stage1ZirInstId : uint8_t {
Stage1ZirInstIdWasmMemoryGrow,
Stage1ZirInstIdSrc,
Stage1ZirInstIdPrefetch,
+ Stage1ZirInstIdAddrSpaceCast,
};
// ir_render_* functions in codegen.cpp consume Gen instructions and produce LLVM IR.
@@ -4169,6 +4176,13 @@ struct Stage1AirInstAlignCast {
Stage1AirInst *target;
};
+struct Stage1ZirInstAddrSpaceCast {
+ Stage1ZirInst base;
+
+ Stage1ZirInst *addrspace;
+ Stage1ZirInst *ptr;
+};
+
struct Stage1ZirInstSetAlignStack {
Stage1ZirInst base;
diff --git a/src/stage1/analyze.cpp b/src/stage1/analyze.cpp
index 15b8789997..a102f2e340 100644
--- a/src/stage1/analyze.cpp
+++ b/src/stage1/analyze.cpp
@@ -1030,6 +1030,11 @@ const char *address_space_name(AddressSpace as) {
case AddressSpaceGS: return "gs";
case AddressSpaceFS: return "fs";
case AddressSpaceSS: return "ss";
+ case AddressSpaceGlobal: return "global";
+ case AddressSpaceConstant: return "constant";
+ case AddressSpaceParam: return "param";
+ case AddressSpaceShared: return "shared";
+ case AddressSpaceLocal: return "local";
}
zig_unreachable();
}
diff --git a/src/stage1/astgen.cpp b/src/stage1/astgen.cpp
index 9eea2e650e..2d053a9e3c 100644
--- a/src/stage1/astgen.cpp
+++ b/src/stage1/astgen.cpp
@@ -351,6 +351,8 @@ void destroy_instruction_src(Stage1ZirInst *inst) {
return heap::c_allocator.destroy(reinterpret_cast<Stage1ZirInstSrc *>(inst));
case Stage1ZirInstIdPrefetch:
return heap::c_allocator.destroy(reinterpret_cast<Stage1ZirInstPrefetch *>(inst));
+ case Stage1ZirInstIdAddrSpaceCast:
+ return heap::c_allocator.destroy(reinterpret_cast<Stage1ZirInstAddrSpaceCast *>(inst));
}
zig_unreachable();
}
@@ -947,6 +949,10 @@ static constexpr Stage1ZirInstId ir_inst_id(Stage1ZirInstPrefetch *) {
return Stage1ZirInstIdPrefetch;
}
+static constexpr Stage1ZirInstId ir_inst_id(Stage1ZirInstAddrSpaceCast *) {
+ return Stage1ZirInstIdAddrSpaceCast;
+}
+
template<typename T>
static T *ir_create_instruction(Stage1AstGen *ag, Scope *scope, AstNode *source_node) {
T *special_instruction = heap::c_allocator.create<T>();
@@ -2572,6 +2578,19 @@ static Stage1ZirInst *ir_build_align_cast_src(Stage1AstGen *ag, Scope *scope, As
return &instruction->base;
}
+static Stage1ZirInst *ir_build_addrspace_cast(Stage1AstGen *ag, Scope *scope, AstNode *source_node,
+ Stage1ZirInst *addrspace, Stage1ZirInst *ptr)
+{
+ Stage1ZirInstAddrSpaceCast *instruction = ir_build_instruction<Stage1ZirInstAddrSpaceCast>(ag, scope, source_node);
+ instruction->addrspace = addrspace;
+ instruction->ptr = ptr;
+
+ ir_ref_instruction(addrspace, ag->current_basic_block);
+ ir_ref_instruction(ptr, ag->current_basic_block);
+
+ return &instruction->base;
+}
+
static Stage1ZirInst *ir_build_resolve_result(Stage1AstGen *ag, Scope *scope, AstNode *source_node,
ResultLoc *result_loc, Stage1ZirInst *ty)
{
@@ -5459,6 +5478,21 @@ static Stage1ZirInst *astgen_builtin_fn_call(Stage1AstGen *ag, Scope *scope, Ast
Stage1ZirInst *ir_extern = ir_build_prefetch(ag, scope, node, ptr_value, casted_options_value);
return ir_lval_wrap(ag, scope, ir_extern, lval, result_loc);
}
+ case BuiltinFnIdAddrSpaceCast:
+ {
+ AstNode *arg0_node = node->data.fn_call_expr.params.at(0);
+ Stage1ZirInst *arg0_value = astgen_node(ag, arg0_node, scope);
+ if (arg0_value == ag->codegen->invalid_inst_src)
+ return arg0_value;
+
+ AstNode* arg1_node = node->data.fn_call_expr.params.at(1);
+ Stage1ZirInst *arg1_value = astgen_node(ag, arg1_node, scope);
+ if (arg1_value == ag->codegen->invalid_inst_src)
+ return arg1_value;
+
+ Stage1ZirInst *addrspace_cast = ir_build_addrspace_cast(ag, scope, node, arg0_value, arg1_value);
+ return ir_lval_wrap(ag, scope, addrspace_cast, lval, result_loc);
+ }
}
zig_unreachable();
}
diff --git a/src/stage1/codegen.cpp b/src/stage1/codegen.cpp
index 6e1593ccca..2a05ba44d1 100644
--- a/src/stage1/codegen.cpp
+++ b/src/stage1/codegen.cpp
@@ -4371,7 +4371,7 @@ static LLVMValueRef ir_render_binary_not(CodeGen *g, Stage1Air *executable,
static LLVMValueRef gen_soft_float_neg(CodeGen *g, ZigType *operand_type, LLVMValueRef operand) {
uint32_t vector_len = operand_type->id == ZigTypeIdVector ? operand_type->data.vector.len : 0;
- uint16_t num_bits = operand_type->id == ZigTypeIdVector ?
+ uint16_t num_bits = operand_type->id == ZigTypeIdVector ?
operand_type->data.vector.elem_type->data.floating.bit_count :
operand_type->data.floating.bit_count;
@@ -10085,6 +10085,7 @@ static void define_builtin_fns(CodeGen *g) {
create_builtin_fn(g, BuiltinFnIdMaximum, "maximum", 2);
create_builtin_fn(g, BuiltinFnIdMinimum, "minimum", 2);
create_builtin_fn(g, BuiltinFnIdPrefetch, "prefetch", 2);
+ create_builtin_fn(g, BuiltinFnIdAddrSpaceCast, "addrSpaceCast", 2);
}
static const char *bool_to_str(bool b) {
diff --git a/src/stage1/ir.cpp b/src/stage1/ir.cpp
index c5f15c5cc9..9627384fbc 100644
--- a/src/stage1/ir.cpp
+++ b/src/stage1/ir.cpp
@@ -23746,6 +23746,50 @@ static Stage1AirInst *ir_analyze_instruction_align_cast(IrAnalyze *ira, Stage1Zi
return result;
}
+static bool ir_resolve_addrspace(IrAnalyze *ira, Stage1AirInst *value, AddressSpace *out) {
+ if (type_is_invalid(value->value->type))
+ return false;
+
+ ZigType *addrspace_type = get_builtin_type(ira->codegen, "AddressSpace");
+
+ Stage1AirInst *casted_value = ir_implicit_cast(ira, value, addrspace_type);
+ if (type_is_invalid(casted_value->value->type))
+ return false;
+
+ ZigValue *const_val = ir_resolve_const(ira, casted_value, UndefBad);
+ if (!const_val)
+ return false;
+
+ *out = (AddressSpace)bigint_as_u32(&const_val->data.x_enum_tag);
+ return true;
+}
+
+static Stage1AirInst *ir_analyze_instruction_addrspace_cast(IrAnalyze *ira, Stage1ZirInstAddrSpaceCast *instruction) {
+ Stage1AirInst *ptr_inst = instruction->ptr->child;
+ ZigType *ptr_type = ptr_inst->value->type;
+ if (type_is_invalid(ptr_type))
+ return ira->codegen->invalid_inst_gen;
+
+ AddressSpace addrspace;
+ if (!ir_resolve_addrspace(ira, instruction->addrspace->child, &addrspace))
+ return ira->codegen->invalid_inst_gen;
+
+ if (addrspace != AddressSpaceGeneric) {
+ ir_add_error_node(ira, instruction->addrspace->source_node, buf_sprintf(
+ "address space '%s' not available in stage 1 compiler, must be .generic",
+ address_space_name(addrspace)));
+ return ira->codegen->invalid_inst_gen;
+ }
+
+ if (!is_slice(ptr_type) && get_src_ptr_type(ptr_type) == nullptr) {
+ ir_add_error_node(ira, instruction->ptr->source_node,
+ buf_sprintf("expected pointer or slice, found '%s'", buf_ptr(&ptr_type->name)));
+ return ira->codegen->invalid_inst_gen;
+ }
+
+ return ptr_inst;
+}
+
static Stage1AirInst *ir_analyze_instruction_set_align_stack(IrAnalyze *ira, Stage1ZirInstSetAlignStack *instruction) {
uint32_t align_bytes;
Stage1AirInst *align_bytes_inst = instruction->align_bytes->child;
@@ -25451,6 +25495,8 @@ static Stage1AirInst *ir_analyze_instruction_base(IrAnalyze *ira, Stage1ZirInst
return ir_analyze_instruction_src(ira, (Stage1ZirInstSrc *)instruction);
case Stage1ZirInstIdPrefetch:
return ir_analyze_instruction_prefetch(ira, (Stage1ZirInstPrefetch *)instruction);
+ case Stage1ZirInstIdAddrSpaceCast:
+ return ir_analyze_instruction_addrspace_cast(ira, (Stage1ZirInstAddrSpaceCast *)instruction);
}
zig_unreachable();
}
@@ -25832,6 +25878,7 @@ bool ir_inst_src_has_side_effects(Stage1ZirInst *instruction) {
case Stage1ZirInstIdWasmMemorySize:
case Stage1ZirInstIdSrc:
case Stage1ZirInstIdReduce:
+ case Stage1ZirInstIdAddrSpaceCast:
return false;
case Stage1ZirInstIdAsm:
diff --git a/src/stage1/ir_print.cpp b/src/stage1/ir_print.cpp
index 9296242a3e..366e48004c 100644
--- a/src/stage1/ir_print.cpp
+++ b/src/stage1/ir_print.cpp
@@ -373,6 +373,8 @@ const char* ir_inst_src_type_str(Stage1ZirInstId id) {
return "SrcSrc";
case Stage1ZirInstIdPrefetch:
return "SrcPrefetch";
+ case Stage1ZirInstIdAddrSpaceCast:
+ return "SrcAddrSpaceCast";
}
zig_unreachable();
}
@@ -2382,6 +2384,14 @@ static void ir_print_align_cast(IrPrintSrc *irp, Stage1ZirInstAlignCast *instruc
fprintf(irp->f, ")");
}
+static void ir_print_addrspace_cast(IrPrintSrc *irp, Stage1ZirInstAddrSpaceCast *instruction) {
+ fprintf(irp->f, "@addrSpaceCast(");
+ ir_print_other_inst_src(irp, instruction->addrspace);
+ fprintf(irp->f, ",");
+ ir_print_other_inst_src(irp, instruction->ptr);
+ fprintf(irp->f, ")");
+}
+
static void ir_print_align_cast(IrPrintGen *irp, Stage1AirInstAlignCast *instruction) {
fprintf(irp->f, "@alignCast(");
ir_print_other_inst_gen(irp, instruction->target);
@@ -3127,6 +3137,9 @@ static void ir_print_inst_src(IrPrintSrc *irp, Stage1ZirInst *instruction, bool
case Stage1ZirInstIdPrefetch:
ir_print_prefetch(irp, (Stage1ZirInstPrefetch *)instruction);
break;
+ case Stage1ZirInstIdAddrSpaceCast:
+ ir_print_addrspace_cast(irp, (Stage1ZirInstAddrSpaceCast *)instruction);
+ break;
}
fprintf(irp->f, "\n");
}
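In practice the backport keeps @addrSpaceCast usable but degenerate under stage1: only
a .generic destination is accepted, and since stage1 only knows the generic address
space the cast is a pass-through. A hypothetical example (names invented for
illustration):

    fn passThrough(p: *u32) *u32 {
        // Accepted by stage1: the destination address space is .generic, so the
        // analysis above simply returns the operand unchanged.
        return @addrSpaceCast(.generic, p);
    }

    // Any other destination is rejected with:
    //   "address space 'global' not available in stage 1 compiler, must be .generic"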
From 8894d1c45eb01fa3fbcc9173bac729e5812307ed Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sun, 18 Sep 2022 13:21:49 +0200
Subject: [PATCH 11/17] stage2: f128 improvements for targets that do not
support it
---
src/codegen/llvm.zig | 98 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 97 insertions(+), 1 deletion(-)
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index aebd7a7dd0..9c8551854b 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -7576,6 +7576,8 @@ pub const FuncGen = struct {
const src_bits = operand_ty.floatBits(target);
if (!backendSupportsF80(target) and (src_bits == 80 or dest_bits == 80)) {
return softF80TruncOrExt(self, operand, src_bits, dest_bits);
+ } else if (!backendSupportsF128(target) and (src_bits == 128 or dest_bits == 128)) {
+ return softF128TruncOrExt(self, operand, src_bits, dest_bits);
}
const dest_llvm_ty = try self.dg.lowerType(dest_ty);
return self.builder.buildFPTrunc(operand, dest_llvm_ty, "");
@@ -7594,6 +7596,8 @@ pub const FuncGen = struct {
const src_bits = operand_ty.floatBits(target);
if (!backendSupportsF80(target) and (src_bits == 80 or dest_bits == 80)) {
return softF80TruncOrExt(self, operand, src_bits, dest_bits);
+ } else if (!backendSupportsF128(target) and (src_bits == 128 or dest_bits == 128)) {
+ return softF128TruncOrExt(self, operand, src_bits, dest_bits);
}
const dest_llvm_ty = try self.dg.lowerType(self.air.typeOfIndex(inst));
return self.builder.buildFPExt(operand, dest_llvm_ty, "");
@@ -9138,6 +9142,88 @@ pub const FuncGen = struct {
return self.builder.buildBitCast(result, final_cast_llvm_ty, "");
}
+ fn softF128TruncOrExt(
+ self: *FuncGen,
+ operand: *llvm.Value,
+ src_bits: u16,
+ dest_bits: u16,
+ ) !?*llvm.Value {
+ const target = self.dg.module.getTarget();
+
+ var param_llvm_ty: *llvm.Type = self.context.fp128Type();
+ var ret_llvm_ty: *llvm.Type = param_llvm_ty;
+ var fn_name: [*:0]const u8 = undefined;
+ var arg = operand;
+ var final_cast: ?*llvm.Type = null;
+
+ assert(src_bits == 128 or dest_bits == 128);
+
+ // TODO: Implement proper names and compiler-rt functions for this!!
+ if (src_bits == 128) switch (dest_bits) {
+ 16 => {
+ // See corresponding condition at definition of
+ // __truncxfhf2 in compiler-rt.
+ if (target.cpu.arch.isAARCH64()) {
+ ret_llvm_ty = self.context.halfType();
+ } else {
+ ret_llvm_ty = self.context.intType(16);
+ final_cast = self.context.halfType();
+ }
+ fn_name = "__trunctfhf2";
+ },
+ 32 => {
+ ret_llvm_ty = self.context.floatType();
+ fn_name = "__trunctfsf2";
+ },
+ 64 => {
+ ret_llvm_ty = self.context.doubleType();
+ fn_name = "__trunctfdf2";
+ },
+ 80 => {
+ ret_llvm_ty = self.context.intType(80);
+ fn_name = "__trunctfxf2";
+ },
+ 128 => return operand,
+ else => unreachable,
+ } else switch (src_bits) {
+ 16 => {
+ // See corresponding condition at definition of
+ // __extendhftf2 in compiler-rt.
+ param_llvm_ty = if (target.cpu.arch.isAARCH64())
+ self.context.halfType()
+ else
+ self.context.intType(16);
+ arg = self.builder.buildBitCast(arg, param_llvm_ty, "");
+ fn_name = "__extendhftf2";
+ },
+ 32 => {
+ param_llvm_ty = self.context.floatType();
+ fn_name = "__extendsftf2";
+ },
+ 64 => {
+ param_llvm_ty = self.context.doubleType();
+ fn_name = "__extenddftf2";
+ },
+ 80 => {
+ param_llvm_ty = self.context.intType(80);
+ fn_name = "__extendxftf2";
+ },
+ 128 => return operand,
+ else => unreachable,
+ }
+
+ const llvm_fn = self.dg.object.llvm_module.getNamedFunction(fn_name) orelse f: {
+ const param_types = [_]*llvm.Type{param_llvm_ty};
+ const fn_type = llvm.functionType(ret_llvm_ty, &param_types, param_types.len, .False);
+ break :f self.dg.object.llvm_module.addFunction(fn_name, fn_type);
+ };
+
+ var args: [1]*llvm.Value = .{arg};
+ const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, &args, args.len, .C, .Auto, "");
+ const final_cast_llvm_ty = final_cast orelse return result;
+ return self.builder.buildBitCast(result, final_cast_llvm_ty, "");
+ }
+
fn getErrorNameTable(self: *FuncGen) !*llvm.Value {
if (self.dg.object.error_name_table) |table| {
return table;
@@ -10489,13 +10575,23 @@ fn backendSupportsF16(target: std.Target) bool {
};
}
+/// This function returns true if we expect LLVM to lower f128 correctly,
+/// and false if we expect LLVM to crash if it encounters an f128 type
+/// or if it produces miscompilations.
+fn backendSupportsF128(target: std.Target) bool {
+ return switch (target.cpu.arch) {
+ .amdgcn => false,
+ else => true,
+ };
+}
+
/// LLVM does not support all relevant intrinsics for all targets, so we
/// may need to manually generate a libc call
fn intrinsicsAllowed(scalar_ty: Type, target: std.Target) bool {
return switch (scalar_ty.tag()) {
.f16 => backendSupportsF16(target),
.f80 => target.longDoubleIs(f80) and backendSupportsF80(target),
- .f128 => target.longDoubleIs(f128),
+ .f128 => target.longDoubleIs(f128) and backendSupportsF128(target),
else => true,
};
}
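For reference, these are the compiler-rt conversion routines the new lowering emits
calls to, written as Zig extern prototypes (illustration only: the f16 variants vary
per target as the code notes, and the f128 <-> f80 names are explicitly marked TODO in
the patch):

    // f128 -> narrower float types
    extern fn __trunctfsf2(a: f128) callconv(.C) f32;
    extern fn __trunctfdf2(a: f128) callconv(.C) f64;
    // narrower float types -> f128
    extern fn __extendsftf2(a: f32) callconv(.C) f128;
    extern fn __extenddftf2(a: f64) callconv(.C) f128;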
From 9006cd9d09da083e89b58e19c0091924e1e4849f Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sun, 18 Sep 2022 15:27:20 +0200
Subject: [PATCH 12/17] compiler_rt: cmpxchg-based atomic fetch/exchange for
small types
Some architectures (such as AMDGPU) do not support atomic exchange/fetch
for small types (for AMDGPU: 8- and 16-bit integers). For these types,
atomic fetch and atomic exchange need to be implemented in terms of
atomic operations on a wider type, using cmpxchg.
---
lib/compiler_rt/atomics.zig | 68 +++++++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+)
diff --git a/lib/compiler_rt/atomics.zig b/lib/compiler_rt/atomics.zig
index 6935a858aa..2a16fa51d5 100644
--- a/lib/compiler_rt/atomics.zig
+++ b/lib/compiler_rt/atomics.zig
@@ -35,6 +35,17 @@ const largest_atomic_size = switch (arch) {
else => @sizeOf(usize),
};
+// The size (in bytes) of the smallest object on which the architecture can perform
+// atomic fetch/exchange operations. Note that this does not apply to load and store.
+// Operations on objects smaller than this threshold are implemented in terms of a
+// compare-exchange on a larger value.
+const smallest_atomic_fetch_exch_size = switch (arch) {
+ // On AMDGPU, there are no 8- or 16-bit atomic fetch/exchange instructions
+ // (as of LLVM 15), so these are implemented in terms of an atomic CAS on a wider value.
+ .amdgcn => @sizeOf(u32),
+ else => @sizeOf(u8),
+};
+
const cache_line_size = 64;
const SpinlockTable = struct {
@@ -214,6 +225,31 @@ inline fn atomic_exchange_N(comptime T: type, ptr: *T, val: T, model: i32) T {
const value = ptr.*;
ptr.* = val;
return value;
+ } else if (@sizeOf(T) < smallest_atomic_fetch_exch_size) {
+ // Machine does not support this type, but it does support a larger type.
+ const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
+
+ const addr = @ptrToInt(ptr);
+ const wide_addr = addr & ~(@as(T, smallest_atomic_fetch_exch_size) - 1);
+ const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
+
+ const inner_offset = addr & (@as(T, smallest_atomic_fetch_exch_size) - 1);
+ const inner_shift = @intCast(std.math.Log2Int(T), inner_offset * 8);
+
+ // Put the interesting bits at the right position (branch has dynamic RHS).
+ const shifted_value = @as(WideAtomic, val) << inner_shift;
+ // Mask that guards the bits we care about
+ const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
+ while (true) {
+ const wide_old = @atomicLoad(WideAtomic, wide_ptr, .Acquire);
+ // Insert new bytes in old value.
+ const wide_new = wide_old & ~mask | shifted_value;
+ // CAS the new value until the result stabilizes.
+ if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
+ // Mask-and-Shift back the old bits to get the old value.
+ return @truncate(T, (wide_old & mask) >> inner_shift);
+ }
+ }
} else {
return @atomicRmw(T, ptr, .Xchg, val, .SeqCst);
}
@@ -298,6 +334,38 @@ inline fn fetch_op_N(comptime T: type, comptime op: std.builtin.AtomicRmwOp, ptr
};
return value;
+ } else if (@sizeOf(T) < smallest_atomic_fetch_exch_size) {
+ // Machine does not support this type, but it does support a larger type.
+ const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
+
+ const addr = @ptrToInt(ptr);
+ const wide_addr = addr & ~(@as(T, smallest_atomic_fetch_exch_size) - 1);
+ const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
+
+ const inner_offset = addr & (@as(T, smallest_atomic_fetch_exch_size) - 1);
+ const inner_shift = @intCast(std.math.Log2Int(T), inner_offset * 8);
+
+ const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
+
+ while (true) {
+ // Compute new wide value with updated bits.
+ const wide_old = @atomicLoad(WideAtomic, wide_ptr, .Acquire);
+ const old = @truncate(T, (wide_old & mask) >> inner_shift);
+ const new = switch (op) {
+ .Add => old +% val,
+ .Sub => old -% val,
+ .And => old & val,
+ .Nand => ~(old & val),
+ .Or => old | val,
+ .Xor => old ^ val,
+ else => @compileError("unsupported atomic op"),
+ };
+ const wide_new = wide_old & ~mask | (@as(WideAtomic, new) << inner_shift);
+ // CAS the new value until the result stabilizes.
+ if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
+ return old;
+ }
+ }
}
return @atomicRmw(T, ptr, op, val, .SeqCst);
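The word/mask arithmetic used by this emulation can be checked with concrete numbers.
The following self-contained test uses assumed values only (a 4-byte granule, as on
amdgcn, and a u8 that lives at byte offset 3 of its containing word); it does not
exercise the compiler-rt entry points themselves:

    const std = @import("std");

    test "wide-CAS address arithmetic (illustration)" {
        const granule: usize = 4;
        const addr: usize = 0x1003;

        const wide_addr = addr & ~(granule - 1); // round down to the word boundary
        const inner_shift = (addr & (granule - 1)) * 8; // bit offset within the word
        const mask = @as(u32, std.math.maxInt(u8)) << @intCast(u5, inner_shift);

        try std.testing.expectEqual(@as(usize, 0x1000), wide_addr);
        try std.testing.expectEqual(@as(usize, 24), inner_shift);
        try std.testing.expectEqual(@as(u32, 0xff00_0000), mask);
    }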
From 86f40d3ff6529baaeba1f0e7e5d7d2f0bfce00a1 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sun, 18 Sep 2022 15:29:19 +0200
Subject: [PATCH 13/17] zig_libc: do not call abort() on amdgpu
---
lib/c.zig | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/c.zig b/lib/c.zig
index 4a6ca18782..4f40a351fc 100644
--- a/lib/c.zig
+++ b/lib/c.zig
@@ -64,10 +64,10 @@ pub fn panic(msg: []const u8, error_return_trace: ?*std.builtin.StackTrace, _: ?
if (builtin.is_test) {
std.debug.panic("{s}", .{msg});
}
- if (native_os != .freestanding and native_os != .other) {
- std.os.abort();
+ switch (native_os) {
+ .freestanding, .other, .amdhsa, .amdpal => while (true) {},
+ else => std.os.abort(),
}
- while (true) {}
}
extern fn main(argc: c_int, argv: [*:null]?[*:0]u8) c_int;
From aa20295d24e77a3de586b203182d4e1cad69d475 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sun, 18 Sep 2022 16:19:34 +0200
Subject: [PATCH 14/17] compiler_rt: apply protty suggestions
---
lib/compiler_rt/atomics.zig | 109 ++++++++++++++++--------------------
1 file changed, 47 insertions(+), 62 deletions(-)
diff --git a/lib/compiler_rt/atomics.zig b/lib/compiler_rt/atomics.zig
index 2a16fa51d5..8f02600564 100644
--- a/lib/compiler_rt/atomics.zig
+++ b/lib/compiler_rt/atomics.zig
@@ -217,6 +217,31 @@ fn __atomic_store_8(dst: *u64, value: u64, model: i32) callconv(.C) void {
return atomic_store_N(u64, dst, value, model);
}
+fn wideUpdate(comptime T: type, ptr: *T, val: T, update: anytype) T {
+ const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
+
+ const addr = @ptrToInt(ptr);
+ const wide_addr = addr & ~(@as(T, smallest_atomic_fetch_exch_size) - 1);
+ const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
+
+ const inner_offset = addr & (@as(T, smallest_atomic_fetch_exch_size) - 1);
+ const inner_shift = @intCast(std.math.Log2Int(T), inner_offset * 8);
+
+ const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
+
+ var wide_old = @atomicLoad(WideAtomic, wide_ptr, .SeqCst);
+ while (true) {
+ const old = @truncate(T, (wide_old & mask) >> inner_shift);
+ const new = update(val, old);
+ const wide_new = wide_old & ~mask | (@as(WideAtomic, new) << inner_shift);
+ if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst)) |new_wide_old| {
+ wide_old = new_wide_old;
+ } else {
+ return old;
+ }
+ }
+}
+
inline fn atomic_exchange_N(comptime T: type, ptr: *T, val: T, model: i32) T {
_ = model;
if (@sizeOf(T) > largest_atomic_size) {
@@ -227,29 +252,13 @@ inline fn atomic_exchange_N(comptime T: type, ptr: *T, val: T, model: i32) T {
return value;
} else if (@sizeOf(T) < smallest_atomic_fetch_exch_size) {
// Machine does not support this type, but it does support a larger type.
- const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
-
- const addr = @ptrToInt(ptr);
- const wide_addr = addr & ~(@as(T, smallest_atomic_fetch_exch_size) - 1);
- const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
-
- const inner_offset = addr & (@as(T, smallest_atomic_fetch_exch_size) - 1);
- const inner_shift = @intCast(std.math.Log2Int(T), inner_offset * 8);
-
- // Put the interesting bits at the right position (branch has dynamic RHS).
- const shifted_value = @as(WideAtomic, val) << inner_shift;
- // Mask that guards the bits we care about
- const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
- while (true) {
- const wide_old = @atomicLoad(WideAtomic, wide_ptr, .Acquire);
- // Insert new bytes in old value.
- const wide_new = wide_old & ~mask | shifted_value;
- // CAS the new value until the result stabilizes.
- if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
- // Mask-and-Shift back the old bits to get the old value.
- return @truncate(T, (wide_old & mask) >> inner_shift);
+ const Updater = struct {
+ fn update(new: T, old: T) T {
+ _ = old;
+ return new;
}
- }
+ };
+ return wideUpdate(T, ptr, val, Updater.update);
} else {
return @atomicRmw(T, ptr, .Xchg, val, .SeqCst);
}
@@ -318,54 +327,30 @@ fn __atomic_compare_exchange_8(ptr: *u64, expected: *u64, desired: u64, success:
inline fn fetch_op_N(comptime T: type, comptime op: std.builtin.AtomicRmwOp, ptr: *T, val: T, model: i32) T {
_ = model;
+ const Updater = struct {
+ fn update(new: T, old: T) T {
+ return switch (op) {
+ .Add => old +% new,
+ .Sub => old -% new,
+ .And => old & new,
+ .Nand => ~(old & new),
+ .Or => old | new,
+ .Xor => old ^ new,
+ else => @compileError("unsupported atomic op"),
+ };
+ }
+ };
+
if (@sizeOf(T) > largest_atomic_size) {
var sl = spinlocks.get(@ptrToInt(ptr));
defer sl.release();
const value = ptr.*;
- ptr.* = switch (op) {
- .Add => value +% val,
- .Sub => value -% val,
- .And => value & val,
- .Nand => ~(value & val),
- .Or => value | val,
- .Xor => value ^ val,
- else => @compileError("unsupported atomic op"),
- };
-
+ ptr.* = Updater.update(val, value);
return value;
} else if (@sizeOf(T) < smallest_atomic_fetch_exch_size) {
// Machine does not support this type, but it does support a larger type.
- const WideAtomic = std.meta.Int(.unsigned, smallest_atomic_fetch_exch_size * 8);
-
- const addr = @ptrToInt(ptr);
- const wide_addr = addr & ~(@as(T, smallest_atomic_fetch_exch_size) - 1);
- const wide_ptr = @alignCast(smallest_atomic_fetch_exch_size, @intToPtr(*WideAtomic, wide_addr));
-
- const inner_offset = addr & (@as(T, smallest_atomic_fetch_exch_size) - 1);
- const inner_shift = @intCast(std.math.Log2Int(T), inner_offset * 8);
-
- const mask = @as(WideAtomic, std.math.maxInt(T)) << inner_shift;
-
- while (true) {
- // Compute new wide value with updated bits.
- const wide_old = @atomicLoad(WideAtomic, wide_ptr, .Acquire);
- const old = @truncate(T, (wide_old & mask) >> inner_shift);
- const new = switch (op) {
- .Add => old +% val,
- .Sub => old -% val,
- .And => old & val,
- .Nand => ~(old & val),
- .Or => old | val,
- .Xor => old ^ val,
- else => @compileError("unsupported atomic op"),
- };
- const wide_new = wide_old & ~mask | (@as(WideAtomic, new) << inner_shift);
- // CAS the new value until the result stabilizes.
- if (@cmpxchgWeak(WideAtomic, wide_ptr, wide_old, wide_new, .SeqCst, .SeqCst) == null) {
- return old;
- }
- }
+ return wideUpdate(T, ptr, val, Updater.update);
}
return @atomicRmw(T, ptr, op, val, .SeqCst);
From ad747739594805546e0d52d112dfd4a75978c8c7 Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sat, 24 Sep 2022 16:16:52 +0200
Subject: [PATCH 15/17] make addrSpaceCast work with optionals; forbid
ptrCast'ing address spaces
---
src/Sema.zig | 24 +++++++++++-------------
src/type.zig | 6 ++++++
2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/src/Sema.zig b/src/Sema.zig
index d597075df2..c81d267bcd 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -18181,11 +18181,10 @@ fn zirAddrSpaceCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.Inst
const ptr = try sema.resolveInst(extra.rhs);
const ptr_ty = sema.typeOf(ptr);
- // TODO in addition to pointers, this instruction is supposed to work for
- // pointer-like optionals and slices.
try sema.checkPtrOperand(block, ptr_src, ptr_ty);
- const src_addrspace = ptr_ty.ptrAddressSpace();
+ var ptr_info = ptr_ty.ptrInfo().data;
+ const src_addrspace = ptr_info.@"addrspace";
if (!target_util.addrSpaceCastIsValid(sema.mod.getTarget(), src_addrspace, dest_addrspace)) {
const msg = msg: {
const msg = try sema.errMsg(block, src, "invalid address space cast", .{});
@@ -18196,16 +18195,12 @@ fn zirAddrSpaceCast(sema: *Sema, block: *Block, extended: Zir.Inst.Extended.Inst
return sema.failWithOwnedErrorMsg(msg);
}
- const ptr_info = ptr_ty.ptrInfo().data;
- const dest_ty = try Type.ptr(sema.arena, sema.mod, .{
- .pointee_type = ptr_info.pointee_type,
- .@"align" = ptr_info.@"align",
- .@"addrspace" = dest_addrspace,
- .mutable = ptr_info.mutable,
- .@"allowzero" = ptr_info.@"allowzero",
- .@"volatile" = ptr_info.@"volatile",
- .size = ptr_info.size,
- });
+ ptr_info.@"addrspace" = dest_addrspace;
+ const dest_ptr_ty = try Type.ptr(sema.arena, sema.mod, ptr_info);
+ const dest_ty = if (ptr_ty.zigTypeTag() == .Optional)
+ try Type.optional(sema.arena, dest_ptr_ty)
+ else
+ dest_ptr_ty;
if (try sema.resolveMaybeUndefVal(block, ptr_src, ptr)) |val| {
// Pointer value should compatible with both address spaces.
@@ -18472,6 +18467,9 @@ fn zirPtrCast(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air
if (operand_info.@"volatile" and !dest_info.@"volatile") {
return sema.fail(block, src, "cast discards volatile qualifier", .{});
}
+ if (operand_info.@"addrspace" != dest_info.@"addrspace") {
+ return sema.fail(block, src, "cast changes pointer address space", .{});
+ }
const dest_is_slice = dest_ty.isSlice();
const operand_is_slice = operand_ty.isSlice();
diff --git a/src/type.zig b/src/type.zig
index 5ac9726727..c1c8054e26 100644
--- a/src/type.zig
+++ b/src/type.zig
@@ -2786,6 +2786,12 @@ pub const Type = extern union {
.pointer => self.castTag(.pointer).?.data.@"addrspace",
+ .optional => {
+ var buf: Payload.ElemType = undefined;
+ const child_type = self.optionalChild(&buf);
+ return child_type.ptrAddressSpace();
+ },
+
else => unreachable,
};
}
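Taken together, the two changes steer address-space conversions toward the dedicated
builtin. A hedged sketch (helper names and the .global address space are invented for
illustration):

    fn widen(p: *addrspace(.global) u32) *u32 {
        // As of this patch, @ptrCast refuses to change the address space:
        //     return @ptrCast(*u32, p); // error: cast changes pointer address space
        return @addrSpaceCast(.generic, p);
    }

    // @addrSpaceCast now also accepts pointer-like optionals.
    fn widenOptional(p: ?*addrspace(.global) u32) ?*u32 {
        return @addrSpaceCast(.generic, p);
    }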
From e90a42a80844f49e8755ab92d1c082e9ac906dee Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Sun, 25 Sep 2022 01:15:33 +0200
Subject: [PATCH 16/17] stage2: improve globals with address spaces a little
---
src/codegen/llvm.zig | 72 +++++++++++++++++++++++---------------------
1 file changed, 37 insertions(+), 35 deletions(-)
diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig
index 9c8551854b..aff7656bd3 100644
--- a/src/codegen/llvm.zig
+++ b/src/codegen/llvm.zig
@@ -2399,8 +2399,7 @@ pub const DeclGen = struct {
// mismatch, because we don't have the LLVM type until the *value* is created,
// whereas the global needs to be created based on the type alone, because
// lowering the value may reference the global as a pointer.
- const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
- const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_addrspace, target);
+ const llvm_global_addrspace = toLlvmGlobalAddressSpace(decl.@"addrspace", target);
const new_global = dg.object.llvm_module.addGlobalInAddressSpace(
llvm_init.typeOf(),
"",
@@ -2414,12 +2413,9 @@ pub const DeclGen = struct {
// replaceAllUsesWith requires the type to be unchanged. So we convert
// the new global to the old type and use that as the thing to replace
// old uses.
- const new_global_ptr = if (llvm_addrspace != llvm_global_addrspace)
- new_global.constAddrSpaceCast(llvm_init.typeOf().pointerType(llvm_addrspace))
- else
- new_global;
- const new_global_casted_ptr = new_global_ptr.constBitCast(global.typeOf());
- global.replaceAllUsesWith(new_global_casted_ptr);
+ // TODO: How should this work when the address space of a global changes?
+ const new_global_ptr = new_global.constBitCast(global.typeOf());
+ global.replaceAllUsesWith(new_global_ptr);
dg.object.decl_map.putAssumeCapacity(decl_index, new_global);
new_global.takeName(global);
global.deleteGlobal();
@@ -2617,11 +2613,12 @@ pub const DeclGen = struct {
const target = dg.module.getTarget();
const llvm_type = try dg.lowerType(decl.ty);
- const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_actual_addrspace = toLlvmGlobalAddressSpace(decl.@"addrspace", target);
+
const llvm_global = dg.object.llvm_module.addGlobalInAddressSpace(
llvm_type,
fqn,
- toLlvmGlobalAddressSpace(llvm_addrspace, target),
+ llvm_actual_addrspace,
);
gop.value_ptr.* = llvm_global;
@@ -3241,16 +3238,18 @@ pub const DeclGen = struct {
const decl_index = tv.val.castTag(.variable).?.data.owner_decl;
const decl = dg.module.declPtr(decl_index);
dg.module.markDeclAlive(decl);
+
+ const llvm_wanted_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_actual_addrspace = toLlvmGlobalAddressSpace(decl.@"addrspace", target);
+
const llvm_var_type = try dg.lowerType(tv.ty);
- const llvm_var_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
- const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_var_addrspace, target);
- const llvm_var_ptr_type = llvm_var_type.pointerType(llvm_global_addrspace);
+ const llvm_actual_ptr_type = llvm_var_type.pointerType(llvm_actual_addrspace);
const val = try dg.resolveGlobalDecl(decl_index);
- const val_ptr = val.constBitCast(llvm_var_ptr_type);
- if (llvm_global_addrspace != llvm_var_addrspace) {
- const llvm_ptr_type = llvm_var_type.pointerType(llvm_var_addrspace);
- return val_ptr.constAddrSpaceCast(llvm_ptr_type);
+ const val_ptr = val.constBitCast(llvm_actual_ptr_type);
+ if (llvm_actual_addrspace != llvm_wanted_addrspace) {
+ const llvm_wanted_ptr_type = llvm_var_type.pointerType(llvm_wanted_addrspace);
+ return val_ptr.constAddrSpaceCast(llvm_wanted_ptr_type);
}
return val_ptr;
},
@@ -4055,12 +4054,12 @@ pub const DeclGen = struct {
try self.resolveGlobalDecl(decl_index);
const target = self.module.getTarget();
- const llvm_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
- const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_addrspace, target);
- const llvm_val = if (llvm_addrspace != llvm_global_addrspace) blk: {
+ const llvm_wanted_addrspace = toLlvmAddressSpace(decl.@"addrspace", target);
+ const llvm_actual_addrspace = toLlvmGlobalAddressSpace(decl.@"addrspace", target);
+ const llvm_val = if (llvm_wanted_addrspace != llvm_actual_addrspace) blk: {
const llvm_decl_ty = try self.lowerType(decl.ty);
- const llvm_decl_ptr_ty = llvm_decl_ty.pointerType(llvm_addrspace);
- break :blk llvm_decl_val.constAddrSpaceCast(llvm_decl_ptr_ty);
+ const llvm_decl_wanted_ptr_ty = llvm_decl_ty.pointerType(llvm_wanted_addrspace);
+ break :blk llvm_decl_val.constAddrSpaceCast(llvm_decl_wanted_ptr_ty);
} else llvm_decl_val;
const llvm_type = try self.lowerType(tv.ty);
@@ -4328,9 +4327,9 @@ pub const FuncGen = struct {
// We have an LLVM value but we need to create a global constant and
// set the value as its initializer, and then return a pointer to the global.
const target = self.dg.module.getTarget();
- const llvm_addrspace = toLlvmAddressSpace(.generic, target);
- const llvm_global_addrspace = toLlvmGlobalAddressSpace(llvm_addrspace, target);
- const global = self.dg.object.llvm_module.addGlobalInAddressSpace(llvm_val.typeOf(), "", llvm_global_addrspace);
+ const llvm_wanted_addrspace = toLlvmAddressSpace(.generic, target);
+ const llvm_actual_addrspace = toLlvmGlobalAddressSpace(.generic, target);
+ const global = self.dg.object.llvm_module.addGlobalInAddressSpace(llvm_val.typeOf(), "", llvm_actual_addrspace);
global.setInitializer(llvm_val);
global.setLinkage(.Private);
global.setGlobalConstant(.True);
@@ -4340,10 +4339,13 @@ pub const FuncGen = struct {
// the type of global constants might not match the type it is supposed to
// be, and so we must bitcast the pointer at the usage sites.
const wanted_llvm_ty = try self.dg.lowerType(ty);
- const wanted_bitcasted_llvm_ptr_ty = wanted_llvm_ty.pointerType(llvm_global_addrspace);
+ const wanted_bitcasted_llvm_ptr_ty = wanted_llvm_ty.pointerType(llvm_actual_addrspace);
const bitcasted_ptr = global.constBitCast(wanted_bitcasted_llvm_ptr_ty);
- const wanted_llvm_ptr_ty = wanted_llvm_ty.pointerType(llvm_addrspace);
- const casted_ptr = bitcasted_ptr.constAddrSpaceCast(wanted_llvm_ptr_ty);
+ const wanted_llvm_ptr_ty = wanted_llvm_ty.pointerType(llvm_wanted_addrspace);
+ const casted_ptr = if (llvm_wanted_addrspace != llvm_actual_addrspace)
+ bitcasted_ptr.constAddrSpaceCast(wanted_llvm_ptr_ty)
+ else
+ bitcasted_ptr;
gop.value_ptr.* = casted_ptr;
return casted_ptr;
}
@@ -9948,13 +9950,13 @@ fn llvmDefaultGlobalAddressSpace(target: std.Target) c_uint {
};
}
-/// If `llvm_addrspace` is generic, convert it to the actual address space that globals
-/// should be stored in by default.
-fn toLlvmGlobalAddressSpace(llvm_addrspace: c_uint, target: std.Target) c_uint {
- return if (llvm_addrspace == llvm.address_space.default)
- llvmDefaultGlobalAddressSpace(target)
- else
- llvm_addrspace;
+/// Return the actual address space that a global value should be stored in, given its wanted address space.
+/// When a value is placed in the resulting address space, it needs to be cast back into wanted_address_space.
+fn toLlvmGlobalAddressSpace(wanted_address_space: std.builtin.AddressSpace, target: std.Target) c_uint {
+ return switch (wanted_address_space) {
+ .generic => llvmDefaultGlobalAddressSpace(target),
+ else => |as| toLlvmAddressSpace(as, target),
+ };
}
/// Take into account 0 bit fields and padding. Returns null if an llvm
From 9ff60e356ec5be9c3e547d0db2b55bba88c0acbd Mon Sep 17 00:00:00 2001
From: Robin Voetter
Date: Wed, 12 Oct 2022 20:38:43 +0200
Subject: [PATCH 17/17] typo
---
lib/std/target.zig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/std/target.zig b/lib/std/target.zig
index 139df629c5..c1f367b3d5 100644
--- a/lib/std/target.zig
+++ b/lib/std/target.zig
@@ -1157,7 +1157,7 @@ pub const Target = struct {
};
}
- /// Returns whether this architecture supporst the address space
+ /// Returns whether this architecture supports the address space
pub fn supportsAddressSpace(arch: Arch, address_space: std.builtin.AddressSpace) bool {
const is_nvptx = arch == .nvptx or arch == .nvptx64;
return switch (address_space) {