Merge branch 'llvm18'

Upgrades the LLVM, Clang, and LLD dependencies to LLVM 18.x Related to #16270
2026-02-09 11:03:30 +00:00 · 2024-05-09 01:52:26 -07:00 · 2024-05-09 01:52:26 -07:00 · bcb534c295
commit bcb534c295
parent d9b00ee4ba 74f52954b9
1107 changed files with 108768 additions and 112905 deletions
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@ -4,6 +4,7 @@ on:
  push:
    branches:
      - master
+      - llvm18
 concurrency:
  # Cancels pending runs when a PR gets updated.
  group: ${{ github.head_ref || github.run_id }}-${{ github.actor }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -140,9 +140,9 @@ else()
    set(ZIG_SYSTEM_LIBCXX "stdc++" CACHE STRING "system libcxx name for build.zig")
 endif()

-find_package(llvm 17)
-find_package(clang 17)
-find_package(lld 17)
+find_package(llvm 18)
+find_package(clang 18)
+find_package(lld 18)

 if(ZIG_STATIC_ZLIB)
    if (MSVC)
@ -526,42 +526,54 @@ set(ZIG_STAGE2_SOURCES
    "${CMAKE_SOURCE_DIR}/lib/std/zig/system/x86.zig"
    "${CMAKE_SOURCE_DIR}/lib/std/zig/tokenizer.zig"
    "${CMAKE_SOURCE_DIR}/src/Air.zig"
+    "${CMAKE_SOURCE_DIR}/src/Builtin.zig"
    "${CMAKE_SOURCE_DIR}/src/Compilation.zig"
    "${CMAKE_SOURCE_DIR}/src/Compilation/Config.zig"
+    "${CMAKE_SOURCE_DIR}/src/DarwinPosixSpawn.zig"
+    "${CMAKE_SOURCE_DIR}/src/InternPool.zig"
    "${CMAKE_SOURCE_DIR}/src/Liveness.zig"
+    "${CMAKE_SOURCE_DIR}/src/Liveness/Verify.zig"
    "${CMAKE_SOURCE_DIR}/src/Module.zig"
    "${CMAKE_SOURCE_DIR}/src/Package.zig"
    "${CMAKE_SOURCE_DIR}/src/Package/Fetch.zig"
+    "${CMAKE_SOURCE_DIR}/src/Package/Fetch/git.zig"
+    "${CMAKE_SOURCE_DIR}/src/Package/Manifest.zig"
+    "${CMAKE_SOURCE_DIR}/src/Package/Module.zig"
    "${CMAKE_SOURCE_DIR}/src/RangeSet.zig"
    "${CMAKE_SOURCE_DIR}/src/Sema.zig"
+    "${CMAKE_SOURCE_DIR}/src/Sema/bitcast.zig"
+    "${CMAKE_SOURCE_DIR}/src/Sema/comptime_ptr_access.zig"
    "${CMAKE_SOURCE_DIR}/src/Value.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/aarch64/CodeGen.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/aarch64/Emit.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/aarch64/Mir.zig"
-    "${CMAKE_SOURCE_DIR}/src/arch/aarch64/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/aarch64/abi.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/aarch64/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/arm/CodeGen.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/arm/Emit.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/arm/Mir.zig"
-    "${CMAKE_SOURCE_DIR}/src/arch/arm/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/arm/abi.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/arm/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/riscv64/CodeGen.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/riscv64/Emit.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/riscv64/Mir.zig"
-    "${CMAKE_SOURCE_DIR}/src/arch/riscv64/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/riscv64/abi.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/riscv64/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/sparc64/CodeGen.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/sparc64/Emit.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/sparc64/Mir.zig"
-    "${CMAKE_SOURCE_DIR}/src/arch/sparc64/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/sparc64/abi.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/sparc64/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/wasm/CodeGen.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/wasm/Emit.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/wasm/Mir.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/wasm/abi.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/x86/bits.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/CodeGen.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/Disassembler.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/Emit.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/Encoding.zig"
+    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/Lower.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/Mir.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/abi.zig"
    "${CMAKE_SOURCE_DIR}/src/arch/x86_64/bits.zig"
@ -574,7 +586,17 @@ set(ZIG_STAGE2_SOURCES
    "${CMAKE_SOURCE_DIR}/src/codegen/c.zig"
    "${CMAKE_SOURCE_DIR}/src/codegen/c/Type.zig"
    "${CMAKE_SOURCE_DIR}/src/codegen/llvm.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/llvm/BitcodeReader.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/llvm/Builder.zig"
    "${CMAKE_SOURCE_DIR}/src/codegen/llvm/bindings.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/llvm/bitcode_writer.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/llvm/ir.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/spirv.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/spirv/Assembler.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/spirv/Module.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/spirv/Section.zig"
+    "${CMAKE_SOURCE_DIR}/src/codegen/spirv/spec.zig"
+    "${CMAKE_SOURCE_DIR}/src/crash_report.zig"
    "${CMAKE_SOURCE_DIR}/src/glibc.zig"
    "${CMAKE_SOURCE_DIR}/src/introspect.zig"
    "${CMAKE_SOURCE_DIR}/src/libcxx.zig"
@ -586,7 +608,9 @@ set(ZIG_STAGE2_SOURCES
    "${CMAKE_SOURCE_DIR}/src/link/Coff/Atom.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Coff/ImportTable.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Coff/Object.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Coff/Relocation.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Coff/lld.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Dwarf.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/Archive.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/Atom.zig"
@ -599,6 +623,7 @@ set(ZIG_STAGE2_SOURCES
    "${CMAKE_SOURCE_DIR}/src/link/Elf/eh_frame.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/file.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/gc.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Elf/merge_section.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/relocatable.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/relocation.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Elf/synthetic_sections.zig"
@ -617,9 +642,9 @@ set(ZIG_STAGE2_SOURCES
    "${CMAKE_SOURCE_DIR}/src/link/MachO/UnwindInfo.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/ZigObject.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/dead_strip.zig"
-    "${CMAKE_SOURCE_DIR}/src/link/MachO/dyld_info/bind.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/dyld_info/Rebase.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/dyld_info/Trie.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/MachO/dyld_info/bind.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/eh_frame.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/fat.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/file.zig"
@ -629,15 +654,32 @@ set(ZIG_STAGE2_SOURCES
    "${CMAKE_SOURCE_DIR}/src/link/MachO/synthetic.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/thunks.zig"
    "${CMAKE_SOURCE_DIR}/src/link/MachO/uuid.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/NvPtx.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Plan9.zig"
    "${CMAKE_SOURCE_DIR}/src/link/Plan9/aout.zig"
-    "${CMAKE_SOURCE_DIR}/src/link/Wasm.zig"
-    "${CMAKE_SOURCE_DIR}/src/link/msdos-stub.bin"
+    "${CMAKE_SOURCE_DIR}/src/link/SpirV.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/SpirV/BinaryModule.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/SpirV/deduplicate.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/SpirV/lower_invocation_globals.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/SpirV/prune_unused.zig"
    "${CMAKE_SOURCE_DIR}/src/link/StringTable.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/Archive.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/Atom.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/Object.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/Symbol.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/ZigObject.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/file.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/Wasm/types.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/aarch64.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/riscv.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/table_section.zig"
    "${CMAKE_SOURCE_DIR}/src/link/tapi.zig"
    "${CMAKE_SOURCE_DIR}/src/link/tapi/Tokenizer.zig"
    "${CMAKE_SOURCE_DIR}/src/link/tapi/parse.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/tapi/parse/test.zig"
    "${CMAKE_SOURCE_DIR}/src/link/tapi/yaml.zig"
+    "${CMAKE_SOURCE_DIR}/src/link/tapi/yaml/test.zig"
    "${CMAKE_SOURCE_DIR}/src/main.zig"
    "${CMAKE_SOURCE_DIR}/src/mingw.zig"
    "${CMAKE_SOURCE_DIR}/src/musl.zig"
@ -685,7 +727,7 @@ if(MSVC)
    set(EXE_LDFLAGS "${EXE_LDFLAGS} /debug:fastlink")
  endif()
 else()
-  set(EXE_CXX_FLAGS "-std=c++17 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -D_GNU_SOURCE -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -Werror=type-limits -Wno-missing-braces -Wno-comment")
+  set(EXE_CXX_FLAGS "-std=c++17 -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -D_GNU_SOURCE -fvisibility-inlines-hidden -fno-exceptions -fno-rtti -Wno-type-limits -Wno-missing-braces -Wno-comment")
  set(EXE_LDFLAGS " ")
  if(MINGW)
    set(EXE_CXX_FLAGS "${EXE_CXX_FLAGS} -Wno-format")
@ -866,9 +908,9 @@ target_include_directories(zig2 PUBLIC "${CMAKE_SOURCE_DIR}/stage1")
 target_link_libraries(zig2 LINK_PUBLIC zigcpp)

 if(MSVC)
-  target_link_libraries(zig2 LINK_PUBLIC ntdll.lib)
+  target_link_libraries(zig2 LINK_PUBLIC ntdll.lib ws2_32.lib)
 elseif(MINGW)
-  target_link_libraries(zig2 LINK_PUBLIC ntdll)
+  target_link_libraries(zig2 LINK_PUBLIC ntdll ws2_32)
 endif()

 if(NOT MSVC)
--- a/build.zig
+++ b/build.zig
@ -334,6 +334,9 @@ pub fn build(b: *std.Build) !void {
        }
        if (target.result.os.tag == .windows) {
            inline for (.{ exe, check_case_exe }) |artifact| {
+                // LLVM depends on networking as of version 18.
+                artifact.linkSystemLibrary("ws2_32");
+
                artifact.linkSystemLibrary("version");
                artifact.linkSystemLibrary("uuid");
                artifact.linkSystemLibrary("ole32");
@ -650,7 +653,7 @@ const exe_cflags = [_][]const u8{
    "-fvisibility-inlines-hidden",
    "-fno-exceptions",
    "-fno-rtti",
-    "-Werror=type-limits",
+    "-Wno-type-limits",
    "-Wno-missing-braces",
    "-Wno-comment",
 };
@ -1039,6 +1042,7 @@ const clang_libs = [_][]const u8{
    "clangAST",
    "clangParse",
    "clangSema",
+    "clangAPINotes",
    "clangBasic",
    "clangEdit",
    "clangLex",
@ -1068,6 +1072,7 @@ const llvm_libs = [_][]const u8{
    "LLVMXRay",
    "LLVMLibDriver",
    "LLVMDlltoolDriver",
+    "LLVMTextAPIBinaryReader",
    "LLVMCoverage",
    "LLVMLineEditor",
    "LLVMXCoreDisassembler",
@ -1169,6 +1174,7 @@ const llvm_libs = [_][]const u8{
    "LLVMAArch64Desc",
    "LLVMAArch64Utils",
    "LLVMAArch64Info",
+    "LLVMOrcDebugging",
    "LLVMOrcJIT",
    "LLVMWindowsDriver",
    "LLVMMCJIT",
@ -1188,6 +1194,7 @@ const llvm_libs = [_][]const u8{
    "LLVMMCDisassembler",
    "LLVMLTO",
    "LLVMPasses",
+    "LLVMHipStdPar",
    "LLVMCFGuard",
    "LLVMCoroutines",
    "LLVMipo",
@ -1195,10 +1202,13 @@ const llvm_libs = [_][]const u8{
    "LLVMLinker",
    "LLVMInstrumentation",
    "LLVMFrontendOpenMP",
+    "LLVMFrontendOffloading",
    "LLVMFrontendOpenACC",
    "LLVMFrontendHLSL",
+    "LLVMFrontendDriver",
    "LLVMExtensions",
    "LLVMDWARFLinkerParallel",
+    "LLVMDWARFLinkerClassic",
    "LLVMDWARFLinker",
    "LLVMGlobalISel",
    "LLVMMIRParser",
--- a/ci/aarch64-linux-debug.sh
+++ b/ci/aarch64-linux-debug.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.203+d3bc1cfc4"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/aarch64-linux-release.sh
+++ b/ci/aarch64-linux-release.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.203+d3bc1cfc4"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/aarch64-macos-debug.sh
+++ b/ci/aarch64-macos-debug.sh
@ -9,10 +9,16 @@ set -e
 ZIGDIR="$PWD"
 TARGET="$ARCH-macos-none"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.467+0345d7866"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

+if [ ! -d "$PREFIX" ]; then
+  cd $HOME
+  curl -L -O "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
+  tar xf "$CACHE_BASENAME.tar.xz"
+fi
+
 cd $ZIGDIR

 # Make the `zig version` number consistent.
--- a/ci/aarch64-macos-release.sh
+++ b/ci/aarch64-macos-release.sh
@ -9,10 +9,16 @@ set -e
 ZIGDIR="$PWD"
 TARGET="$ARCH-macos-none"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.467+0345d7866"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

+if [ ! -d "$PREFIX" ]; then
+  cd $HOME
+  curl -L -O "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
+  tar xf "$CACHE_BASENAME.tar.xz"
+fi
+
 cd $ZIGDIR

 # Make the `zig version` number consistent.
--- a/ci/aarch64-windows.ps1
+++ b/ci/aarch64-windows.ps1
@ -1,5 +1,5 @@
 $TARGET = "$($Env:ARCH)-windows-gnu"
-$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.12.0-dev.2087+e9a18010b"
+$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 $MCPU = "baseline"
 $ZIG_LLVM_CLANG_LLD_URL = "https://ziglang.org/deps/$ZIG_LLVM_CLANG_LLD_NAME.zip"
 $PREFIX_PATH = "$(Get-Location)\..\$ZIG_LLVM_CLANG_LLD_NAME"
--- a/ci/x86_64-linux-debug.sh
+++ b/ci/x86_64-linux-debug.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.203+d3bc1cfc4"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/x86_64-linux-release.sh
+++ b/ci/x86_64-linux-release.sh
@ -8,7 +8,7 @@ set -e
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.203+d3bc1cfc4"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/deps/$CACHE_BASENAME"
 ZIG="$PREFIX/bin/zig"

--- a/ci/x86_64-macos-release.sh
+++ b/ci/x86_64-macos-release.sh
@ -6,18 +6,17 @@ set -e
 ZIGDIR="$PWD"
 TARGET="$ARCH-macos-none"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.12.0-dev.467+0345d7866"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 PREFIX="$HOME/$CACHE_BASENAME"
 JOBS="-j3"
-
-rm -rf $PREFIX
-cd $HOME
-
-curl -L -O "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
-tar xf "$CACHE_BASENAME.tar.xz"
-
 ZIG="$PREFIX/bin/zig"

+if [ ! -d "$PREFIX" ]; then
+  cd $HOME
+  curl -L -O "https://ziglang.org/deps/$CACHE_BASENAME.tar.xz"
+  tar xf "$CACHE_BASENAME.tar.xz"
+fi
+
 cd $ZIGDIR

 # Make the `zig version` number consistent.
--- a/ci/x86_64-windows-debug.ps1
+++ b/ci/x86_64-windows-debug.ps1
@ -1,10 +1,20 @@
 $TARGET = "$($Env:ARCH)-windows-gnu"
-$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.12.0-dev.2073+402fe565a"
+$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 $MCPU = "baseline"
+$ZIG_LLVM_CLANG_LLD_URL = "https://ziglang.org/deps/$ZIG_LLVM_CLANG_LLD_NAME.zip"
 $PREFIX_PATH = "$($Env:USERPROFILE)\$ZIG_LLVM_CLANG_LLD_NAME"
 $ZIG = "$PREFIX_PATH\bin\zig.exe"
 $ZIG_LIB_DIR = "$(Get-Location)\lib"

+if (!(Test-Path "$PREFIX_PATH.zip")) {
+    Write-Output "Downloading $ZIG_LLVM_CLANG_LLD_URL"
+    Invoke-WebRequest -Uri "$ZIG_LLVM_CLANG_LLD_URL" -OutFile "$PREFIX_PATH.zip"
+
+    Write-Output "Extracting..."
+    Add-Type -AssemblyName System.IO.Compression.FileSystem ;
+    [System.IO.Compression.ZipFile]::ExtractToDirectory("$PREFIX_PATH.zip", "$PREFIX_PATH\..")
+}
+
 function CheckLastExitCode {
    if (!$?) {
        exit 1
@ -25,6 +35,12 @@ Remove-Item -Path 'build-debug' -Recurse -Force -ErrorAction Ignore
 New-Item -Path 'build-debug' -ItemType Directory
 Set-Location -Path 'build-debug'

+# Override the cache directories because they won't actually help other CI runs
+# which will be testing alternate versions of zig, and ultimately would just
+# fill up space on the hard drive for no reason.
+$Env:ZIG_GLOBAL_CACHE_DIR="$(Get-Location)\zig-global-cache"
+$Env:ZIG_LOCAL_CACHE_DIR="$(Get-Location)\zig-local-cache"
+
 # CMake gives a syntax error when file paths with backward slashes are used.
 # Here, we use forward slashes only to work around this.
 & cmake .. `
--- a/ci/x86_64-windows-release.ps1
+++ b/ci/x86_64-windows-release.ps1
@ -1,10 +1,20 @@
 $TARGET = "$($Env:ARCH)-windows-gnu"
-$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.12.0-dev.2073+402fe565a"
+$ZIG_LLVM_CLANG_LLD_NAME = "zig+llvm+lld+clang-$TARGET-0.13.0-dev.130+98a30acad"
 $MCPU = "baseline"
+$ZIG_LLVM_CLANG_LLD_URL = "https://ziglang.org/deps/$ZIG_LLVM_CLANG_LLD_NAME.zip"
 $PREFIX_PATH = "$($Env:USERPROFILE)\$ZIG_LLVM_CLANG_LLD_NAME"
 $ZIG = "$PREFIX_PATH\bin\zig.exe"
 $ZIG_LIB_DIR = "$(Get-Location)\lib"

+if (!(Test-Path "$PREFIX_PATH.zip")) {
+    Write-Output "Downloading $ZIG_LLVM_CLANG_LLD_URL"
+    Invoke-WebRequest -Uri "$ZIG_LLVM_CLANG_LLD_URL" -OutFile "$PREFIX_PATH.zip"
+
+    Write-Output "Extracting..."
+    Add-Type -AssemblyName System.IO.Compression.FileSystem ;
+    [System.IO.Compression.ZipFile]::ExtractToDirectory("$PREFIX_PATH.zip", "$PREFIX_PATH\..")
+}
+
 function CheckLastExitCode {
    if (!$?) {
        exit 1
@ -25,6 +35,12 @@ Remove-Item -Path 'build-release' -Recurse -Force -ErrorAction Ignore
 New-Item -Path 'build-release' -ItemType Directory
 Set-Location -Path 'build-release'

+# Override the cache directories because they won't actually help other CI runs
+# which will be testing alternate versions of zig, and ultimately would just
+# fill up space on the hard drive for no reason.
+$Env:ZIG_GLOBAL_CACHE_DIR="$(Get-Location)\zig-global-cache"
+$Env:ZIG_LOCAL_CACHE_DIR="$(Get-Location)\zig-local-cache"
+
 # CMake gives a syntax error when file paths with backward slashes are used.
 # Here, we use forward slashes only to work around this.
 & cmake .. `
--- a/cmake/Findclang.cmake
+++ b/cmake/Findclang.cmake
@ -17,9 +17,9 @@ find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h
 if(${LLVM_LINK_MODE} STREQUAL "shared")
  find_library(CLANG_LIBRARIES
    NAMES
-      libclang-cpp.so.17
-      clang-cpp-17.0
-      clang-cpp170
+      libclang-cpp.so.18
+      clang-cpp-18.0
+      clang-cpp180
      clang-cpp
    NAMES_PER_DIR
    HINTS "${LLVM_LIBDIRS}"
@ -55,6 +55,7 @@ else()
  FIND_AND_ADD_CLANG_LIB(clangAST)
  FIND_AND_ADD_CLANG_LIB(clangParse)
  FIND_AND_ADD_CLANG_LIB(clangSema)
+  FIND_AND_ADD_CLANG_LIB(clangAPINotes)
  FIND_AND_ADD_CLANG_LIB(clangBasic)
  FIND_AND_ADD_CLANG_LIB(clangEdit)
  FIND_AND_ADD_CLANG_LIB(clangLex)
--- a/cmake/Findlld.cmake
+++ b/cmake/Findlld.cmake
@ -9,21 +9,21 @@
 find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h
    HINTS ${LLVM_INCLUDE_DIRS}
    PATHS
-        /usr/lib/llvm-17/include
-        /usr/local/llvm170/include
-        /usr/local/llvm17/include
-        /usr/local/opt/llvm@17/include
-        /opt/homebrew/opt/llvm@17/include
+        /usr/lib/llvm-18/include
+        /usr/local/llvm180/include
+        /usr/local/llvm18/include
+        /usr/local/opt/llvm@18/include
+        /opt/homebrew/opt/llvm@18/include
        /mingw64/include)

-find_library(LLD_LIBRARY NAMES lld-17.0 lld170 lld NAMES_PER_DIR
+find_library(LLD_LIBRARY NAMES lld-18.0 lld180 lld NAMES_PER_DIR
    HINTS ${LLVM_LIBDIRS}
    PATHS
-        /usr/lib/llvm-17/lib
-        /usr/local/llvm170/lib
-        /usr/local/llvm17/lib
-        /usr/local/opt/llvm@17/lib
-        /opt/homebrew/opt/llvm@17/lib
+        /usr/lib/llvm-18/lib
+        /usr/local/llvm180/lib
+        /usr/local/llvm18/lib
+        /usr/local/opt/llvm@18/lib
+        /opt/homebrew/opt/llvm@18/lib
 )
 if(EXISTS ${LLD_LIBRARY})
    set(LLD_LIBRARIES ${LLD_LIBRARY})
@ -34,11 +34,11 @@ else()
            HINTS ${LLVM_LIBDIRS}
            PATHS
                ${LLD_LIBDIRS}
-                /usr/lib/llvm-17/lib
-                /usr/local/llvm170/lib
-                /usr/local/llvm17/lib
-                /usr/local/opt/llvm@17/lib
-                /opt/homebrew/opt/llvm@17/lib
+                /usr/lib/llvm-18/lib
+                /usr/local/llvm180/lib
+                /usr/local/llvm18/lib
+                /usr/local/opt/llvm@18/lib
+                /opt/homebrew/opt/llvm@18/lib
                /mingw64/lib
                /c/msys64/mingw64/lib
                c:/msys64/mingw64/lib)
--- a/cmake/Findllvm.cmake
+++ b/cmake/Findllvm.cmake
@ -14,12 +14,12 @@ if(ZIG_USE_LLVM_CONFIG)
  while(1)
    unset(LLVM_CONFIG_EXE CACHE)
    find_program(LLVM_CONFIG_EXE
-        NAMES llvm-config-17 llvm-config-17.0 llvm-config170 llvm-config17 llvm-config NAMES_PER_DIR
+        NAMES llvm-config-18 llvm-config-18.0 llvm-config180 llvm-config18 llvm-config NAMES_PER_DIR
        PATHS
            "/mingw64/bin"
            "/c/msys64/mingw64/bin"
            "c:/msys64/mingw64/bin"
-            "C:/Libraries/llvm-17.0.0/bin")
+            "C:/Libraries/llvm-18.0.0/bin")

    if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
      if (NOT LLVM_CONFIG_ERROR_MESSAGES STREQUAL "")
@ -37,9 +37,9 @@ if(ZIG_USE_LLVM_CONFIG)
      OUTPUT_STRIP_TRAILING_WHITESPACE)

    get_filename_component(LLVM_CONFIG_DIR "${LLVM_CONFIG_EXE}" DIRECTORY)
-    if("${LLVM_CONFIG_VERSION}" VERSION_LESS 17 OR "${LLVM_CONFIG_VERSION}" VERSION_EQUAL 18 OR "${LLVM_CONFIG_VERSION}" VERSION_GREATER 18)
+    if("${LLVM_CONFIG_VERSION}" VERSION_LESS 18 OR "${LLVM_CONFIG_VERSION}" VERSION_EQUAL 19 OR "${LLVM_CONFIG_VERSION}" VERSION_GREATER 19)
      # Save the error message, in case this is the last llvm-config we find
-      list(APPEND LLVM_CONFIG_ERROR_MESSAGES "expected LLVM 17.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+      list(APPEND LLVM_CONFIG_ERROR_MESSAGES "expected LLVM 18.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")

      # Ignore this directory and try the search again
      list(APPEND CMAKE_IGNORE_PATH "${LLVM_CONFIG_DIR}")
@ -63,9 +63,9 @@ if(ZIG_USE_LLVM_CONFIG)
      if (LLVM_CONFIG_ERROR) 
        # Save the error message, in case this is the last llvm-config we find
        if (ZIG_SHARED_LLVM)
-          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 17.x found at ${LLVM_CONFIG_EXE} does not support linking as a shared library")
+          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 18.x found at ${LLVM_CONFIG_EXE} does not support linking as a shared library")
        else()
-          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 17.x found at ${LLVM_CONFIG_EXE} does not support linking as a static library")
+          list(APPEND LLVM_CONFIG_ERROR_MESSAGES "LLVM 18.x found at ${LLVM_CONFIG_EXE} does not support linking as a static library")
        endif()

        # Ignore this directory and try the search again
@ -195,6 +195,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMXRay)
  FIND_AND_ADD_LLVM_LIB(LLVMLibDriver)
  FIND_AND_ADD_LLVM_LIB(LLVMDlltoolDriver)
+  FIND_AND_ADD_LLVM_LIB(LLVMTextAPIBinaryReader)
  FIND_AND_ADD_LLVM_LIB(LLVMCoverage)
  FIND_AND_ADD_LLVM_LIB(LLVMLineEditor)
  FIND_AND_ADD_LLVM_LIB(LLVMXCoreDisassembler)
@ -296,6 +297,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Desc)
  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Utils)
  FIND_AND_ADD_LLVM_LIB(LLVMAArch64Info)
+  FIND_AND_ADD_LLVM_LIB(LLVMOrcDebugging)
  FIND_AND_ADD_LLVM_LIB(LLVMOrcJIT)
  FIND_AND_ADD_LLVM_LIB(LLVMWindowsDriver)
  FIND_AND_ADD_LLVM_LIB(LLVMMCJIT)
@ -315,6 +317,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMMCDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMLTO)
  FIND_AND_ADD_LLVM_LIB(LLVMPasses)
+  FIND_AND_ADD_LLVM_LIB(LLVMHipStdPar)
  FIND_AND_ADD_LLVM_LIB(LLVMCFGuard)
  FIND_AND_ADD_LLVM_LIB(LLVMCoroutines)
  FIND_AND_ADD_LLVM_LIB(LLVMipo)
@ -322,10 +325,13 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMLinker)
  FIND_AND_ADD_LLVM_LIB(LLVMInstrumentation)
  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenMP)
+  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOffloading)
  FIND_AND_ADD_LLVM_LIB(LLVMFrontendOpenACC)
  FIND_AND_ADD_LLVM_LIB(LLVMFrontendHLSL)
+  FIND_AND_ADD_LLVM_LIB(LLVMFrontendDriver)
  FIND_AND_ADD_LLVM_LIB(LLVMExtensions)
  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinkerParallel)
+  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinkerClassic)
  FIND_AND_ADD_LLVM_LIB(LLVMDWARFLinker)
  FIND_AND_ADD_LLVM_LIB(LLVMGlobalISel)
  FIND_AND_ADD_LLVM_LIB(LLVMMIRParser)
--- a/lib/compiler/aro/aro/Type.zig
+++ b/lib/compiler/aro/aro/Type.zig
@ -1116,7 +1116,7 @@ pub fn alignof(ty: Type, comp: *const Compilation) u29 {

        .bit_int => @min(
            std.math.ceilPowerOfTwoPromote(u16, (ty.data.int.bits + 7) / 8),
-            comp.target.maxIntAlignment(),
+            16, // comp.target.maxIntAlignment(), please use your own logic for this value as it is implementation-defined
        ),

        .float => comp.target.c_type_alignment(.float),
--- a/lib/include/__clang_cuda_device_functions.h
+++ b/lib/include/__clang_cuda_device_functions.h
@ -502,8 +502,8 @@ __DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
 __DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
 __DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
 __DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }
-__DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
-__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
+__DEVICE__ int __popc(unsigned int __a) { return __nv_popc(__a); }
+__DEVICE__ int __popcll(unsigned long long __a) { return __nv_popcll(__a); }
 __DEVICE__ float __powf(float __a, float __b) {
  return __nv_fast_powf(__a, __b);
 }
--- a/lib/include/__clang_cuda_libdevice_declares.h
+++ b/lib/include/__clang_cuda_libdevice_declares.h
@ -285,8 +285,8 @@ __DEVICE__ double __nv_normcdfinv(double __a);
 __DEVICE__ float __nv_normcdfinvf(float __a);
 __DEVICE__ float __nv_normf(int __a, const float *__b);
 __DEVICE__ double __nv_norm(int __a, const double *__b);
-__DEVICE__ int __nv_popc(int __a);
-__DEVICE__ int __nv_popcll(long long __a);
+__DEVICE__ int __nv_popc(unsigned int __a);
+__DEVICE__ int __nv_popcll(unsigned long long __a);
 __DEVICE__ double __nv_pow(double __a, double __b);
 __DEVICE__ float __nv_powf(float __a, float __b);
 __DEVICE__ double __nv_powi(double __a, int __b);
--- a/lib/include/__clang_cuda_math.h
+++ b/lib/include/__clang_cuda_math.h
@ -36,7 +36,7 @@
 // because the OpenMP overlay requires constexpr functions here but prior to
 // c++14 void return functions could not be constexpr.
 #pragma push_macro("__DEVICE_VOID__")
-#ifdef __OPENMP_NVPTX__ && defined(__cplusplus) && __cplusplus < 201402L
+#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
 #define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE_VOID__ __DEVICE__
@ -45,9 +45,9 @@
 // libdevice provides fast low precision and slow full-recision implementations
 // for some functions. Which one gets selected depends on
 // __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
-// -ffast-math or -fcuda-approx-transcendentals are in effect.
+// -ffast-math or -fgpu-approx-transcendentals are in effect.
 #pragma push_macro("__FAST_OR_SLOW")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __FAST_OR_SLOW(fast, slow) fast
 #else
 #define __FAST_OR_SLOW(fast, slow) slow
--- a/lib/include/__clang_cuda_runtime_wrapper.h
+++ b/lib/include/__clang_cuda_runtime_wrapper.h
@ -196,12 +196,12 @@ inline __host__ double __signbitd(double x) {

 // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
-// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+// sin and exp.  This is controlled in clang by -fgpu-approx-transcendentals.
 //
 // device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
 // slow divides), so we need to scope our define carefully here.
 #pragma push_macro("__USE_FAST_MATH__")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __USE_FAST_MATH__ 1
 #endif

--- a/lib/include/__clang_hip_math.h
+++ b/lib/include/__clang_hip_math.h
@ -14,9 +14,6 @@
 #endif

 #if !defined(__HIPCC_RTC__)
-#if defined(__cplusplus)
-#include <algorithm>
-#endif
 #include <limits.h>
 #include <stdint.h>
 #ifdef __OPENMP_AMDGCN__
@ -32,6 +29,17 @@
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
 #endif

+// Device library provides fast low precision and slow full-recision
+// implementations for some functions. Which one gets selected depends on
+// __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if
+// -ffast-math or -fgpu-approx-transcendentals are in effect.
+#pragma push_macro("__FAST_OR_SLOW")
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
+#define __FAST_OR_SLOW(fast, slow) fast
+#else
+#define __FAST_OR_SLOW(fast, slow) slow
+#endif
+
 // A few functions return bool type starting only in C++11.
 #pragma push_macro("__RETURN_TYPE")
 #ifdef __OPENMP_AMDGCN__
@ -139,21 +147,180 @@ uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
 }

 // BEGIN FLOAT
+
+// BEGIN INTRINSICS
+
+__DEVICE__
+float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
+
+__DEVICE__
+float __exp10f(float __x) {
+  const float __log2_10 = 0x1.a934f0p+1f;
+  return __builtin_amdgcn_exp2f(__log2_10 * __x);
+}
+
+__DEVICE__
+float __expf(float __x) {
+  const float __log2_e = 0x1.715476p+0;
+  return __builtin_amdgcn_exp2f(__log2_e * __x);
+}
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }
+__DEVICE__
+float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }
+__DEVICE__
+float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }
+__DEVICE__
+float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fadd_rn(float __x, float __y) { return __x + __y; }
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }
+__DEVICE__
+float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }
+__DEVICE__
+float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }
+__DEVICE__
+float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fdiv_rn(float __x, float __y) { return __x / __y; }
+#endif
+
+__DEVICE__
+float __fdividef(float __x, float __y) { return __x / __y; }
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fmaf_rd(float __x, float __y, float __z) {
+  return __ocml_fma_rtn_f32(__x, __y, __z);
+}
+__DEVICE__
+float __fmaf_rn(float __x, float __y, float __z) {
+  return __ocml_fma_rte_f32(__x, __y, __z);
+}
+__DEVICE__
+float __fmaf_ru(float __x, float __y, float __z) {
+  return __ocml_fma_rtp_f32(__x, __y, __z);
+}
+__DEVICE__
+float __fmaf_rz(float __x, float __y, float __z) {
+  return __ocml_fma_rtz_f32(__x, __y, __z);
+}
+#else
+__DEVICE__
+float __fmaf_rn(float __x, float __y, float __z) {
+  return __builtin_fmaf(__x, __y, __z);
+}
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }
+__DEVICE__
+float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }
+__DEVICE__
+float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }
+__DEVICE__
+float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fmul_rn(float __x, float __y) { return __x * __y; }
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }
+__DEVICE__
+float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }
+__DEVICE__
+float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }
+__DEVICE__
+float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }
+#else
+__DEVICE__
+float __frcp_rn(float __x) { return 1.0f / __x; }
+#endif
+
+__DEVICE__
+float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); }
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
+__DEVICE__
+float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }
+__DEVICE__
+float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
+__DEVICE__
+float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
+#else
+__DEVICE__
+float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }
+__DEVICE__
+float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }
+__DEVICE__
+float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }
+__DEVICE__
+float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fsub_rn(float __x, float __y) { return __x - __y; }
+#endif
+
+__DEVICE__
+float __log10f(float __x) { return __builtin_log10f(__x); }
+
+__DEVICE__
+float __log2f(float __x) { return __builtin_amdgcn_logf(__x); }
+
+__DEVICE__
+float __logf(float __x) { return __builtin_logf(__x); }
+
+__DEVICE__
+float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
+
+__DEVICE__
+float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
+
+__DEVICE__
+void __sincosf(float __x, float *__sinptr, float *__cosptr) {
+  *__sinptr = __ocml_native_sin_f32(__x);
+  *__cosptr = __ocml_native_cos_f32(__x);
+}
+
+__DEVICE__
+float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
+
+__DEVICE__
+float __tanf(float __x) {
+  return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x));
+}
+// END INTRINSICS
+
 #if defined(__cplusplus)
 __DEVICE__
 int abs(int __x) {
-  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
+  return __builtin_abs(__x);
 }
 __DEVICE__
 long labs(long __x) {
-  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
+  return __builtin_labs(__x);
 }
 __DEVICE__
 long long llabs(long long __x) {
-  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
+  return __builtin_llabs(__x);
 }
 #endif

@ -188,7 +355,7 @@ __DEVICE__
 float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); }

 __DEVICE__
-float cosf(float __x) { return __ocml_cos_f32(__x); }
+float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); }

 __DEVICE__
 float coshf(float __x) { return __ocml_cosh_f32(__x); }
@ -321,13 +488,13 @@ __DEVICE__
 float log1pf(float __x) { return __ocml_log1p_f32(__x); }

 __DEVICE__
-float log2f(float __x) { return __builtin_log2f(__x); }
+float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); }

 __DEVICE__
 float logbf(float __x) { return __ocml_logb_f32(__x); }

 __DEVICE__
-float logf(float __x) { return __builtin_logf(__x); }
+float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); }

 __DEVICE__
 long int lrintf(float __x) { return __builtin_rintf(__x); }
@ -401,7 +568,7 @@ float normf(int __dim,
    ++__a;
  }

-  return __ocml_sqrt_f32(__r);
+  return __builtin_sqrtf(__r);
 }

 __DEVICE__
@ -483,9 +650,13 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) {
 #ifdef __OPENMP_AMDGCN__
 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
 #endif
+#ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+  __sincosf(__x, __sinptr, __cosptr);
+#else
  *__sinptr =
      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
  *__cosptr = __tmp;
+#endif
 }

 __DEVICE__
@ -500,7 +671,7 @@ void sincospif(float __x, float *__sinptr, float *__cosptr) {
 }

 __DEVICE__
-float sinf(float __x) { return __ocml_sin_f32(__x); }
+float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); }

 __DEVICE__
 float sinhf(float __x) { return __ocml_sinh_f32(__x); }
@ -509,7 +680,7 @@ __DEVICE__
 float sinpif(float __x) { return __ocml_sinpi_f32(__x); }

 __DEVICE__
-float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
+float sqrtf(float __x) { return __builtin_sqrtf(__x); }

 __DEVICE__
 float tanf(float __x) { return __ocml_tan_f32(__x); }
@ -551,158 +722,7 @@ float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication
  return __x1;
 }

-// BEGIN INTRINSICS

-__DEVICE__
-float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
-
-__DEVICE__
-float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
-
-__DEVICE__
-float __expf(float __x) { return __ocml_native_exp_f32(__x); }
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }
-__DEVICE__
-float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }
-__DEVICE__
-float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }
-__DEVICE__
-float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fadd_rn(float __x, float __y) { return __x + __y; }
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }
-__DEVICE__
-float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }
-__DEVICE__
-float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }
-__DEVICE__
-float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fdiv_rn(float __x, float __y) { return __x / __y; }
-#endif
-
-__DEVICE__
-float __fdividef(float __x, float __y) { return __x / __y; }
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fmaf_rd(float __x, float __y, float __z) {
-  return __ocml_fma_rtn_f32(__x, __y, __z);
-}
-__DEVICE__
-float __fmaf_rn(float __x, float __y, float __z) {
-  return __ocml_fma_rte_f32(__x, __y, __z);
-}
-__DEVICE__
-float __fmaf_ru(float __x, float __y, float __z) {
-  return __ocml_fma_rtp_f32(__x, __y, __z);
-}
-__DEVICE__
-float __fmaf_rz(float __x, float __y, float __z) {
-  return __ocml_fma_rtz_f32(__x, __y, __z);
-}
-#else
-__DEVICE__
-float __fmaf_rn(float __x, float __y, float __z) {
-  return __builtin_fmaf(__x, __y, __z);
-}
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }
-__DEVICE__
-float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }
-__DEVICE__
-float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }
-__DEVICE__
-float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fmul_rn(float __x, float __y) { return __x * __y; }
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }
-__DEVICE__
-float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }
-__DEVICE__
-float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }
-__DEVICE__
-float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }
-#else
-__DEVICE__
-float __frcp_rn(float __x) { return 1.0f / __x; }
-#endif
-
-__DEVICE__
-float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); }
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
-__DEVICE__
-float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }
-__DEVICE__
-float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
-__DEVICE__
-float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
-#else
-__DEVICE__
-float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }
-__DEVICE__
-float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }
-__DEVICE__
-float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }
-__DEVICE__
-float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fsub_rn(float __x, float __y) { return __x - __y; }
-#endif
-
-__DEVICE__
-float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
-
-__DEVICE__
-float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
-
-__DEVICE__
-float __logf(float __x) { return __ocml_native_log_f32(__x); }
-
-__DEVICE__
-float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
-
-__DEVICE__
-float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
-
-__DEVICE__
-void __sincosf(float __x, float *__sinptr, float *__cosptr) {
-  *__sinptr = __ocml_native_sin_f32(__x);
-  *__cosptr = __ocml_native_cos_f32(__x);
-}
-
-__DEVICE__
-float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
-
-__DEVICE__
-float __tanf(float __x) { return __ocml_tan_f32(__x); }
-// END INTRINSICS
 // END FLOAT

 // BEGIN DOUBLE
@ -941,7 +961,7 @@ double norm(int __dim,
    ++__a;
  }

-  return __ocml_sqrt_f64(__r);
+  return __builtin_sqrt(__r);
 }

 __DEVICE__
@ -1064,7 +1084,7 @@ __DEVICE__
 double sinpi(double __x) { return __ocml_sinpi_f64(__x); }

 __DEVICE__
-double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
+double sqrt(double __x) { return __builtin_sqrt(__x); }

 __DEVICE__
 double tan(double __x) { return __ocml_tan_f64(__x); }
@ -1198,7 +1218,7 @@ __DEVICE__
 double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
 #else
 __DEVICE__
-double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
+double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); }
 #endif

 #if defined OCML_BASIC_ROUNDED_OPERATIONS
@ -1288,16 +1308,17 @@ double min(double __x, double __y) { return __builtin_fmin(__x, __y); }

 #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 __host__ inline static int min(int __arg1, int __arg2) {
-  return std::min(__arg1, __arg2);
+  return __arg1 < __arg2 ? __arg1 : __arg2;
 }

 __host__ inline static int max(int __arg1, int __arg2) {
-  return std::max(__arg1, __arg2);
+  return __arg1 > __arg2 ? __arg1 : __arg2;
 }
 #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 #endif

 #pragma pop_macro("__DEVICE__")
 #pragma pop_macro("__RETURN_TYPE")
+#pragma pop_macro("__FAST_OR_SLOW")

 #endif // __CLANG_HIP_MATH_H__
--- a/lib/include/__clang_hip_runtime_wrapper.h
+++ b/lib/include/__clang_hip_runtime_wrapper.h
@ -46,6 +46,67 @@ extern "C" {
 }
 #endif //__cplusplus

+#if !defined(__HIPCC_RTC__)
+#if __has_include("hip/hip_version.h")
+#include "hip/hip_version.h"
+#endif // __has_include("hip/hip_version.h")
+#endif // __HIPCC_RTC__
+
+typedef __SIZE_TYPE__ __hip_size_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif //__cplusplus
+
+#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
+__device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
+__device__ void __ockl_dm_dealloc(unsigned long long __addr);
+#if __has_feature(address_sanitizer)
+__device__ unsigned long long __asan_malloc_impl(unsigned long long __size,
+                                                 unsigned long long __pc);
+__device__ void __asan_free_impl(unsigned long long __addr,
+                                 unsigned long long __pc);
+__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  return (void *)__asan_malloc_impl(__size, __pc);
+}
+__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  __asan_free_impl((unsigned long long)__ptr, __pc);
+}
+#else // __has_feature(address_sanitizer)
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  return (void *) __ockl_dm_alloc(__size);
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __ockl_dm_dealloc((unsigned long long)__ptr);
+}
+#endif // __has_feature(address_sanitizer)
+#else  // HIP version check
+#if __HIP_ENABLE_DEVICE_MALLOC__
+__device__ void *__hip_malloc(__hip_size_t __size);
+__device__ void *__hip_free(void *__ptr);
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  return __hip_malloc(__size);
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __hip_free(__ptr);
+}
+#else  // __HIP_ENABLE_DEVICE_MALLOC__
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  __builtin_trap();
+  return (void *)0;
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __builtin_trap();
+}
+#endif // __HIP_ENABLE_DEVICE_MALLOC__
+#endif // HIP version check
+
+#ifdef __cplusplus
+} // extern "C"
+#endif //__cplusplus
+
 #if !defined(__HIPCC_RTC__)
 #include <cmath>
 #include <cstdlib>
@ -71,59 +132,6 @@ typedef __SIZE_TYPE__ size_t;
 #define INT_MAX __INTMAX_MAX__
 #endif // __HIPCC_RTC__

-typedef __SIZE_TYPE__ __hip_size_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif //__cplusplus
-
-#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
-extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
-extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
-#if __has_feature(address_sanitizer)
-extern "C" __device__ unsigned long long __asan_malloc_impl(unsigned long long __size, unsigned long long __pc);
-extern "C" __device__ void __asan_free_impl(unsigned long long __addr, unsigned long long __pc);
-__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
-  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
-  return (void *)__asan_malloc_impl(__size, __pc);
-}
-__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
-  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
-  __asan_free_impl((unsigned long long)__ptr, __pc);
-}
-#else
-__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
-  return (void *) __ockl_dm_alloc(__size);
-}
-__attribute__((weak)) inline __device__ void free(void *__ptr) {
-  __ockl_dm_dealloc((unsigned long long)__ptr);
-}
-#endif // __has_feature(address_sanitizer)
-#else  // HIP version check
-#if __HIP_ENABLE_DEVICE_MALLOC__
-__device__ void *__hip_malloc(__hip_size_t __size);
-__device__ void *__hip_free(void *__ptr);
-__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
-  return __hip_malloc(__size);
-}
-__attribute__((weak)) inline __device__ void free(void *__ptr) {
-  __hip_free(__ptr);
-}
-#else
-__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
-  __builtin_trap();
-  return (void *)0;
-}
-__attribute__((weak)) inline __device__ void free(void *__ptr) {
-  __builtin_trap();
-}
-#endif
-#endif // HIP version check
-
-#ifdef __cplusplus
-} // extern "C"
-#endif //__cplusplus
-
 #include <__clang_hip_libdevice_declares.h>
 #include <__clang_hip_math.h>
 #include <__clang_hip_stdlib.h>
--- a/lib/include/stdarg_gnuc_va_list.h
+++ b/lib/include/stdarg_gnuc_va_list.h
@ -0,0 +1,13 @@
+/*===---- __stdarg___gnuc_va_list.h - Definition of __gnuc_va_list ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST
+typedef __builtin_va_list __gnuc_va_list;
+#endif
--- a/lib/include/stdarg_va_copy.h
+++ b/lib/include/stdarg_va_copy.h
@ -0,0 +1,12 @@
+/*===---- __stdarg___va_copy.h - Definition of __va_copy -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __va_copy
+#define __va_copy(d, s) __builtin_va_copy(d, s)
+#endif
--- a/lib/include/__stdarg_va_arg.h
+++ b/lib/include/__stdarg_va_arg.h
@ -0,0 +1,22 @@
+/*===---- __stdarg_va_arg.h - Definitions of va_start, va_arg, va_end-------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef va_arg
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+/* C23 does not require the second parameter for va_start. */
+#define va_start(ap, ...) __builtin_va_start(ap, 0)
+#else
+/* Versions before C23 do require the second parameter. */
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#define va_end(ap) __builtin_va_end(ap)
+#define va_arg(ap, type) __builtin_va_arg(ap, type)
+
+#endif
--- a/lib/include/__stdarg_va_copy.h
+++ b/lib/include/__stdarg_va_copy.h
@ -0,0 +1,12 @@
+/*===---- __stdarg_va_copy.h - Definition of va_copy------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef va_copy
+#define va_copy(dest, src) __builtin_va_copy(dest, src)
+#endif
--- a/lib/include/__stdarg_va_list.h
+++ b/lib/include/__stdarg_va_list.h
@ -0,0 +1,13 @@
+/*===---- __stdarg_va_list.h - Definition of va_list -----------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _VA_LIST
+#define _VA_LIST
+typedef __builtin_va_list va_list;
+#endif
--- a/lib/include/__stddef_max_align_t.h
+++ b/lib/include/__stddef_max_align_t.h
@ -1,4 +1,4 @@
-/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+/*===---- __stddef_max_align_t.h - Definition of max_align_t ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
--- a/lib/include/__stddef_null.h
+++ b/lib/include/__stddef_null.h
@ -0,0 +1,29 @@
+/*===---- __stddef_null.h - Definition of NULL -----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(NULL) || !__building_module(_Builtin_stddef)
+
+/* linux/stddef.h will define NULL to 0. glibc (and other) headers then define
+ * __need_NULL and rely on stddef.h to redefine NULL to the correct value again.
+ * Modules don't support redefining macros like that, but support that pattern
+ * in the non-modules case.
+ */
+#undef NULL
+
+#ifdef __cplusplus
+#if !defined(__MINGW32__) && !defined(_MSC_VER)
+#define NULL __null
+#else
+#define NULL 0
+#endif
+#else
+#define NULL ((void*)0)
+#endif
+
+#endif
--- a/lib/include/__stddef_nullptr_t.h
+++ b/lib/include/__stddef_nullptr_t.h
@ -0,0 +1,29 @@
+/*===---- __stddef_nullptr_t.h - Definition of nullptr_t -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_NULLPTR_T) ||                                                    \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _NULLPTR_T
+
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std {
+typedef decltype(nullptr) nullptr_t;
+}
+using ::std::nullptr_t;
+#endif
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+typedef typeof(nullptr) nullptr_t;
+#endif
+
+#endif
--- a/lib/include/__stddef_offsetof.h
+++ b/lib/include/__stddef_offsetof.h
@ -0,0 +1,17 @@
+/*===---- __stddef_offsetof.h - Definition of offsetof ---------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(offsetof) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#endif
--- a/lib/include/__stddef_ptrdiff_t.h
+++ b/lib/include/__stddef_ptrdiff_t.h
@ -0,0 +1,20 @@
+/*===---- __stddef_ptrdiff_t.h - Definition of ptrdiff_t -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_PTRDIFF_T) ||                                                    \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _PTRDIFF_T
+
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+#endif
--- a/lib/include/__stddef_rsize_t.h
+++ b/lib/include/__stddef_rsize_t.h
@ -0,0 +1,20 @@
+/*===---- __stddef_rsize_t.h - Definition of rsize_t -----------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_RSIZE_T) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _RSIZE_T
+
+typedef __SIZE_TYPE__ rsize_t;
+
+#endif
--- a/lib/include/__stddef_size_t.h
+++ b/lib/include/__stddef_size_t.h
@ -0,0 +1,20 @@
+/*===---- __stddef_size_t.h - Definition of size_t -------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_SIZE_T) ||                                                       \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _SIZE_T
+
+typedef __SIZE_TYPE__ size_t;
+
+#endif
--- a/lib/include/__stddef_unreachable.h
+++ b/lib/include/__stddef_unreachable.h
@ -0,0 +1,21 @@
+/*===---- __stddef_unreachable.h - Definition of unreachable ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __cplusplus
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(unreachable) ||                                                   \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define unreachable() __builtin_unreachable()
+#endif
+
+#endif
--- a/lib/include/__stddef_wchar_t.h
+++ b/lib/include/__stddef_wchar_t.h
@ -0,0 +1,28 @@
+/*===---- __stddef_wchar.h - Definition of wchar_t -------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_WCHAR_T) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _WCHAR_T
+
+#ifdef _MSC_EXTENSIONS
+#define _WCHAR_T_DEFINED
+#endif
+
+typedef __WCHAR_TYPE__ wchar_t;
+
+#endif
+
+#endif
--- a/lib/include/__stddef_wint_t.h
+++ b/lib/include/__stddef_wint_t.h
@ -0,0 +1,15 @@
+/*===---- __stddef_wint.h - Definition of wint_t ---------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WINT_T
+#define _WINT_T
+
+typedef __WINT_TYPE__ wint_t;
+
+#endif
--- a/lib/include/adcintrin.h
+++ b/lib/include/adcintrin.h
@ -0,0 +1,160 @@
+/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ADCINTRIN_H
+#define __ADCINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Use C++ inline semantics in C++, GNU inline for C mode. */
+#if defined(__cplusplus)
+#define __INLINE __inline
+#else
+#define __INLINE static __inline
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
+///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
+///    at \a __p, and returns the 8-bit carry-out (carry flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store32(__p, __x + __y + temp)
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    A 32-bit unsigned addend.
+/// \param __y
+///    A 32-bit unsigned addend.
+/// \param __p
+///    Pointer to memory for storing the sum.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
+                                                        unsigned int __x,
+                                                        unsigned int __y,
+                                                        unsigned int *__p) {
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
+///    flag \a __cf, and subtracts the result from unsigned 32-bit integer
+///    \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
+///    and returns the 8-bit carry-out (carry or overflow flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store32(__p, __x - (__y + temp))
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c SBB instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    The 32-bit unsigned minuend.
+/// \param __y
+///    The 32-bit unsigned subtrahend.
+/// \param __p
+///    Pointer to memory for storing the difference.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
+                                                         unsigned int __x,
+                                                         unsigned int __y,
+                                                         unsigned int *__p) {
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
+///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
+///    at \a __p, and returns the 8-bit carry-out (carry flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store64(__p, __x + __y + temp)
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    A 64-bit unsigned addend.
+/// \param __y
+///    A 64-bit unsigned addend.
+/// \param __p
+///    Pointer to memory for storing the sum.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long *__p) {
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+
+/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
+///    flag \a __cf, and subtracts the result from unsigned 64-bit integer
+///    \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
+///    and returns the 8-bit carry-out (carry or overflow flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store64(__p, __x - (__y + temp))
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    The 64-bit unsigned minuend.
+/// \param __y
+///    The 64-bit unsigned subtrahend.
+/// \param __p
+///    Pointer to memory for storing the difference.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long *__p) {
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#undef __INLINE
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADCINTRIN_H */
--- a/lib/include/adxintrin.h
+++ b/lib/include/adxintrin.h
@ -15,7 +15,8 @@
 #define __ADXINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("adx")))

 /* Use C++ inline semantics in C++, GNU inline for C mode. */
 #if defined(__cplusplus)
@ -53,10 +54,10 @@ extern "C" {
 /// \param __p
 ///    Pointer to memory for storing the sum.
 /// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char
-    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-    _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-                   unsigned int *__p) {
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
+                                                         unsigned int __x,
+                                                         unsigned int __y,
+                                                         unsigned int *__p) {
  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
 }

@ -84,137 +85,10 @@ __INLINE unsigned char
 /// \param __p
 ///    Pointer to memory for storing the sum.
 /// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char
-    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-    _addcarryx_u64(unsigned char __cf, unsigned long long __x,
-                   unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-/* Intrinsics that are also available if __ADX__ is undefined. */
-
-/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 32-bit unsigned addend.
-/// \param __y
-///    A 32-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
-                                                        unsigned int __x,
-                                                        unsigned int __y,
-                                                        unsigned int *__p) {
-  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 64-bit unsigned addend.
-/// \param __y
-///    A 64-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
 __INLINE unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u64(unsigned char __cf, unsigned long long __x,
-              unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
-///    flag \a __cf, and subtracts the result from unsigned 32-bit integer
-///    \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
-///    and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SBB instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    The 32-bit unsigned minuend.
-/// \param __y
-///    The 32-bit unsigned subtrahend.
-/// \param __p
-///    Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
-                                                         unsigned int __x,
-                                                         unsigned int __y,
-                                                         unsigned int *__p) {
-  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
-///    flag \a __cf, and subtracts the result from unsigned 64-bit integer
-///    \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
-///    and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    The 64-bit unsigned minuend.
-/// \param __y
-///    The 64-bit unsigned subtrahend.
-/// \param __p
-///    Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u64(unsigned char __cf, unsigned long long __x,
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif

@ -222,6 +96,7 @@ _subborrow_u64(unsigned char __cf, unsigned long long __x,
 }
 #endif

+#undef __INLINE
 #undef __DEFAULT_FN_ATTRS

 #endif /* __ADXINTRIN_H */
--- a/lib/include/altivec.h
+++ b/lib/include/altivec.h
@ -14647,67 +14647,86 @@ static __inline__ void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,

 static __inline__ vector signed char __ATTRS_o_ai vec_promote(signed char __a,
                                                              int __b) {
-  vector signed char __res = (vector signed char)(0);
-  __res[__b & 0x7] = __a;
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1,
+                              -1, -1, -1, -1, -1, -1, -1, -1);
+  __res[__b & 0xf] = __a;
  return __res;
 }

 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_promote(unsigned char __a, int __b) {
-  vector unsigned char __res = (vector unsigned char)(0);
-  __res[__b & 0x7] = __a;
+  const vector unsigned char __zero = (vector unsigned char)(0);
+  vector unsigned char __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1,
+                              -1, -1, -1, -1, -1, -1, -1, -1);
+  __res[__b & 0xf] = __a;
  return __res;
 }

 static __inline__ vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
-  vector short __res = (vector short)(0);
+  const vector short __zero = (vector short)(0);
+  vector short __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1);
  __res[__b & 0x7] = __a;
  return __res;
 }

 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_promote(unsigned short __a, int __b) {
-  vector unsigned short __res = (vector unsigned short)(0);
+  const vector unsigned short __zero = (vector unsigned short)(0);
+  vector unsigned short __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1);
  __res[__b & 0x7] = __a;
  return __res;
 }

 static __inline__ vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
-  vector int __res = (vector int)(0);
+  const vector int __zero = (vector int)(0);
+  vector int __res = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
  __res[__b & 0x3] = __a;
  return __res;
 }

 static __inline__ vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a,
                                                               int __b) {
-  vector unsigned int __res = (vector unsigned int)(0);
+  const vector unsigned int __zero = (vector unsigned int)(0);
+  vector unsigned int __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
  __res[__b & 0x3] = __a;
  return __res;
 }

 static __inline__ vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
-  vector float __res = (vector float)(0);
+  const vector float __zero = (vector float)(0);
+  vector float __res = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
  __res[__b & 0x3] = __a;
  return __res;
 }

 #ifdef __VSX__
 static __inline__ vector double __ATTRS_o_ai vec_promote(double __a, int __b) {
-  vector double __res = (vector double)(0);
+  const vector double __zero = (vector double)(0);
+  vector double __res = __builtin_shufflevector(__zero, __zero, -1, -1);
  __res[__b & 0x1] = __a;
  return __res;
 }

 static __inline__ vector signed long long __ATTRS_o_ai
 vec_promote(signed long long __a, int __b) {
-  vector signed long long __res = (vector signed long long)(0);
+  const vector signed long long __zero = (vector signed long long)(0);
+  vector signed long long __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1);
  __res[__b & 0x1] = __a;
  return __res;
 }

 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_promote(unsigned long long __a, int __b) {
-  vector unsigned long long __res = (vector unsigned long long)(0);
+  const vector unsigned long long __zero = (vector unsigned long long)(0);
+  vector unsigned long long __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1);
  __res[__b & 0x1] = __a;
  return __res;
 }
--- a/lib/include/ammintrin.h
+++ b/lib/include/ammintrin.h
@ -155,9 +155,9 @@ _mm_insert_si64(__m128i __x, __m128i __y)
 /// \param __a
 ///    The 64-bit double-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_sd(double *__p, __m128d __a)
+_mm_stream_sd(void *__p, __m128d __a)
 {
-  __builtin_ia32_movntsd(__p, (__v2df)__a);
+  __builtin_ia32_movntsd((double *)__p, (__v2df)__a);
 }

 /// Stores a 32-bit single-precision floating-point value in a 32-bit
@ -173,9 +173,9 @@ _mm_stream_sd(double *__p, __m128d __a)
 /// \param __a
 ///    The 32-bit single-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ss(float *__p, __m128 __a)
+_mm_stream_ss(void *__p, __m128 __a)
 {
-  __builtin_ia32_movntss(__p, (__v4sf)__a);
+  __builtin_ia32_movntss((float *)__p, (__v4sf)__a);
 }

 #undef __DEFAULT_FN_ATTRS
--- a/lib/include/arm_acle.h
+++ b/lib/include/arm_acle.h
@ -4,6 +4,13 @@
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
+ * The Arm C Language Extensions specifications can be found in the following
+ * link: https://github.com/ARM-software/acle/releases
+ *
+ * The ACLE section numbers are subject to change. When consulting the
+ * specifications, it is recommended to search using section titles if
+ * the section numbers look outdated.
+ *
 *===-----------------------------------------------------------------------===
 */

@ -20,8 +27,8 @@
 extern "C" {
 #endif

-/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
-/* 8.3 Memory barriers */
+/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 7.3 Memory barriers */
 #if !__has_builtin(__dmb)
 #define __dmb(i) __builtin_arm_dmb(i)
 #endif
@ -32,7 +39,7 @@ extern "C" {
 #define __isb(i) __builtin_arm_isb(i)
 #endif

-/* 8.4 Hints */
+/* 7.4 Hints */

 #if !__has_builtin(__wfi)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
@ -68,7 +75,7 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
 #define __dbg(t) __builtin_arm_dbg(t)
 #endif

-/* 8.5 Swap */
+/* 7.5 Swap */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t v;
@ -78,8 +85,8 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
  return v;
 }

-/* 8.6 Memory prefetch intrinsics */
-/* 8.6.1 Data prefetch */
+/* 7.6 Memory prefetch intrinsics */
+/* 7.6.1 Data prefetch */
 #define __pld(addr) __pldx(0, 0, 0, addr)

 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
@ -90,7 +97,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
 #endif

-/* 8.6.2 Instruction prefetch */
+/* 7.6.2 Instruction prefetch */
 #define __pli(addr) __plix(0, 0, addr)

 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
@ -101,15 +108,15 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
 #endif

-/* 8.7 NOP */
+/* 7.7 NOP */
 #if !defined(_MSC_VER) || !defined(__aarch64__)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
 }
 #endif

-/* 9 DATA-PROCESSING INTRINSICS */
-/* 9.2 Miscellaneous data-processing intrinsics */
+/* 8 DATA-PROCESSING INTRINSICS */
+/* 8.2 Miscellaneous data-processing intrinsics */
 /* ROR */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __ror(uint32_t __x, uint32_t __y) {
@ -248,9 +255,7 @@ __rbitl(unsigned long __t) {
 #endif
 }

-/*
- * 9.3 16-bit multiplications
- */
+/* 8.3 16-bit multiplications */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
 __smulbb(int32_t __a, int32_t __b) {
@ -279,18 +284,18 @@ __smulwt(int32_t __a, int32_t __b) {
 #endif

 /*
- * 9.4 Saturating intrinsics
+ * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
-/* 9.4.1 Width-specified saturation intrinsics */
+/* 8.4.1 Width-specified saturation intrinsics */
 #if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
 #define __ssat(x, y) __builtin_arm_ssat(x, y)
 #define __usat(x, y) __builtin_arm_usat(x, y)
 #endif

-/* 9.4.2 Saturating addition and subtraction intrinsics */
+/* 8.4.2 Saturating addition and subtraction intrinsics */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __qadd(int32_t __t, int32_t __v) {
@ -308,7 +313,7 @@ __qdbl(int32_t __t) {
 }
 #endif

-/* 9.4.3 Accumultating multiplications */
+/* 8.4.3 Accumultating multiplications */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __smlabb(int32_t __a, int32_t __b, int32_t __c) {
@ -337,13 +342,13 @@ __smlawt(int32_t __a, int32_t __b, int32_t __c) {
 #endif


-/* 9.5.4 Parallel 16-bit saturation */
+/* 8.5.4 Parallel 16-bit saturation */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 #define __ssat16(x, y) __builtin_arm_ssat16(x, y)
 #define __usat16(x, y) __builtin_arm_usat16(x, y)
 #endif

-/* 9.5.5 Packing and unpacking */
+/* 8.5.5 Packing and unpacking */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 typedef int32_t int8x4_t;
 typedef int32_t int16x2_t;
@ -368,7 +373,7 @@ __uxtb16(int8x4_t __a) {
 }
 #endif

-/* 9.5.6 Parallel selection */
+/* 8.5.6 Parallel selection */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
 __sel(uint8x4_t __a, uint8x4_t __b) {
@ -376,7 +381,7 @@ __sel(uint8x4_t __a, uint8x4_t __b) {
 }
 #endif

-/* 9.5.7 Parallel 8-bit addition and subtraction */
+/* 8.5.7 Parallel 8-bit addition and subtraction */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
 __qadd8(int8x4_t __a, int8x4_t __b) {
@ -428,7 +433,7 @@ __usub8(uint8x4_t __a, uint8x4_t __b) {
 }
 #endif

-/* 9.5.8 Sum of 8-bit absolute differences */
+/* 8.5.8 Sum of 8-bit absolute differences */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __usad8(uint8x4_t __a, uint8x4_t __b) {
@ -440,7 +445,7 @@ __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
 }
 #endif

-/* 9.5.9 Parallel 16-bit addition and subtraction */
+/* 8.5.9 Parallel 16-bit addition and subtraction */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
 __qadd16(int16x2_t __a, int16x2_t __b) {
@ -540,7 +545,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
 }
 #endif

-/* 9.5.10 Parallel 16-bit multiplications */
+/* 8.5.10 Parallel 16-bit multiplications */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
@ -592,7 +597,22 @@ __smusdx(int16x2_t __a, int16x2_t __b) {
 }
 #endif

-/* 9.7 CRC32 intrinsics */
+/* 8.6 Floating-point data-processing intrinsics */
+#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING)    &&                         \
+  (__ARM_FEATURE_DIRECTED_ROUNDING))             &&                         \
+  (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+__rintn(double __a) {
+  return __builtin_roundeven(__a);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+__rintnf(float __a) {
+  return __builtin_roundevenf(__a);
+}
+#endif
+
+/* 8.8 CRC32 intrinsics */
 #if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) ||                   \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
@ -636,6 +656,7 @@ __crc32cd(uint32_t __a, uint64_t __b) {
 }
 #endif

+/* 8.6 Floating-point data-processing intrinsics */
 /* Armv8.3-A Javascript conversion intrinsic */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
@ -687,7 +708,7 @@ __rint64x(double __a) {
 }
 #endif

-/* Armv8.7-A load/store 64-byte intrinsics */
+/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 typedef struct {
    uint64_t val[8];
@ -713,7 +734,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 }
 #endif

-/* 10.1 Special register intrinsics */
+/* 11.1 Special register intrinsics */
 #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
 #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
 #define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
@ -727,7 +748,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
 #define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))

-/* Memory Tagging Extensions (MTE) Intrinsics */
+/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 #define __arm_mte_create_random_tag(__ptr, __mask)  __builtin_arm_irg(__ptr, __mask)
 #define __arm_mte_increment_tag(__ptr, __tag_offset)  __builtin_arm_addg(__ptr, __tag_offset)
@ -736,12 +757,71 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

-/* Memory Operations Intrinsics */
+/* 18 Memory Operations Intrinsics */
 #define __arm_mops_memset_tag(__tagged_address, __value, __size)    \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
 #endif

-/* Transactional Memory Extension (TME) Intrinsics */
+/* 11.3 Coprocessor Intrinsics */
+#if defined(__ARM_FEATURE_COPROC)
+
+#if (__ARM_FEATURE_COPROC & 0x1)
+
+#if (__ARM_ARCH < 8)
+#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)                           \
+  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
+#endif /* __ARM_ARCH < 8 */
+
+#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
+#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)
+
+#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2)                         \
+  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
+#define __arm_mrc(coproc, opc1, CRn, CRm, opc2)                                \
+  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)
+
+#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
+#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
+#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
+#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH != 8) */
+
+#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
+#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)                           \
+  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
+#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
+#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
+#endif /* ___ARM_ARCH_8M_MAIN__ */
+
+#endif /* __ARM_FEATURE_COPROC & 0x1 */
+
+#if (__ARM_FEATURE_COPROC & 0x2)
+#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)                          \
+  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
+#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
+#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
+#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
+#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
+#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)                        \
+  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
+#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2)                               \
+  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
+#endif
+
+#if (__ARM_FEATURE_COPROC & 0x4)
+#define __arm_mcrr(coproc, opc1, value, CRm)                                   \
+  __builtin_arm_mcrr(coproc, opc1, value, CRm)
+#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
+#endif
+
+#if (__ARM_FEATURE_COPROC & 0x8)
+#define __arm_mcrr2(coproc, opc1, value, CRm)                                  \
+  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
+#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
+#endif
+
+#endif // __ARM_FEATURE_COPROC
+
+/* 17 Transactional Memory Extension (TME) Intrinsics */
 #if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

 #define _TMFAILURE_REASON  0x00007fffu
@ -763,7 +843,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {

 #endif /* __ARM_FEATURE_TME */

-/* Armv8.5-A Random number generation intrinsics */
+/* 8.7 Armv8.5-A Random number generation intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
 __rndr(uint64_t *__p) {
--- a/lib/include/arm_neon.h
+++ b/lib/include/arm_neon.h
@ -35,12 +35,7 @@
 #include <stdint.h>

 #include <arm_bf16.h>
-typedef float float32_t;
-typedef __fp16 float16_t;
-#ifdef __aarch64__
-typedef double float64_t;
-#endif
-
+#include <arm_vector_types.h>
 #ifdef __aarch64__
 typedef uint8_t poly8_t;
 typedef uint16_t poly16_t;
@ -51,30 +46,6 @@ typedef int8_t poly8_t;
 typedef int16_t poly16_t;
 typedef int64_t poly64_t;
 #endif
-typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
-typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
-typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
-typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
-typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
-typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
-typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
-typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
-typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
-typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
-typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
-typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
-typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
-typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
-typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
-typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
-typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
-typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
-typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
-typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
-#ifdef __aarch64__
-typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
-typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
-#endif
 typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t;
 typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
 typedef __attribute__((neon_polyvector_type(4))) poly16_t poly16x4_t;
@ -82,96 +53,6 @@ typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
 typedef __attribute__((neon_polyvector_type(1))) poly64_t poly64x1_t;
 typedef __attribute__((neon_polyvector_type(2))) poly64_t poly64x2_t;

-typedef struct int8x8x2_t {
-  int8x8_t val[2];
-} int8x8x2_t;
-
-typedef struct int8x16x2_t {
-  int8x16_t val[2];
-} int8x16x2_t;
-
-typedef struct int16x4x2_t {
-  int16x4_t val[2];
-} int16x4x2_t;
-
-typedef struct int16x8x2_t {
-  int16x8_t val[2];
-} int16x8x2_t;
-
-typedef struct int32x2x2_t {
-  int32x2_t val[2];
-} int32x2x2_t;
-
-typedef struct int32x4x2_t {
-  int32x4_t val[2];
-} int32x4x2_t;
-
-typedef struct int64x1x2_t {
-  int64x1_t val[2];
-} int64x1x2_t;
-
-typedef struct int64x2x2_t {
-  int64x2_t val[2];
-} int64x2x2_t;
-
-typedef struct uint8x8x2_t {
-  uint8x8_t val[2];
-} uint8x8x2_t;
-
-typedef struct uint8x16x2_t {
-  uint8x16_t val[2];
-} uint8x16x2_t;
-
-typedef struct uint16x4x2_t {
-  uint16x4_t val[2];
-} uint16x4x2_t;
-
-typedef struct uint16x8x2_t {
-  uint16x8_t val[2];
-} uint16x8x2_t;
-
-typedef struct uint32x2x2_t {
-  uint32x2_t val[2];
-} uint32x2x2_t;
-
-typedef struct uint32x4x2_t {
-  uint32x4_t val[2];
-} uint32x4x2_t;
-
-typedef struct uint64x1x2_t {
-  uint64x1_t val[2];
-} uint64x1x2_t;
-
-typedef struct uint64x2x2_t {
-  uint64x2_t val[2];
-} uint64x2x2_t;
-
-typedef struct float16x4x2_t {
-  float16x4_t val[2];
-} float16x4x2_t;
-
-typedef struct float16x8x2_t {
-  float16x8_t val[2];
-} float16x8x2_t;
-
-typedef struct float32x2x2_t {
-  float32x2_t val[2];
-} float32x2x2_t;
-
-typedef struct float32x4x2_t {
-  float32x4_t val[2];
-} float32x4x2_t;
-
-#ifdef __aarch64__
-typedef struct float64x1x2_t {
-  float64x1_t val[2];
-} float64x1x2_t;
-
-typedef struct float64x2x2_t {
-  float64x2_t val[2];
-} float64x2x2_t;
-
-#endif
 typedef struct poly8x8x2_t {
  poly8x8_t val[2];
 } poly8x8x2_t;
@ -196,96 +77,6 @@ typedef struct poly64x2x2_t {
  poly64x2_t val[2];
 } poly64x2x2_t;

-typedef struct int8x8x3_t {
-  int8x8_t val[3];
-} int8x8x3_t;
-
-typedef struct int8x16x3_t {
-  int8x16_t val[3];
-} int8x16x3_t;
-
-typedef struct int16x4x3_t {
-  int16x4_t val[3];
-} int16x4x3_t;
-
-typedef struct int16x8x3_t {
-  int16x8_t val[3];
-} int16x8x3_t;
-
-typedef struct int32x2x3_t {
-  int32x2_t val[3];
-} int32x2x3_t;
-
-typedef struct int32x4x3_t {
-  int32x4_t val[3];
-} int32x4x3_t;
-
-typedef struct int64x1x3_t {
-  int64x1_t val[3];
-} int64x1x3_t;
-
-typedef struct int64x2x3_t {
-  int64x2_t val[3];
-} int64x2x3_t;
-
-typedef struct uint8x8x3_t {
-  uint8x8_t val[3];
-} uint8x8x3_t;
-
-typedef struct uint8x16x3_t {
-  uint8x16_t val[3];
-} uint8x16x3_t;
-
-typedef struct uint16x4x3_t {
-  uint16x4_t val[3];
-} uint16x4x3_t;
-
-typedef struct uint16x8x3_t {
-  uint16x8_t val[3];
-} uint16x8x3_t;
-
-typedef struct uint32x2x3_t {
-  uint32x2_t val[3];
-} uint32x2x3_t;
-
-typedef struct uint32x4x3_t {
-  uint32x4_t val[3];
-} uint32x4x3_t;
-
-typedef struct uint64x1x3_t {
-  uint64x1_t val[3];
-} uint64x1x3_t;
-
-typedef struct uint64x2x3_t {
-  uint64x2_t val[3];
-} uint64x2x3_t;
-
-typedef struct float16x4x3_t {
-  float16x4_t val[3];
-} float16x4x3_t;
-
-typedef struct float16x8x3_t {
-  float16x8_t val[3];
-} float16x8x3_t;
-
-typedef struct float32x2x3_t {
-  float32x2_t val[3];
-} float32x2x3_t;
-
-typedef struct float32x4x3_t {
-  float32x4_t val[3];
-} float32x4x3_t;
-
-#ifdef __aarch64__
-typedef struct float64x1x3_t {
-  float64x1_t val[3];
-} float64x1x3_t;
-
-typedef struct float64x2x3_t {
-  float64x2_t val[3];
-} float64x2x3_t;
-
-#endif
 typedef struct poly8x8x3_t {
  poly8x8_t val[3];
 } poly8x8x3_t;
@ -310,96 +101,6 @@ typedef struct poly64x2x3_t {
  poly64x2_t val[3];
 } poly64x2x3_t;

-typedef struct int8x8x4_t {
-  int8x8_t val[4];
-} int8x8x4_t;
-
-typedef struct int8x16x4_t {
-  int8x16_t val[4];
-} int8x16x4_t;
-
-typedef struct int16x4x4_t {
-  int16x4_t val[4];
-} int16x4x4_t;
-
-typedef struct int16x8x4_t {
-  int16x8_t val[4];
-} int16x8x4_t;
-
-typedef struct int32x2x4_t {
-  int32x2_t val[4];
-} int32x2x4_t;
-
-typedef struct int32x4x4_t {
-  int32x4_t val[4];
-} int32x4x4_t;
-
-typedef struct int64x1x4_t {
-  int64x1_t val[4];
-} int64x1x4_t;
-
-typedef struct int64x2x4_t {
-  int64x2_t val[4];
-} int64x2x4_t;
-
-typedef struct uint8x8x4_t {
-  uint8x8_t val[4];
-} uint8x8x4_t;
-
-typedef struct uint8x16x4_t {
-  uint8x16_t val[4];
-} uint8x16x4_t;
-
-typedef struct uint16x4x4_t {
-  uint16x4_t val[4];
-} uint16x4x4_t;
-
-typedef struct uint16x8x4_t {
-  uint16x8_t val[4];
-} uint16x8x4_t;
-
-typedef struct uint32x2x4_t {
-  uint32x2_t val[4];
-} uint32x2x4_t;
-
-typedef struct uint32x4x4_t {
-  uint32x4_t val[4];
-} uint32x4x4_t;
-
-typedef struct uint64x1x4_t {
-  uint64x1_t val[4];
-} uint64x1x4_t;
-
-typedef struct uint64x2x4_t {
-  uint64x2_t val[4];
-} uint64x2x4_t;
-
-typedef struct float16x4x4_t {
-  float16x4_t val[4];
-} float16x4x4_t;
-
-typedef struct float16x8x4_t {
-  float16x8_t val[4];
-} float16x8x4_t;
-
-typedef struct float32x2x4_t {
-  float32x2_t val[4];
-} float32x2x4_t;
-
-typedef struct float32x4x4_t {
-  float32x4_t val[4];
-} float32x4x4_t;
-
-#ifdef __aarch64__
-typedef struct float64x1x4_t {
-  float64x1_t val[4];
-} float64x1x4_t;
-
-typedef struct float64x2x4_t {
-  float64x2_t val[4];
-} float64x2x4_t;
-
-#endif
 typedef struct poly8x8x4_t {
  poly8x8_t val[4];
 } poly8x8x4_t;
@ -424,33 +125,6 @@ typedef struct poly64x2x4_t {
  poly64x2_t val[4];
 } poly64x2x4_t;

-typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
-typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
-
-typedef struct bfloat16x4x2_t {
-  bfloat16x4_t val[2];
-} bfloat16x4x2_t;
-
-typedef struct bfloat16x8x2_t {
-  bfloat16x8_t val[2];
-} bfloat16x8x2_t;
-
-typedef struct bfloat16x4x3_t {
-  bfloat16x4_t val[3];
-} bfloat16x4x3_t;
-
-typedef struct bfloat16x8x3_t {
-  bfloat16x8_t val[3];
-} bfloat16x8x3_t;
-
-typedef struct bfloat16x4x4_t {
-  bfloat16x4_t val[4];
-} bfloat16x4x4_t;
-
-typedef struct bfloat16x8x4_t {
-  bfloat16x8_t val[4];
-} bfloat16x8x4_t;
-
 #define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

 #ifdef __LITTLE_ENDIAN__
@ -66600,6 +66274,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd32x_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd32xq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd32xq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd32x_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd32x_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #ifdef __LITTLE_ENDIAN__
 __ai __attribute__((target("v8.5a"))) float32x4_t vrnd32zq_f32(float32x4_t __p0) {
  float32x4_t __ret;
@ -66632,6 +66327,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd32z_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd32zq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd32zq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd32z_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd32z_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #ifdef __LITTLE_ENDIAN__
 __ai __attribute__((target("v8.5a"))) float32x4_t vrnd64xq_f32(float32x4_t __p0) {
  float32x4_t __ret;
@ -66664,6 +66380,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd64x_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd64xq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd64xq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd64x_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd64x_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #ifdef __LITTLE_ENDIAN__
 __ai __attribute__((target("v8.5a"))) float32x4_t vrnd64zq_f32(float32x4_t __p0) {
  float32x4_t __ret;
@ -66696,6 +66433,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd64z_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd64zq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd64zq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd64z_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd64z_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #endif
 #if defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
 #ifdef __LITTLE_ENDIAN__
--- a/lib/include/arm_sme.h
+++ b/lib/include/arm_sme.h
--- a/lib/include/arm_sme_draft_spec_subject_to_change.h
+++ b/lib/include/arm_sme_draft_spec_subject_to_change.h
@ -1,642 +0,0 @@
-/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics ------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_SME_H
-#define __ARM_SME_H
-
-#if !defined(__LITTLE_ENDIAN__)
-#error "Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h"
-#endif
-#include <arm_sve.h> 
-
-/* Function attributes */
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-#define __aio static __inline__ __attribute__((__always_inline__, __nodebug__, __overloadable__))
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsb), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsb(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsd), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsd(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsh), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsh(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsw), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsw(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za128), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za16), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za32), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za64), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za8), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za128), arm_streaming, arm_shared_za))
-void svld1_hor_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za16), arm_streaming, arm_shared_za))
-void svld1_hor_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za32), arm_streaming, arm_shared_za))
-void svld1_hor_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za64), arm_streaming, arm_shared_za))
-void svld1_hor_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za8), arm_streaming, arm_shared_za))
-void svld1_hor_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za128), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za16), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za32), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za64), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za8), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za128), arm_streaming, arm_shared_za))
-void svld1_ver_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za16), arm_streaming, arm_shared_za))
-void svld1_ver_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za32), arm_streaming, arm_shared_za))
-void svld1_ver_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za64), arm_streaming, arm_shared_za))
-void svld1_ver_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za8), arm_streaming, arm_shared_za))
-void svld1_ver_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m), arm_streaming, arm_shared_za))
-void svmopa_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m), arm_streaming, arm_shared_za))
-void svmops_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmops_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m), arm_streaming, arm_shared_za))
-void svmops_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m), arm_streaming, arm_shared_za))
-void svmops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_mask_za), arm_streaming_compatible, arm_shared_za))
-void svzero_mask_za(uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za), arm_streaming_compatible, arm_shared_za))
-void svzero_za();
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za128_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za128_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za128_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za128_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za128_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za128_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za128_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za128_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m), arm_streaming, arm_shared_za))
-void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m), arm_streaming, arm_shared_za))
-void svmops_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m), arm_streaming, arm_shared_za))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m), arm_streaming, arm_shared_za))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m), arm_streaming, arm_shared_za))
-void svmops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m), arm_streaming, arm_shared_za))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svldr_vnum_za), arm_streaming_compatible, arm_shared_za))
-void svldr_vnum_za(uint32_t, uint64_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svstr_vnum_za), arm_streaming_compatible, arm_shared_za, arm_preserves_za))
-void svstr_vnum_za(uint32_t, uint64_t, void *);
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#undef __ai
-
-#endif /* __ARM_SME_H */
--- a/lib/include/arm_sve.h
+++ b/lib/include/arm_sve.h
--- a/lib/include/arm_vector_types.h
+++ b/lib/include/arm_vector_types.h
@ -0,0 +1,345 @@
+/*===---- arm_vector_types - ARM vector type ------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
+#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
+
+#endif
+#ifndef __ARM_NEON_TYPES_H
+#define __ARM_NEON_TYPES_H
+typedef float float32_t;
+typedef __fp16 float16_t;
+#ifdef __aarch64__
+typedef double float64_t;
+#endif
+
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
+typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
+typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
+typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
+typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
+typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
+typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
+typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
+typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
+#ifdef __aarch64__
+typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
+typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
+#endif
+
+typedef struct int8x8x2_t {
+  int8x8_t val[2];
+} int8x8x2_t;
+
+typedef struct int8x16x2_t {
+  int8x16_t val[2];
+} int8x16x2_t;
+
+typedef struct int16x4x2_t {
+  int16x4_t val[2];
+} int16x4x2_t;
+
+typedef struct int16x8x2_t {
+  int16x8_t val[2];
+} int16x8x2_t;
+
+typedef struct int32x2x2_t {
+  int32x2_t val[2];
+} int32x2x2_t;
+
+typedef struct int32x4x2_t {
+  int32x4_t val[2];
+} int32x4x2_t;
+
+typedef struct int64x1x2_t {
+  int64x1_t val[2];
+} int64x1x2_t;
+
+typedef struct int64x2x2_t {
+  int64x2_t val[2];
+} int64x2x2_t;
+
+typedef struct uint8x8x2_t {
+  uint8x8_t val[2];
+} uint8x8x2_t;
+
+typedef struct uint8x16x2_t {
+  uint8x16_t val[2];
+} uint8x16x2_t;
+
+typedef struct uint16x4x2_t {
+  uint16x4_t val[2];
+} uint16x4x2_t;
+
+typedef struct uint16x8x2_t {
+  uint16x8_t val[2];
+} uint16x8x2_t;
+
+typedef struct uint32x2x2_t {
+  uint32x2_t val[2];
+} uint32x2x2_t;
+
+typedef struct uint32x4x2_t {
+  uint32x4_t val[2];
+} uint32x4x2_t;
+
+typedef struct uint64x1x2_t {
+  uint64x1_t val[2];
+} uint64x1x2_t;
+
+typedef struct uint64x2x2_t {
+  uint64x2_t val[2];
+} uint64x2x2_t;
+
+typedef struct float16x4x2_t {
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t {
+  float16x8_t val[2];
+} float16x8x2_t;
+
+typedef struct float32x2x2_t {
+  float32x2_t val[2];
+} float32x2x2_t;
+
+typedef struct float32x4x2_t {
+  float32x4_t val[2];
+} float32x4x2_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x2_t {
+  float64x1_t val[2];
+} float64x1x2_t;
+
+typedef struct float64x2x2_t {
+  float64x2_t val[2];
+} float64x2x2_t;
+
+#endif
+typedef struct int8x8x3_t {
+  int8x8_t val[3];
+} int8x8x3_t;
+
+typedef struct int8x16x3_t {
+  int8x16_t val[3];
+} int8x16x3_t;
+
+typedef struct int16x4x3_t {
+  int16x4_t val[3];
+} int16x4x3_t;
+
+typedef struct int16x8x3_t {
+  int16x8_t val[3];
+} int16x8x3_t;
+
+typedef struct int32x2x3_t {
+  int32x2_t val[3];
+} int32x2x3_t;
+
+typedef struct int32x4x3_t {
+  int32x4_t val[3];
+} int32x4x3_t;
+
+typedef struct int64x1x3_t {
+  int64x1_t val[3];
+} int64x1x3_t;
+
+typedef struct int64x2x3_t {
+  int64x2_t val[3];
+} int64x2x3_t;
+
+typedef struct uint8x8x3_t {
+  uint8x8_t val[3];
+} uint8x8x3_t;
+
+typedef struct uint8x16x3_t {
+  uint8x16_t val[3];
+} uint8x16x3_t;
+
+typedef struct uint16x4x3_t {
+  uint16x4_t val[3];
+} uint16x4x3_t;
+
+typedef struct uint16x8x3_t {
+  uint16x8_t val[3];
+} uint16x8x3_t;
+
+typedef struct uint32x2x3_t {
+  uint32x2_t val[3];
+} uint32x2x3_t;
+
+typedef struct uint32x4x3_t {
+  uint32x4_t val[3];
+} uint32x4x3_t;
+
+typedef struct uint64x1x3_t {
+  uint64x1_t val[3];
+} uint64x1x3_t;
+
+typedef struct uint64x2x3_t {
+  uint64x2_t val[3];
+} uint64x2x3_t;
+
+typedef struct float16x4x3_t {
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t {
+  float16x8_t val[3];
+} float16x8x3_t;
+
+typedef struct float32x2x3_t {
+  float32x2_t val[3];
+} float32x2x3_t;
+
+typedef struct float32x4x3_t {
+  float32x4_t val[3];
+} float32x4x3_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x3_t {
+  float64x1_t val[3];
+} float64x1x3_t;
+
+typedef struct float64x2x3_t {
+  float64x2_t val[3];
+} float64x2x3_t;
+
+#endif
+typedef struct int8x8x4_t {
+  int8x8_t val[4];
+} int8x8x4_t;
+
+typedef struct int8x16x4_t {
+  int8x16_t val[4];
+} int8x16x4_t;
+
+typedef struct int16x4x4_t {
+  int16x4_t val[4];
+} int16x4x4_t;
+
+typedef struct int16x8x4_t {
+  int16x8_t val[4];
+} int16x8x4_t;
+
+typedef struct int32x2x4_t {
+  int32x2_t val[4];
+} int32x2x4_t;
+
+typedef struct int32x4x4_t {
+  int32x4_t val[4];
+} int32x4x4_t;
+
+typedef struct int64x1x4_t {
+  int64x1_t val[4];
+} int64x1x4_t;
+
+typedef struct int64x2x4_t {
+  int64x2_t val[4];
+} int64x2x4_t;
+
+typedef struct uint8x8x4_t {
+  uint8x8_t val[4];
+} uint8x8x4_t;
+
+typedef struct uint8x16x4_t {
+  uint8x16_t val[4];
+} uint8x16x4_t;
+
+typedef struct uint16x4x4_t {
+  uint16x4_t val[4];
+} uint16x4x4_t;
+
+typedef struct uint16x8x4_t {
+  uint16x8_t val[4];
+} uint16x8x4_t;
+
+typedef struct uint32x2x4_t {
+  uint32x2_t val[4];
+} uint32x2x4_t;
+
+typedef struct uint32x4x4_t {
+  uint32x4_t val[4];
+} uint32x4x4_t;
+
+typedef struct uint64x1x4_t {
+  uint64x1_t val[4];
+} uint64x1x4_t;
+
+typedef struct uint64x2x4_t {
+  uint64x2_t val[4];
+} uint64x2x4_t;
+
+typedef struct float16x4x4_t {
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t {
+  float16x8_t val[4];
+} float16x8x4_t;
+
+typedef struct float32x2x4_t {
+  float32x2_t val[4];
+} float32x2x4_t;
+
+typedef struct float32x4x4_t {
+  float32x4_t val[4];
+} float32x4x4_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x4_t {
+  float64x1_t val[4];
+} float64x1x4_t;
+
+typedef struct float64x2x4_t {
+  float64x2_t val[4];
+} float64x2x4_t;
+
+#endif
+typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
+typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
+
+typedef struct bfloat16x4x2_t {
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;
+
+typedef struct bfloat16x8x2_t {
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
+typedef struct bfloat16x4x3_t {
+  bfloat16x4_t val[3];
+} bfloat16x4x3_t;
+
+typedef struct bfloat16x8x3_t {
+  bfloat16x8_t val[3];
+} bfloat16x8x3_t;
+
+typedef struct bfloat16x4x4_t {
+  bfloat16x4_t val[4];
+} bfloat16x4x4_t;
+
+typedef struct bfloat16x8x4_t {
+  bfloat16x8_t val[4];
+} bfloat16x8x4_t;
+
+#endif // __ARM_NEON_TYPES_H
--- a/lib/include/avx2intrin.h
+++ b/lib/include/avx2intrin.h
@ -15,8 +15,12 @@
 #define __AVX2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx2,no-evex512"), __min_vector_width__(128)))

 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
@ -1307,6 +1311,23 @@ _mm256_min_epu32(__m256i __a, __m256i __b)
  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
 }

+/// Creates a 32-bit integer mask from the most significant bit of each byte
+///    in the 256-bit integer vector in \a __a and returns the result.
+///
+/// \code{.operation}
+/// FOR i := 0 TO 31
+///   j := i*8
+///   result[i] := __a[j+7]
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
+///
+/// \param __a
+///    A 256-bit integer vector containing the source bytes.
+/// \returns The 32-bit integer mask.
 static __inline__ int __DEFAULT_FN_ATTRS256
 _mm256_movemask_epi8(__m256i __a)
 {
@ -2962,7 +2983,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
 ///    A pointer to the 32-byte aligned memory containing the vector to load.
 /// \returns A 256-bit integer vector loaded from memory.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(__m256i const *__V)
+_mm256_stream_load_si256(const void *__V)
 {
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
--- a/lib/include/avx512bf16intrin.h
+++ b/lib/include/avx512bf16intrin.h
@ -20,10 +20,11 @@ typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));

 #define __DEFAULT_FN_ATTRS512 \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
                 __min_vector_width__(512)))
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bf16,no-evex512")))

 /// Convert One BF16 Data to One Single Float Data.
 ///
--- a/lib/include/avx512bitalgintrin.h
+++ b/lib/include/avx512bitalgintrin.h
@ -15,7 +15,10 @@
 #define __AVX512BITALGINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bitalg,evex512"),                           \
+                 __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_popcnt_epi16(__m512i __A)
--- a/lib/include/avx512bwintrin.h
+++ b/lib/include/avx512bwintrin.h
@ -18,8 +18,12 @@ typedef unsigned int __mmask32;
 typedef unsigned long long __mmask64;

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+#define __DEFAULT_FN_ATTRS512                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,evex512"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,no-evex512")))

 static __inline __mmask32 __DEFAULT_FN_ATTRS
 _knot_mask32(__mmask32 __M)
@ -27,9 +31,7 @@ _knot_mask32(__mmask32 __M)
  return __builtin_ia32_knotsi(__M);
 }

-static __inline __mmask64 __DEFAULT_FN_ATTRS
-_knot_mask64(__mmask64 __M)
-{
+static __inline __mmask64 __DEFAULT_FN_ATTRS _knot_mask64(__mmask64 __M) {
  return __builtin_ia32_knotdi(__M);
 }

@ -39,9 +41,8 @@ _kand_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kand_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kand_mask64(__mmask64 __A,
+                                                            __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B);
 }

@ -51,9 +52,8 @@ _kandn_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kandn_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kandn_mask64(__mmask64 __A,
+                                                             __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B);
 }

@ -63,9 +63,8 @@ _kor_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kor_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kor_mask64(__mmask64 __A,
+                                                           __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B);
 }

@ -75,9 +74,8 @@ _kxnor_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kxnor_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxnor_mask64(__mmask64 __A,
+                                                             __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B);
 }

@ -87,9 +85,8 @@ _kxor_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kxor_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxor_mask64(__mmask64 __A,
+                                                            __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
 }

@ -112,14 +109,12 @@ _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
 }

@ -148,14 +143,12 @@ _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
 }

@ -171,9 +164,8 @@ _kadd_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kadd_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kadd_mask64(__mmask64 __A,
+                                                            __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B);
 }

@ -214,8 +206,7 @@ _load_mask32(__mmask32 *__A) {
  return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_load_mask64(__mmask64 *__A) {
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _load_mask64(__mmask64 *__A) {
  return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A);
 }

@ -224,8 +215,8 @@ _store_mask32(__mmask32 *__A, __mmask32 __B) {
  *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B);
 }

-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask64(__mmask64 *__A, __mmask64 __B) {
+static __inline__ void __DEFAULT_FN_ATTRS _store_mask64(__mmask64 *__A,
+                                                        __mmask64 __B) {
  *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B);
 }

@ -1714,9 +1705,8 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
                                              (__v64qi) _mm512_setzero_si512());
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd(__mmask64 __A,
+                                                               __mmask64 __B) {
  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
                (__mmask64) __B);
 }
--- a/lib/include/avx512cdintrin.h
+++ b/lib/include/avx512cdintrin.h
@ -15,7 +15,9 @@
 #define __AVX512CDINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512cd,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_conflict_epi64 (__m512i __A)
--- a/lib/include/avx512dqintrin.h
+++ b/lib/include/avx512dqintrin.h
@ -15,8 +15,10 @@
 #define __AVX512DQINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq,evex512"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512dq,no-evex512")))

 static __inline __mmask8 __DEFAULT_FN_ATTRS
 _knot_mask8(__mmask8 __M)
--- a/lib/include/avx512fintrin.h
+++ b/lib/include/avx512fintrin.h
@ -167,9 +167,13 @@ typedef enum
 } _MM_MANTISSA_SIGN_ENUM;

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f,evex512"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,no-evex512"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,no-evex512")))

 /* Create vectors with repeated elements */

--- a/lib/include/avx512fp16intrin.h
+++ b/lib/include/avx512fp16intrin.h
@ -22,13 +22,15 @@ typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS512                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
-                 __min_vector_width__(512)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
 #define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(128)))

 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
--- a/lib/include/avx512ifmaintrin.h
+++ b/lib/include/avx512ifmaintrin.h
@ -15,7 +15,9 @@
 #define __IFMAINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512ifma,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
--- a/lib/include/avx512ifmavlintrin.h
+++ b/lib/include/avx512ifmavlintrin.h
@ -15,8 +15,14 @@
 #define __IFMAVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512ifma,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512ifma,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(256)))

 #define _mm_madd52hi_epu64(X, Y, Z)                                            \
  ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
--- a/lib/include/avx512pfintrin.h
+++ b/lib/include/avx512pfintrin.h
@ -14,9 +14,6 @@
 #ifndef __AVX512PFINTRIN_H
 #define __AVX512PFINTRIN_H

-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
-
 #define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                             (void const *)(addr), (int)(scale), \
@ -92,6 +89,4 @@
  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
                              (void *)(addr), (int)(scale), (int)(hint))

-#undef __DEFAULT_FN_ATTRS
-
 #endif
--- a/lib/include/avx512vbmi2intrin.h
+++ b/lib/include/avx512vbmi2intrin.h
@ -15,7 +15,7 @@
 #define __AVX512VBMI2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))


 static __inline__ __m512i __DEFAULT_FN_ATTRS
--- a/lib/include/avx512vbmiintrin.h
+++ b/lib/include/avx512vbmiintrin.h
@ -15,8 +15,9 @@
 #define __VBMIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
-
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vbmi,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
--- a/lib/include/avx512vbmivlintrin.h
+++ b/lib/include/avx512vbmivlintrin.h
@ -15,9 +15,14 @@
 #define __VBMIVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
-
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vbmi,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vbmi,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
--- a/lib/include/avx512vlbf16intrin.h
+++ b/lib/include/avx512vlbf16intrin.h
@ -15,12 +15,14 @@
 #ifndef __AVX512VLBF16INTRIN_H
 #define __AVX512VLBF16INTRIN_H

-#define __DEFAULT_FN_ATTRS128 \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 \
-  __attribute__((__always_inline__, __nodebug__, \
-                 __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bf16,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bf16,no-evex512"),                 \
+                 __min_vector_width__(256)))

 /// Convert Two Packed Single Data to One Packed BF16 Data.
 ///
--- a/lib/include/avx512vlbitalgintrin.h
+++ b/lib/include/avx512vlbitalgintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLBITALGINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bitalg,no-evex512"),               \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bitalg,no-evex512"),               \
+                 __min_vector_width__(256)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_popcnt_epi16(__m256i __A)
--- a/lib/include/avx512vlbwintrin.h
+++ b/lib/include/avx512vlbwintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLBWINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bw,no-evex512"),                   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bw,no-evex512"),                   \
+                 __min_vector_width__(256)))

 /* Integer compare */

--- a/lib/include/avx512vlcdintrin.h
+++ b/lib/include/avx512vlcdintrin.h
@ -14,9 +14,14 @@
 #define __AVX512VLCDINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
-
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512cd,no-evex512"),                   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512cd,no-evex512"),                   \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmb_epi64 (__mmask8 __A)
--- a/lib/include/avx512vldqintrin.h
+++ b/lib/include/avx512vldqintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLDQINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512dq,no-evex512"),                   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512dq,no-evex512"),                   \
+                 __min_vector_width__(256)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
--- a/lib/include/avx512vlfp16intrin.h
+++ b/lib/include/avx512vlfp16intrin.h
@ -19,11 +19,11 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16, avx512vl"),                           \
+                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16, avx512vl"),                           \
+                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(128)))

 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
--- a/lib/include/avx512vlintrin.h
+++ b/lib/include/avx512vlintrin.h
@ -14,8 +14,14 @@
 #ifndef __AVX512VLINTRIN_H
 #define __AVX512VLINTRIN_H

-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,no-evex512"),                            \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,no-evex512"),                            \
+                 __min_vector_width__(256)))

 typedef short __v2hi __attribute__((__vector_size__(4)));
 typedef char __v4qi __attribute__((__vector_size__(4)));
--- a/lib/include/avx512vlvbmi2intrin.h
+++ b/lib/include/avx512vlvbmi2intrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLVBMI2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vbmi2,no-evex512"),                \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vbmi2,no-evex512"),                \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
--- a/lib/include/avx512vlvnniintrin.h
+++ b/lib/include/avx512vlvnniintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLVNNIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
+                 __min_vector_width__(256)))

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
--- a/lib/include/avx512vlvp2intersectintrin.h
+++ b/lib/include/avx512vlvp2intersectintrin.h
@ -28,12 +28,14 @@
 #ifndef _AVX512VLVP2INTERSECT_H
 #define _AVX512VLVP2INTERSECT_H

-#define __DEFAULT_FN_ATTRS128 \
-  __attribute__((__always_inline__, __nodebug__,  __target__("avx512vl,avx512vp2intersect"), \
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vp2intersect,no-evex512"),         \
                 __min_vector_width__(128)))

-#define __DEFAULT_FN_ATTRS256 \
-  __attribute__((__always_inline__, __nodebug__,  __target__("avx512vl,avx512vp2intersect"), \
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vp2intersect,no-evex512"),         \
                 __min_vector_width__(256)))
 /// Store, in an even/odd pair of mask registers, the indicators of the
 /// locations of value matches between dwords in operands __a and __b.
--- a/lib/include/avx512vnniintrin.h
+++ b/lib/include/avx512vnniintrin.h
@ -15,8 +15,9 @@
 #define __AVX512VNNIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
-
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vnni,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
--- a/lib/include/avx512vp2intersectintrin.h
+++ b/lib/include/avx512vp2intersectintrin.h
@ -28,8 +28,9 @@
 #ifndef _AVX512VP2INTERSECT_H
 #define _AVX512VP2INTERSECT_H

-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__,  __target__("avx512vp2intersect"), \
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vp2intersect,evex512"),                     \
                 __min_vector_width__(512)))

 /// Store, in an even/odd pair of mask registers, the indicators of the
--- a/lib/include/avx512vpopcntdqintrin.h
+++ b/lib/include/avx512vpopcntdqintrin.h
@ -17,7 +17,9 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vpopcntdq,evex512"),                        \
+                 __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
--- a/lib/include/avx512vpopcntdqvlintrin.h
+++ b/lib/include/avx512vpopcntdqvlintrin.h
@ -17,9 +17,13 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vpopcntdq,avx512vl,no-evex512"),            \
+                 __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vpopcntdq,avx512vl,no-evex512"),            \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_popcnt_epi64(__m128i __A) {
--- a/lib/include/avxintrin.h
+++ b/lib/include/avxintrin.h
@ -50,8 +50,12 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
 #endif

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
+                 __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
+                 __min_vector_width__(128)))

 /* Arithmetic */
 /// Adds two 256-bit vectors of [4 x double].
@ -3563,7 +3567,7 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 /// \param __b
 ///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(__m256i *__a, __m256i __b)
+_mm256_stream_si256(void *__a, __m256i __b)
 {
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
@ -3583,7 +3587,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(double *__a, __m256d __b)
+_mm256_stream_pd(void *__a, __m256d __b)
 {
  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
@ -3604,7 +3608,7 @@ _mm256_stream_pd(double *__a, __m256d __b)
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(float *__p, __m256 __a)
+_mm256_stream_ps(void *__p, __m256 __a)
 {
  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
--- a/lib/include/bmiintrin.h
+++ b/lib/include/bmiintrin.h
@ -19,18 +19,17 @@
   to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

-#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
-
 /// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 16-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 16-bit integer containing the number of trailing zero
 ///    bits in the operand.
+/// \see _tzcnt_u16
 static __inline__ unsigned short __RELAXED_FN_ATTRS
 __tzcnt_u16(unsigned short __X)
 {
@ -41,13 +40,30 @@ __tzcnt_u16(unsigned short __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// \code
+/// unsigned short _tzcnt_u16(unsigned short __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+/// \see __tzcnt_u16
+#define _tzcnt_u16 __tzcnt_u16
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 32-bit integer containing the number of trailing zero
 ///    bits in the operand.
-/// \see _mm_tzcnt_32
+/// \see { _mm_tzcnt_32 _tzcnt_u32 }
 static __inline__ unsigned int __RELAXED_FN_ATTRS
 __tzcnt_u32(unsigned int __X)
 {
@ -58,20 +74,35 @@ __tzcnt_u32(unsigned int __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An 32-bit integer containing the number of trailing zero bits in
+/// \returns A 32-bit integer containing the number of trailing zero bits in
 ///    the operand.
-/// \see __tzcnt_u32
+/// \see { __tzcnt_u32 _tzcnt_u32 }
 static __inline__ int __RELAXED_FN_ATTRS
 _mm_tzcnt_32(unsigned int __X)
 {
  return (int)__builtin_ia32_tzcnt_u32(__X);
 }

-#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _tzcnt_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+/// \see { _mm_tzcnt_32 __tzcnt_u32 }
+#define _tzcnt_u32 __tzcnt_u32

 #ifdef __x86_64__

@ -79,13 +110,13 @@ _mm_tzcnt_32(unsigned int __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 64-bit integer containing the number of trailing zero
 ///    bits in the operand.
-/// \see _mm_tzcnt_64
+/// \see { _mm_tzcnt_64 _tzcnt_u64 }
 static __inline__ unsigned long long __RELAXED_FN_ATTRS
 __tzcnt_u64(unsigned long long __X)
 {
@ -96,20 +127,35 @@ __tzcnt_u64(unsigned long long __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
 /// \returns An 64-bit integer containing the number of trailing zero bits in
 ///    the operand.
-/// \see __tzcnt_u64
+/// \see { __tzcnt_u64 _tzcnt_u64 }
 static __inline__ long long __RELAXED_FN_ATTRS
 _mm_tzcnt_64(unsigned long long __X)
 {
  return (long long)__builtin_ia32_tzcnt_u64(__X);
 }

-#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _tzcnt_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+/// \see { _mm_tzcnt_64 __tzcnt_u64
+#define _tzcnt_u64 __tzcnt_u64

 #endif /* __x86_64__ */

@ -121,21 +167,12 @@ _mm_tzcnt_64(unsigned long long __X)
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))

-#define _andn_u32(a, b)   (__andn_u32((a), (b)))
-
-/* _bextr_u32 != __bextr_u32 */
-#define _blsi_u32(a)      (__blsi_u32((a)))
-
-#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
-
-#define _blsr_u32(a)      (__blsr_u32((a)))
-
 /// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+/// This intrinsic corresponds to the \c ANDN instruction.
 ///
 /// \param __X
 ///    An unsigned integer containing one of the operands.
@ -143,19 +180,40 @@ _mm_tzcnt_64(unsigned long long __X)
 ///    An unsigned integer containing one of the operands.
 /// \returns An unsigned integer containing the bitwise AND of the second
 ///    operand with the one's complement of the first operand.
+/// \see _andn_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __andn_u32(unsigned int __X, unsigned int __Y)
 {
  return ~__X & __Y;
 }

+/// Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _andn_u32(unsigned int __X, unsigned int __Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+/// \see __andn_u32
+#define _andn_u32 __andn_u32
+
 /* AMD-specified, double-leading-underscore version of BEXTR */
 /// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@ -178,7 +236,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@ -203,7 +261,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@ -224,33 +282,89 @@ _bextr2_u32(unsigned int __X, unsigned int __Y) {
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+/// This intrinsic corresponds to the \c BLSI instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be cleared.
 /// \returns An unsigned integer containing the result of clearing the bits from
 ///    the source operand.
+/// \see _blsi_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsi_u32(unsigned int __X)
 {
  return __X & -__X;
 }

+/// Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsi_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+/// \see __blsi_u32
+#define _blsi_u32 __blsi_u32
+
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+/// \see _blsmsk_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
 /// Creates a mask whose bits are set to 1, using bit 0 up to and
 ///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+/// \code
+/// unsigned int _blsmsk_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
 ///
 /// \param __X
 ///    An unsigned integer used to create the mask.
 /// \returns An unsigned integer containing the newly created mask.
+/// \see __blsmsk_u32
+#define _blsmsk_u32 __blsmsk_u32
+
+/// Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+/// \see _blsr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsmsk_u32(unsigned int __X)
+__blsr_u32(unsigned int __X)
 {
-  return __X ^ (__X - 1);
+  return __X & (__X - 1);
 }

 /// Clears the least significant bit that is set to 1 in the source
@ -258,35 +372,27 @@ __blsmsk_u32(unsigned int __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+/// \code
+/// unsigned int _bls4_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
 ///
 /// \param __X
 ///    An unsigned integer containing the operand to be cleared.
 /// \returns An unsigned integer containing the result of clearing the source
 ///    operand.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsr_u32(unsigned int __X)
-{
-  return __X & (__X - 1);
-}
+/// \see __blsr_u32
+#define _blsr_u32 __blsr_u32

 #ifdef __x86_64__

-#define _andn_u64(a, b)   (__andn_u64((a), (b)))
-
-/* _bextr_u64 != __bextr_u64 */
-#define _blsi_u64(a)      (__blsi_u64((a)))
-
-#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
-
-#define _blsr_u64(a)      (__blsr_u64((a)))
-
 /// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+/// This intrinsic corresponds to the \c ANDN instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer containing one of the operands.
@ -294,19 +400,41 @@ __blsr_u32(unsigned int __X)
 ///    An unsigned 64-bit integer containing one of the operands.
 /// \returns An unsigned 64-bit integer containing the bitwise AND of the second
 ///    operand with the one's complement of the first operand.
+/// \see _andn_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
  return ~__X & __Y;
 }

+/// Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _andn_u64(unsigned long long __X,
+///                              unsigned long long __Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+/// \see __andn_u64
+#define _andn_u64 __andn_u64
+
 /* AMD-specified, double-leading-underscore version of BEXTR */
 /// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@ -329,7 +457,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@ -354,7 +482,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@ -375,33 +503,89 @@ _bextr2_u64(unsigned long long __X, unsigned long long __Y) {
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+/// This intrinsic corresponds to the \c BLSI instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be cleared.
 /// \returns An unsigned 64-bit integer containing the result of clearing the
 ///    bits from the source operand.
+/// \see _blsi_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsi_u64(unsigned long long __X)
 {
  return __X & -__X;
 }

+/// Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsi_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+/// \see __blsi_u64
+#define _blsi_u64 __blsi_u64
+
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
+/// \see _blsmsk_u64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
 /// Creates a mask whose bits are set to 1, using bit 0 up to and
 ///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+/// \code
+/// unsigned long long _blsmsk_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer used to create the mask.
 /// \returns An unsigned 64-bit integer containing the newly created mask.
+/// \see __blsmsk_u64
+#define _blsmsk_u64 __blsmsk_u64
+
+/// Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+/// \see _blsr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsmsk_u64(unsigned long long __X)
+__blsr_u64(unsigned long long __X)
 {
-  return __X ^ (__X - 1);
+  return __X & (__X - 1);
 }

 /// Clears the least significant bit that is set to 1 in the source
@ -409,17 +593,18 @@ __blsmsk_u64(unsigned long long __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+/// \code
+/// unsigned long long _blsr_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer containing the operand to be cleared.
 /// \returns An unsigned 64-bit integer containing the result of clearing the
 ///    source operand.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsr_u64(unsigned long long __X)
-{
-  return __X & (__X - 1);
-}
+/// \see __blsr_u64
+#define _blsr_u64 __blsr_u64

 #endif /* __x86_64__ */

--- a/lib/include/cuda_wrappers/bits/basic_string.h
+++ b/lib/include/cuda_wrappers/bits/basic_string.h
@ -0,0 +1,9 @@
+// CUDA headers define __noinline__ which interferes with libstdc++'s use of
+// `__attribute((__noinline__))`. In order to avoid compilation error,
+// temporarily unset __noinline__ when we include affected libstdc++ header.
+
+#pragma push_macro("__noinline__")
+#undef __noinline__
+#include_next "bits/basic_string.h"
+
+#pragma pop_macro("__noinline__")
--- a/lib/include/cuda_wrappers/bits/basic_string.tcc
+++ b/lib/include/cuda_wrappers/bits/basic_string.tcc
@ -0,0 +1,9 @@
+// CUDA headers define __noinline__ which interferes with libstdc++'s use of
+// `__attribute((__noinline__))`. In order to avoid compilation error,
+// temporarily unset __noinline__ when we include affected libstdc++ header.
+
+#pragma push_macro("__noinline__")
+#undef __noinline__
+#include_next "bits/basic_string.tcc"
+
+#pragma pop_macro("__noinline__")
--- a/lib/include/emmintrin.h
+++ b/lib/include/emmintrin.h
@ -50,11 +50,11 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
-                 __min_vector_width__(128)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
-                 __min_vector_width__(64)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))

 /// Adds lower double-precision values in both operands and returns the
 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@ -3945,7 +3945,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
                                                        __m128d __a) {
  __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
 }
@ -3963,7 +3963,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
                                                           __m128i __a) {
  __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
 }
@ -3983,8 +3983,8 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
 ///    A 32-bit integer containing the value to be stored.
 static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si32(int *__p, int __a) {
-  __builtin_ia32_movnti(__p, __a);
+    _mm_stream_si32(void *__p, int __a) {
+  __builtin_ia32_movnti((int *)__p, __a);
 }

 #ifdef __x86_64__
@ -4003,8 +4003,8 @@ static __inline__ void
 ///    A 64-bit integer containing the value to be stored.
 static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si64(long long *__p, long long __a) {
-  __builtin_ia32_movnti64(__p, __a);
+    _mm_stream_si64(void *__p, long long __a) {
+  __builtin_ia32_movnti64((long long *)__p, __a);
 }
 #endif

--- a/lib/include/gfniintrin.h
+++ b/lib/include/gfniintrin.h
@ -15,19 +15,36 @@
 #define __GFNIINTRIN_H

 /* Default attributes for simple form (no masking). */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("gfni,no-evex512"), __min_vector_width__(128)))

 /* Default attributes for YMM unmasked form. */
-#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS_Y                                                   \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx,gfni,no-evex512"),                            \
+                 __min_vector_width__(256)))

 /* Default attributes for ZMM unmasked forms. */
-#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS_Z                                                   \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,evex512,gfni"),                           \
+                 __min_vector_width__(512)))
 /* Default attributes for ZMM masked forms. */
-#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS_Z_MASK                                              \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,evex512,gfni"),                          \
+                 __min_vector_width__(512)))

 /* Default attributes for VLX masked forms. */
-#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS_VL128                                               \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,avx512vl,gfni,no-evex512"),              \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_VL256                                               \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,avx512vl,gfni,no-evex512"),              \
+                 __min_vector_width__(256)))

 #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
--- a/lib/include/ia32intrin.h
+++ b/lib/include/ia32intrin.h
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@ -291,11 +291,13 @@

 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__RDPID__)
-/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
+/// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> RDPID </c> instruction.
+///
+/// \returns The 32-bit contents of the MSR.
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
 _rdpid_u32(void) {
  return __builtin_ia32_rdpid();
@ -488,6 +490,15 @@ _writegsbase_u64(unsigned long long __V)
 * field inside of it.
 */

+/// Load a 16-bit value from memory and swap its bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the 16-bit value to load.
+/// \returns The byte-swapped value.
 static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _loadbe_i16(void const * __P) {
  struct __loadu_i16 {
@ -496,6 +507,16 @@ _loadbe_i16(void const * __P) {
  return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
 }

+/// Swap the bytes of a 16-bit value and store it to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the memory for storing the swapped value.
+/// \param __D
+///    The 16-bit value to be byte-swapped.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _storebe_i16(void * __P, short __D) {
  struct __storeu_i16 {
@ -504,6 +525,15 @@ _storebe_i16(void * __P, short __D) {
  ((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
 }

+/// Load a 32-bit value from memory and swap its bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the 32-bit value to load.
+/// \returns The byte-swapped value.
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _loadbe_i32(void const * __P) {
  struct __loadu_i32 {
@ -512,6 +542,16 @@ _loadbe_i32(void const * __P) {
  return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
 }

+/// Swap the bytes of a 32-bit value and store it to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the memory for storing the swapped value.
+/// \param __D
+///    The 32-bit value to be byte-swapped.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _storebe_i32(void * __P, int __D) {
  struct __storeu_i32 {
@ -521,6 +561,15 @@ _storebe_i32(void * __P, int __D) {
 }

 #ifdef __x86_64__
+/// Load a 64-bit value from memory and swap its bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the 64-bit value to load.
+/// \returns The byte-swapped value.
 static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _loadbe_i64(void const * __P) {
  struct __loadu_i64 {
@ -529,6 +578,16 @@ _loadbe_i64(void const * __P) {
  return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
 }

+/// Swap the bytes of a 64-bit value and store it to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the memory for storing the swapped value.
+/// \param __D
+///    The 64-bit value to be byte-swapped.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _storebe_i64(void * __P, long long __D) {
  struct __storeu_i64 {
@ -578,9 +637,13 @@ _storebe_i64(void * __P, long long __D) {
 #include <cetintrin.h>
 #endif

-/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
- * whereas others are also available at all times. */
+/* Intrinsics inside adcintrin.h are available at all times. */
+#include <adcintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__ADX__)
 #include <adxintrin.h>
+#endif

 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__RDSEED__)
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@ -572,6 +572,22 @@ unsigned char __readx18byte(unsigned long offset);
 unsigned short __readx18word(unsigned long offset);
 unsigned long __readx18dword(unsigned long offset);
 unsigned __int64 __readx18qword(unsigned long offset);
+
+double _CopyDoubleFromInt64(__int64);
+float _CopyFloatFromInt32(__int32);
+__int32 _CopyInt32FromFloat(float);
+__int64 _CopyInt64FromDouble(double);
+
+unsigned int _CountLeadingOnes(unsigned long);
+unsigned int _CountLeadingOnes64(unsigned __int64);
+unsigned int _CountLeadingSigns(long);
+unsigned int _CountLeadingSigns64(__int64);
+unsigned int _CountLeadingZeros(unsigned long);
+unsigned int _CountLeadingZeros64(unsigned _int64);
+unsigned int _CountOneBits(unsigned long);
+unsigned int _CountOneBits64(unsigned __int64);
+
+void __cdecl __prefetch(void *);
 #endif

 /*----------------------------------------------------------------------------*\
--- a/lib/include/larchintrin.h
+++ b/lib/include/larchintrin.h
@ -156,7 +156,7 @@ extern __inline unsigned char
  return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
 }

-extern __inline unsigned char
+extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_h(unsigned int _1) {
  return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
@ -228,6 +228,18 @@ extern __inline void
  ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
 #endif

+#define __frecipe_s(/*float*/ _1)                                              \
+  (float)__builtin_loongarch_frecipe_s((float)_1)
+
+#define __frecipe_d(/*double*/ _1)                                             \
+  (double)__builtin_loongarch_frecipe_d((double)_1)
+
+#define __frsqrte_s(/*float*/ _1)                                              \
+  (float)__builtin_loongarch_frsqrte_s((float)_1)
+
+#define __frsqrte_d(/*double*/ _1)                                             \
+  (double)__builtin_loongarch_frsqrte_d((double)_1)
+
 #ifdef __cplusplus
 }
 #endif
--- a/lib/include/lasxintrin.h
+++ b/lib/include/lasxintrin.h
--- a/lib/include/limits.h
+++ b/lib/include/limits.h
@ -66,10 +66,8 @@

 #define CHAR_BIT  __CHAR_BIT__

-/* C2x 5.2.4.2.1 */
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+/* C23 5.2.4.2.1 */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 #define BOOL_WIDTH   __BOOL_WIDTH__
 #define CHAR_WIDTH   CHAR_BIT
 #define SCHAR_WIDTH  CHAR_BIT
--- a/lib/include/llvm_libc_wrappers/assert.h
+++ b/lib/include/llvm_libc_wrappers/assert.h
@ -0,0 +1,34 @@
+//===-- Wrapper for C standard assert.h declarations on the GPU ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLANG_LLVM_LIBC_WRAPPERS_ASSERT_H__
+#define __CLANG_LLVM_LIBC_WRAPPERS_ASSERT_H__
+
+#if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__)
+#error "This file is for GPU offloading compilation only"
+#endif
+
+#include_next <assert.h>
+
+#if __has_include(<llvm-libc-decls/assert.h>)
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define __LIBC_ATTRS __attribute__((device))
+#endif
+
+#pragma omp begin declare target
+
+#include <llvm-libc-decls/assert.h>
+
+#pragma omp end declare target
+
+#undef __LIBC_ATTRS
+
+#endif
+
+#endif // __CLANG_LLVM_LIBC_WRAPPERS_ASSERT_H__
--- a/lib/include/llvm_libc_wrappers/ctype.h
+++ b/lib/include/llvm_libc_wrappers/ctype.h
@ -13,8 +13,19 @@
 #error "This file is for GPU offloading compilation only"
 #endif

+// The GNU headers like to define 'toupper' and 'tolower' redundantly. This is
+// necessary to prevent it from doing that and remapping our implementation.
+#if (defined(__NVPTX__) || defined(__AMDGPU__)) && defined(__GLIBC__)
+#pragma push_macro("__USE_EXTERN_INLINES")
+#undef __USE_EXTERN_INLINES
+#endif
+
 #include_next <ctype.h>

+#if (defined(__NVPTX__) || defined(__AMDGPU__)) && defined(__GLIBC__)
+#pragma pop_macro("__USE_EXTERN_INLINES")
+#endif
+
 #if __has_include(<llvm-libc-decls/ctype.h>)

 #if defined(__HIP__) || defined(__CUDA__)
@ -26,6 +37,7 @@

 #pragma push_macro("isalnum")
 #pragma push_macro("isalpha")
+#pragma push_macro("isascii")
 #pragma push_macro("isblank")
 #pragma push_macro("iscntrl")
 #pragma push_macro("isdigit")
@ -36,11 +48,13 @@
 #pragma push_macro("isspace")
 #pragma push_macro("isupper")
 #pragma push_macro("isxdigit")
+#pragma push_macro("toascii")
 #pragma push_macro("tolower")
 #pragma push_macro("toupper")

 #undef isalnum
 #undef isalpha
+#undef isascii
 #undef iscntrl
 #undef isdigit
 #undef islower
@ -51,6 +65,7 @@
 #undef isupper
 #undef isblank
 #undef isxdigit
+#undef toascii
 #undef tolower
 #undef toupper

@ -64,6 +79,7 @@
 #if !defined(__NVPTX__) && !defined(__AMDGPU__)
 #pragma pop_macro("isalnum")
 #pragma pop_macro("isalpha")
+#pragma pop_macro("isascii")
 #pragma pop_macro("isblank")
 #pragma pop_macro("iscntrl")
 #pragma pop_macro("isdigit")
@ -74,6 +90,7 @@
 #pragma pop_macro("isspace")
 #pragma pop_macro("isupper")
 #pragma pop_macro("isxdigit")
+#pragma pop_macro("toascii")
 #pragma pop_macro("tolower")
 #pragma pop_macro("toupper")
 #endif
--- a/lib/include/llvm_libc_wrappers/stdio.h
+++ b/lib/include/llvm_libc_wrappers/stdio.h
@ -6,21 +6,58 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
-#define __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
-
 #if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__)
 #error "This file is for GPU offloading compilation only"
 #endif

 #include_next <stdio.h>

+// In some old versions of glibc, other standard headers sometimes define
+// special macros (e.g., __need_FILE) before including stdio.h to cause stdio.h
+// to produce special definitions.  Future includes of stdio.h when those
+// special macros are undefined are expected to produce the normal definitions
+// from stdio.h.
+//
+// We do not apply our include guard (__CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__)
+// unconditionally to the above include_next.  Otherwise, after an occurrence of
+// the first glibc stdio.h use case described above, the include_next would be
+// skipped for remaining includes of stdio.h, leaving required symbols
+// undefined.
+//
+// We make the following assumptions to handle all use cases:
+//
+// 1. If the above include_next produces special glibc definitions, then (a) it
+//    does not produce the normal definitions that we must intercept below, (b)
+//    the current file was included from a glibc header that already defined
+//    __GLIBC__ (usually by including glibc's <features.h>), and (c) the above
+//    include_next does not define _STDIO_H.  In that case, we skip the rest of
+//    the current file and don't guard against future includes.
+// 2. If the above include_next produces the normal stdio.h definitions, then
+//    either (a) __GLIBC__ is not defined because C headers are from some other
+//    libc implementation or (b) the above include_next defines _STDIO_H to
+//    prevent the above include_next from having any effect in the future.
+#if !defined(__GLIBC__) || defined(_STDIO_H)
+
+#ifndef __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
+#define __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
+
 #if __has_include(<llvm-libc-decls/stdio.h>)

 #if defined(__HIP__) || defined(__CUDA__)
 #define __LIBC_ATTRS __attribute__((device))
 #endif

+// Some headers provide these as macros. Temporarily undefine them so they do
+// not conflict with any definitions for the GPU.
+
+#pragma push_macro("stdout")
+#pragma push_macro("stdin")
+#pragma push_macro("stderr")
+
+#undef stdout
+#undef stderr
+#undef stdin
+
 #pragma omp begin declare target

 #include <llvm-libc-decls/stdio.h>
@ -29,6 +66,15 @@

 #undef __LIBC_ATTRS

+// Restore the original macros when compiling on the host.
+#if !defined(__NVPTX__) && !defined(__AMDGPU__)
+#pragma pop_macro("stdout")
+#pragma pop_macro("stderr")
+#pragma pop_macro("stdin")
+#endif
+
 #endif

 #endif // __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
+
+#endif
--- a/lib/include/llvm_libc_wrappers/stdlib.h
+++ b/lib/include/llvm_libc_wrappers/stdlib.h
@ -23,8 +23,11 @@

 #pragma omp begin declare target

-// The LLVM C library uses this type so we forward declare it.
+// The LLVM C library uses these named types so we forward declare them.
 typedef void (*__atexithandler_t)(void);
+typedef int (*__bsearchcompare_t)(const void *, const void *);
+typedef int (*__qsortcompare_t)(const void *, const void *);
+typedef int (*__qsortrcompare_t)(const void *, const void *, void *);

 // Enforce ABI compatibility with the structs used by the LLVM C library.
 _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!");
--- a/lib/include/llvm_libc_wrappers/string.h
+++ b/lib/include/llvm_libc_wrappers/string.h
@ -13,9 +13,6 @@
 #error "This file is for GPU offloading compilation only"
 #endif

-// FIXME: The GNU headers provide C++ standard compliant headers when in C++
-// mode and the LLVM libc does not. We cannot enable memchr, strchr, strchrnul,
-// strpbrk, strrchr, strstr, or strcasestr until this is addressed.
 #include_next <string.h>

 #if __has_include(<llvm-libc-decls/string.h>)
@ -26,8 +23,70 @@

 #pragma omp begin declare target

+// The GNU headers provide C++ standard compliant headers when in C++ mode and
+// the LLVM libc does not. We need to manually provide the definitions using the
+// same prototypes.
+#if defined(__cplusplus) && defined(__GLIBC__) &&                              \
+    defined(__CORRECT_ISO_CPP_STRING_H_PROTO)
+
+#ifndef __LIBC_ATTRS
+#define __LIBC_ATTRS
+#endif
+
+extern "C" {
+void *memccpy(void *__restrict, const void *__restrict, int,
+              size_t) __LIBC_ATTRS;
+int memcmp(const void *, const void *, size_t) __LIBC_ATTRS;
+void *memcpy(void *__restrict, const void *__restrict, size_t) __LIBC_ATTRS;
+void *memmem(const void *, size_t, const void *, size_t) __LIBC_ATTRS;
+void *memmove(void *, const void *, size_t) __LIBC_ATTRS;
+void *mempcpy(void *__restrict, const void *__restrict, size_t) __LIBC_ATTRS;
+void *memset(void *, int, size_t) __LIBC_ATTRS;
+char *stpcpy(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+char *stpncpy(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+char *strcat(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+int strcmp(const char *, const char *) __LIBC_ATTRS;
+int strcoll(const char *, const char *) __LIBC_ATTRS;
+char *strcpy(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+size_t strcspn(const char *, const char *) __LIBC_ATTRS;
+char *strdup(const char *) __LIBC_ATTRS;
+size_t strlen(const char *) __LIBC_ATTRS;
+char *strncat(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+int strncmp(const char *, const char *, size_t) __LIBC_ATTRS;
+char *strncpy(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+char *strndup(const char *, size_t) __LIBC_ATTRS;
+size_t strnlen(const char *, size_t) __LIBC_ATTRS;
+size_t strspn(const char *, const char *) __LIBC_ATTRS;
+char *strtok(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+char *strtok_r(char *__restrict, const char *__restrict,
+               char **__restrict) __LIBC_ATTRS;
+size_t strxfrm(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+}
+
+extern "C++" {
+char *strstr(char *, const char *) noexcept __LIBC_ATTRS;
+const char *strstr(const char *, const char *) noexcept __LIBC_ATTRS;
+char *strpbrk(char *, const char *) noexcept __LIBC_ATTRS;
+const char *strpbrk(const char *, const char *) noexcept __LIBC_ATTRS;
+char *strrchr(char *, int) noexcept __LIBC_ATTRS;
+const char *strrchr(const char *, int) noexcept __LIBC_ATTRS;
+char *strchr(char *, int) noexcept __LIBC_ATTRS;
+const char *strchr(const char *, int) noexcept __LIBC_ATTRS;
+char *strchrnul(char *, int) noexcept __LIBC_ATTRS;
+const char *strchrnul(const char *, int) noexcept __LIBC_ATTRS;
+char *strcasestr(char *, const char *) noexcept __LIBC_ATTRS;
+const char *strcasestr(const char *, const char *) noexcept __LIBC_ATTRS;
+void *memrchr(void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+const void *memrchr(const void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+void *memchr(void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+const void *memchr(const void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+}
+
+#else
 #include <llvm-libc-decls/string.h>

+#endif
+
 #pragma omp end declare target

 #undef __LIBC_ATTRS
--- a/lib/include/llvm_libc_wrappers/time.h
+++ b/lib/include/llvm_libc_wrappers/time.h
@ -0,0 +1,34 @@
+//===-- Wrapper for C standard time.h declarations on the GPU -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__
+#define __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__
+
+#if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__)
+#error "This file is for GPU offloading compilation only"
+#endif
+
+#include_next <time.h>
+
+#if __has_include(<llvm-libc-decls/time.h>)
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define __LIBC_ATTRS __attribute__((device))
+#endif
+
+#pragma omp begin declare target
+
+_Static_assert(sizeof(clock_t) == sizeof(long), "ABI mismatch!");
+
+#include <llvm-libc-decls/time.h>
+
+#pragma omp end declare target
+
+#endif
+
+#endif // __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__
--- a/lib/include/lsxintrin.h
+++ b/lib/include/lsxintrin.h
--- a/lib/include/mmintrin.h
+++ b/lib/include/mmintrin.h
@ -22,7 +22,9 @@ typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
+                 __min_vector_width__(64)))

 /// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
@ -31,10 +33,10 @@ typedef char __v8qi __attribute__((__vector_size__(8)));
 ///
 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
 ///
-static __inline__ void  __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
-_mm_empty(void)
-{
-    __builtin_ia32_emms();
+static __inline__ void __attribute__((__always_inline__, __nodebug__,
+                                      __target__("mmx,no-evex512")))
+_mm_empty(void) {
+  __builtin_ia32_emms();
 }

 /// Constructs a 64-bit integer vector, setting the lower 32 bits to the
--- a/lib/include/module.modulemap
+++ b/lib/include/module.modulemap
@ -153,10 +153,163 @@ module _Builtin_intrinsics [system] [extern_c] {
  }
 }

-module _Builtin_stddef_max_align_t [system] [extern_c] {
-  header "__stddef_max_align_t.h"
+// Start -fbuiltin-headers-in-system-modules affected modules
+
+// The following modules all ignore their headers when
+// -fbuiltin-headers-in-system-modules is passed, and many of
+// those headers join system modules when present.
+
+// e.g. if -fbuiltin-headers-in-system-modules is passed, then
+// float.h will not be in the _Builtin_float module (that module
+// will be empty). If there is a system module that declares
+// `header "float.h"`, then the builtin float.h will join
+// that module. The system float.h (if present) will be treated
+// as a textual header in the sytem module.
+module _Builtin_float [system] {
+  header "float.h"
+  export *
 }

+module _Builtin_inttypes [system] {
+  header "inttypes.h"
+  export *
+}
+
+module _Builtin_iso646 [system] {
+  header "iso646.h"
+  export *
+}
+
+module _Builtin_limits [system] {
+  header "limits.h"
+  export *
+}
+
+module _Builtin_stdalign [system] {
+  header "stdalign.h"
+  export *
+}
+
+module _Builtin_stdarg [system] {
+  textual header "stdarg.h"
+
+  explicit module __gnuc_va_list {
+    header "__stdarg___gnuc_va_list.h"
+    export *
+  }
+
+  explicit module __va_copy {
+    header "__stdarg___va_copy.h"
+    export *
+  }
+
+  explicit module va_arg {
+    header "__stdarg_va_arg.h"
+    export *
+  }
+
+  explicit module va_copy {
+    header "__stdarg_va_copy.h"
+    export *
+  }
+
+  explicit module va_list {
+    header "__stdarg_va_list.h"
+    export *
+  }
+}
+
+module _Builtin_stdatomic [system] {
+  header "stdatomic.h"
+  export *
+}
+
+module _Builtin_stdbool [system] {
+  header "stdbool.h"
+  export *
+}
+
+module _Builtin_stddef [system] {
+  textual header "stddef.h"
+
+  // __stddef_max_align_t.h is always in this module, even if
+  // -fbuiltin-headers-in-system-modules is passed.
+  explicit module max_align_t {
+    header "__stddef_max_align_t.h"
+    export *
+  }
+
+  explicit module null {
+    header "__stddef_null.h"
+    export *
+  }
+
+  explicit module nullptr_t {
+    header "__stddef_nullptr_t.h"
+    export *
+  }
+
+  explicit module offsetof {
+    header "__stddef_offsetof.h"
+    export *
+  }
+
+  explicit module ptrdiff_t {
+    header "__stddef_ptrdiff_t.h"
+    export *
+  }
+
+  explicit module rsize_t {
+    header "__stddef_rsize_t.h"
+    export *
+  }
+
+  explicit module size_t {
+    header "__stddef_size_t.h"
+    export *
+  }
+
+  explicit module unreachable {
+    header "__stddef_unreachable.h"
+    export *
+  }
+
+  explicit module wchar_t {
+    header "__stddef_wchar_t.h"
+    export *
+  }
+}
+
+// wint_t is provided by <wchar.h> and not <stddef.h>. It's here
+// for compatibility, but must be explicitly requested. Therefore
+// __stddef_wint_t.h is not part of _Builtin_stddef. It is always in
+// this module even if -fbuiltin-headers-in-system-modules is passed.
+module _Builtin_stddef_wint_t [system] {
+  header "__stddef_wint_t.h"
+  export *
+}
+
+module _Builtin_stdint [system] {
+  header "stdint.h"
+  export *
+}
+
+module _Builtin_stdnoreturn [system] {
+  header "stdnoreturn.h"
+  export *
+}
+
+module _Builtin_tgmath [system] {
+  header "tgmath.h"
+  export *
+}
+
+module _Builtin_unwind [system] {
+  header "unwind.h"
+  export *
+}
+// End -fbuiltin-headers-in-system-modules affected modules
+
 module opencl_c {
  requires opencl
  header "opencl-c.h"
--- a/lib/include/opencl-c-base.h
+++ b/lib/include/opencl-c-base.h
@ -45,6 +45,7 @@
 #define __opencl_c_ext_fp32_local_atomic_add 1
 #define __opencl_c_ext_fp32_global_atomic_min_max 1
 #define __opencl_c_ext_fp32_local_atomic_min_max 1
+#define __opencl_c_ext_image_raw10_raw12 1

 #endif // defined(__SPIR__) || defined(__SPIRV__)
 #endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
@ -477,6 +478,10 @@ typedef enum memory_order
 #if __OPENCL_C_VERSION__ >= CL_VERSION_3_0
 #define CLK_UNORM_INT_101010_2 0x10E0
 #endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
+#ifdef __opencl_c_ext_image_raw10_raw12
+#define CLK_UNSIGNED_INT_RAW10_EXT 0x10E3
+#define CLK_UNSIGNED_INT_RAW12_EXT 0x10E4
+#endif // __opencl_c_ext_image_raw10_raw12

 // Channel order, numbering must be aligned with cl_channel_order in cl.h
 //
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@ -1,4 +1,4 @@
-/*===-- __clang_openmp_device_functions.h - OpenMP math declares ------ c++ -===
+/*===-- __clang_openmp_device_functions.h - OpenMP math declares -*- c++ -*-===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
--- a/lib/include/pmmintrin.h
+++ b/lib/include/pmmintrin.h
@ -17,8 +17,9 @@
 #include <emmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse3,no-evex512"), __min_vector_width__(128)))

 /// Loads data from an unaligned memory location to elements in a 128-bit
 ///    vector.
--- a/Show More
+++ b/Show More