Merge branch 'llvm13'

2026-02-11 12:01:18 +00:00 · 2021-10-02 10:45:56 -07:00 · 2021-10-02 10:45:56 -07:00 · dde0adcb36
commit dde0adcb36
parent c4cd592f0e 7a2624c3e4
749 changed files with 200103 additions and 33693 deletions
--- a/.builds/netbsd.yml
+++ b/.builds/netbsd.yml
@ -1,8 +0,0 @@
-image: netbsd/latest
-secrets:
-  - 51bfddf5-86a6-4e01-8576-358c72a4a0a4
-  - 512ed797-0927-475a-83fd-bc997792860c
-sources:
-  - https://github.com/ziglang/zig
-tasks:
-  - build: cd zig && ./ci/srht/netbsd_script
--- a/build.zig
+++ b/build.zig
@ -65,6 +65,26 @@ pub fn build(b: *Builder) !void {
    const omit_stage2 = b.option(bool, "omit-stage2", "Do not include stage2 behind a feature flag inside stage1") orelse false;
    const static_llvm = b.option(bool, "static-llvm", "Disable integration with system-installed LLVM, Clang, LLD, and libc++") orelse false;
    const enable_llvm = b.option(bool, "enable-llvm", "Build self-hosted compiler with LLVM backend enabled") orelse (is_stage1 or static_llvm);
+    const llvm_has_m68k = b.option(
+        bool,
+        "llvm-has-m68k",
+        "Whether LLVM has the experimental target m68k enabled",
+    ) orelse false;
+    const llvm_has_csky = b.option(
+        bool,
+        "llvm-has-csky",
+        "Whether LLVM has the experimental target csky enabled",
+    ) orelse false;
+    const llvm_has_ve = b.option(
+        bool,
+        "llvm-has-ve",
+        "Whether LLVM has the experimental target ve enabled",
+    ) orelse false;
+    const llvm_has_arc = b.option(
+        bool,
+        "llvm-has-arc",
+        "Whether LLVM has the experimental target arc enabled",
+    ) orelse false;
    const enable_macos_sdk = b.option(bool, "enable-macos-sdk", "Run tests requiring presence of macOS SDK and frameworks") orelse false;
    const config_h_path_option = b.option([]const u8, "config_h", "Path to the generated config.h");

@ -124,6 +144,10 @@ pub fn build(b: *Builder) !void {
    exe_options.addOption(u32, "mem_leak_frames", mem_leak_frames);
    exe_options.addOption(bool, "skip_non_native", skip_non_native);
    exe_options.addOption(bool, "have_llvm", enable_llvm);
+    exe_options.addOption(bool, "llvm_has_m68k", llvm_has_m68k);
+    exe_options.addOption(bool, "llvm_has_csky", llvm_has_csky);
+    exe_options.addOption(bool, "llvm_has_ve", llvm_has_ve);
+    exe_options.addOption(bool, "llvm_has_arc", llvm_has_arc);

    if (enable_llvm) {
        const cmake_cfg = if (static_llvm) null else findAndParseConfigH(b, config_h_path_option);
@ -282,6 +306,10 @@ pub fn build(b: *Builder) !void {
    test_stage2_options.addOption(bool, "is_stage1", is_stage1);
    test_stage2_options.addOption(bool, "omit_stage2", omit_stage2);
    test_stage2_options.addOption(bool, "have_llvm", enable_llvm);
+    test_stage2_options.addOption(bool, "llvm_has_m68k", llvm_has_m68k);
+    test_stage2_options.addOption(bool, "llvm_has_csky", llvm_has_csky);
+    test_stage2_options.addOption(bool, "llvm_has_ve", llvm_has_ve);
+    test_stage2_options.addOption(bool, "llvm_has_arc", llvm_has_arc);
    test_stage2_options.addOption(bool, "enable_qemu", is_qemu_enabled);
    test_stage2_options.addOption(bool, "enable_wine", is_wine_enabled);
    test_stage2_options.addOption(bool, "enable_wasmtime", is_wasmtime_enabled);
--- a/ci/azure/linux_script
+++ b/ci/azure/linux_script
@ -9,7 +9,7 @@ sudo apt-get install -y cmake s3cmd tidy
 ZIGDIR="$(pwd)"
 ARCH="$(uname -m)"
 TARGET="$ARCH-linux-musl"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.8.1-dev.94+535615117"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.9.0-dev.1243+456d7e5f5"
 PREFIX="$HOME/$CACHE_BASENAME"
 MCPU="baseline"
 JOBS="-j$(nproc)"
--- a/ci/azure/macos_arm64_script
+++ b/ci/azure/macos_arm64_script
@ -10,13 +10,13 @@ ZIGDIR="$(pwd)"
 HOST_ARCH="x86_64"
 HOST_TARGET="$HOST_ARCH-macos-gnu"
 HOST_MCPU="baseline"
-HOST_CACHE_BASENAME="zig+llvm+lld+clang-$HOST_TARGET-0.8.0-dev.2703+c12704a33"
+HOST_CACHE_BASENAME="zig+llvm+lld+clang-$HOST_TARGET-0.9.0-dev.1249+210ef5af8"
 HOST_PREFIX="$HOME/$HOST_CACHE_BASENAME"

 ARCH="aarch64"
 TARGET="$ARCH-macos-gnu"
 MCPU="apple_a14"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.8.0-dev.2703+c12704a33"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.9.0-dev.1249+210ef5af8"
 PREFIX="$HOME/$CACHE_BASENAME"

 JOBS="-j2"
@ -50,7 +50,7 @@ cd build.host
 cmake .. \
  -DCMAKE_INSTALL_PREFIX="$(pwd)/release" \
  -DCMAKE_PREFIX_PATH="$HOST_PREFIX" \
-  -DCMAKE_BUILD_TYPE=Debug \
+  -DCMAKE_BUILD_TYPE=Release \
  -DZIG_TARGET_TRIPLE="$HOST_TARGET" \
  -DZIG_TARGET_MCPU="$HOST_MCPU" \
  -DZIG_STATIC=ON
--- a/ci/azure/macos_script
+++ b/ci/azure/macos_script
@ -9,7 +9,7 @@ ZIGDIR="$(pwd)"
 ARCH="x86_64"
 TARGET="$ARCH-macos-gnu"
 MCPU="baseline"
-CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.8.1-dev.94+535615117"
+CACHE_BASENAME="zig+llvm+lld+clang-$TARGET-0.9.0-dev.1249+210ef5af8"
 PREFIX="$HOME/$CACHE_BASENAME"
 JOBS="-j2"

--- a/ci/azure/windows_msvc_install
+++ b/ci/azure/windows_msvc_install
@ -6,6 +6,11 @@ set -e
 pacman -Suy --needed --noconfirm
 pacman -S --needed --noconfirm wget p7zip python3-pip tar xz

+TARBALL="llvm+clang+lld-13.0.0-x86_64-windows-msvc-release-mt.tar.xz"
+
 pip install s3cmd
-wget -nv "https://ziglang.org/deps/llvm%2bclang%2blld-12.0.1-rc1-x86_64-windows-msvc-release-mt.tar.xz"
-tar xf llvm+clang+lld-12.0.1-rc1-x86_64-windows-msvc-release-mt.tar.xz
+wget -nv "https://ziglang.org/deps/$TARBALL"
+# If the first extraction fails, re-try it once; this can happen if the tarball
+# contains symlinks that are in the table of contents before the files that
+# they point to.
+tar -xf $TARBALL || tar --overwrite -xf $TARBALL
--- a/ci/azure/windows_msvc_script.bat
+++ b/ci/azure/windows_msvc_script.bat
@ -11,7 +11,7 @@ SET "MSYSTEM=%PREVMSYSTEM%"

 SET "ZIGBUILDDIR=%SRCROOT%\build"
 SET "ZIGINSTALLDIR=%ZIGBUILDDIR%\dist"
-SET "ZIGPREFIXPATH=%SRCROOT%\llvm+clang+lld-12.0.1-rc1-x86_64-windows-msvc-release-mt"
+SET "ZIGPREFIXPATH=%SRCROOT%\llvm+clang+lld-13.0.0-x86_64-windows-msvc-release-mt"

 call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64

--- a/ci/drone/drone.yml
+++ b/ci/drone/drone.yml
@ -7,28 +7,28 @@ platform:

 steps:
 - name: build
-  image: ziglang/static-base:llvm12-aarch64-5
+  image: ziglang/static-base:llvm13-aarch64-1
  commands:
  - ./ci/drone/linux_script_build

 - name: test-1
  depends_on:
  - build
-  image: ziglang/static-base:llvm12-aarch64-5
+  image: ziglang/static-base:llvm13-aarch64-1
  commands:
  - ./ci/drone/linux_script_test 1

 - name: test-2
  depends_on:
  - build
-  image: ziglang/static-base:llvm12-aarch64-5
+  image: ziglang/static-base:llvm13-aarch64-1
  commands:
  - ./ci/drone/linux_script_test 2

 - name: test-3
  depends_on:
  - build
-  image: ziglang/static-base:llvm12-aarch64-5
+  image: ziglang/static-base:llvm13-aarch64-1
  commands:
  - ./ci/drone/linux_script_test 3

@ -38,7 +38,7 @@ steps:
  - test-1
  - test-2
  - test-3
-  image: ziglang/static-base:llvm12-aarch64-5
+  image: ziglang/static-base:llvm13-aarch64-1
  environment:
    SRHT_OAUTH_TOKEN:
      from_secret: SRHT_OAUTH_TOKEN
--- a/ci/srht/freebsd_script
+++ b/ci/srht/freebsd_script
@ -7,7 +7,7 @@ sudo pkg update -fq
 sudo pkg install -y cmake py38-s3cmd wget curl jq samurai

 ZIGDIR="$(pwd)"
-CACHE_BASENAME="zig+llvm+lld+clang-x86_64-freebsd-gnu-0.8.0-dev.2703+c12704a33"
+CACHE_BASENAME="zig+llvm+lld+clang-x86_64-freebsd-gnu-0.9.0-dev.1243+456d7e5f5"
 PREFIX="$HOME/$CACHE_BASENAME"

 cd $HOME
--- a/ci/srht/index.json
+++ b/ci/srht/index.json
@ -14,11 +14,6 @@
      "shasum": "{{X86_64_FREEBSD_SHASUM}}",
      "size": "{{X86_64_FREEBSD_BYTESIZE}}"
    },
-    "x86_64-netbsd": {
-      "tarball": "https://ziglang.org/builds/{{X86_64_NETBSD_TARBALL}}",
-      "shasum": "{{X86_64_NETBSD_SHASUM}}",
-      "size": "{{X86_64_NETBSD_BYTESIZE}}"
-    },
    "x86_64-macos": {
      "tarball": "https://ziglang.org/builds/{{X86_64_MACOS_TARBALL}}",
      "shasum": "{{X86_64_MACOS_SHASUM}}",
--- a/ci/srht/update_download_page
+++ b/ci/srht/update_download_page
@ -15,7 +15,7 @@ X86_64_WINDOWS_JSON_URL="https://ziglang.org/builds/x86_64-windows-$VERSION.json
 AARCH64_MACOS_JSON_URL="https://ziglang.org/builds/aarch64-macos-$VERSION.json"
 X86_64_MACOS_JSON_URL="https://ziglang.org/builds/x86_64-macos-$VERSION.json"
 X86_64_FREEBSD_JSON_URL="https://ziglang.org/builds/x86_64-freebsd-$VERSION.json"
-X86_64_NETBSD_JSON_URL="https://ziglang.org/builds/x86_64-netbsd-$VERSION.json"
+#X86_64_NETBSD_JSON_URL="https://ziglang.org/builds/x86_64-netbsd-$VERSION.json"

 # If any of these fail, it's not really this job failing; rather we have detected
 # that this job will be called again later when other jobs have completed.
@ -25,7 +25,7 @@ curl --fail -I "$X86_64_WINDOWS_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$AARCH64_MACOS_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$X86_64_MACOS_JSON_URL" >/dev/null || exit 0
 curl --fail -I "$X86_64_FREEBSD_JSON_URL" >/dev/null || exit 0
-curl --fail -I "$X86_64_NETBSD_JSON_URL" >/dev/null || exit 0
+#curl --fail -I "$X86_64_NETBSD_JSON_URL" >/dev/null || exit 0

 # Without --user, this gave me:
 # ERROR: Could not install packages due to an EnvironmentError: [Errno 13] Permission denied
@ -86,10 +86,10 @@ export X86_64_FREEBSD_TARBALL="$(echo "$X86_64_FREEBSD_JSON" | jq .tarball -r)"
 export X86_64_FREEBSD_BYTESIZE="$(echo "$X86_64_FREEBSD_JSON" | jq .size -r)"
 export X86_64_FREEBSD_SHASUM="$(echo "$X86_64_FREEBSD_JSON" | jq .shasum -r)"

-X86_64_NETBSD_JSON=$(curl --fail "$X86_64_NETBSD_JSON_URL" || exit 1)
-export X86_64_NETBSD_TARBALL="$(echo "$X86_64_NETBSD_JSON" | jq .tarball -r)"
-export X86_64_NETBSD_BYTESIZE="$(echo "$X86_64_NETBSD_JSON" | jq .size -r)"
-export X86_64_NETBSD_SHASUM="$(echo "$X86_64_NETBSD_JSON" | jq .shasum -r)"
+#X86_64_NETBSD_JSON=$(curl --fail "$X86_64_NETBSD_JSON_URL" || exit 1)
+#export X86_64_NETBSD_TARBALL="$(echo "$X86_64_NETBSD_JSON" | jq .tarball -r)"
+#export X86_64_NETBSD_BYTESIZE="$(echo "$X86_64_NETBSD_JSON" | jq .size -r)"
+#export X86_64_NETBSD_SHASUM="$(echo "$X86_64_NETBSD_JSON" | jq .shasum -r)"

 export MASTER_DATE="$(date +%Y-%m-%d)"
 export MASTER_VERSION="$VERSION"
--- a/cmake/Findclang.cmake
+++ b/cmake/Findclang.cmake
@ -9,31 +9,31 @@

 find_path(CLANG_INCLUDE_DIRS NAMES clang/Frontend/ASTUnit.h
  PATHS
-    /usr/lib/llvm/12/include
-    /usr/lib/llvm-12/include
-    /usr/lib/llvm-12.0/include
-    /usr/local/llvm120/include
-    /usr/local/llvm12/include
-    /usr/local/opt/llvm@12/include
-    /opt/homebrew/opt/llvm@12/include
+    /usr/lib/llvm/13/include
+    /usr/lib/llvm-13/include
+    /usr/lib/llvm-13.0/include
+    /usr/local/llvm130/include
+    /usr/local/llvm13/include
+    /usr/local/opt/llvm@13/include
+    /opt/homebrew/opt/llvm@13/include
    /mingw64/include
 )

 if(ZIG_PREFER_CLANG_CPP_DYLIB)
  find_library(CLANG_LIBRARIES
    NAMES
-      clang-cpp-12.0
-      clang-cpp120
+      clang-cpp-13.0
+      clang-cpp130
      clang-cpp
    PATHS
      ${CLANG_LIBDIRS}
-      /usr/lib/llvm/12/lib
-      /usr/lib/llvm/12/lib64
-      /usr/lib/llvm-12/lib
-      /usr/local/llvm120/lib
-      /usr/local/llvm12/lib
-      /usr/local/opt/llvm@12/lib
-      /opt/homebrew/opt/llvm@12/lib
+      /usr/lib/llvm/13/lib
+      /usr/lib/llvm/13/lib64
+      /usr/lib/llvm-13/lib
+      /usr/local/llvm130/lib
+      /usr/local/llvm13/lib
+      /usr/local/opt/llvm@13/lib
+      /opt/homebrew/opt/llvm@13/lib
  )
 endif()

@ -43,13 +43,13 @@ if(NOT CLANG_LIBRARIES)
    find_library(CLANG_${_prettylibname_}_LIB NAMES ${_libname_}
      PATHS
        ${CLANG_LIBDIRS}
-        /usr/lib/llvm/12/lib
-        /usr/lib/llvm-12/lib
-        /usr/lib/llvm-12.0/lib
-        /usr/local/llvm120/lib
-        /usr/local/llvm12/lib
-        /usr/local/opt/llvm@12/lib
-        /opt/homebrew/opt/llvm@12/lib
+        /usr/lib/llvm/13/lib
+        /usr/lib/llvm-13/lib
+        /usr/lib/llvm-13.0/lib
+        /usr/local/llvm130/lib
+        /usr/local/llvm13/lib
+        /usr/local/opt/llvm@13/lib
+        /opt/homebrew/opt/llvm@13/lib
        /mingw64/lib
        /c/msys64/mingw64/lib
        c:\\msys64\\mingw64\\lib
--- a/cmake/Findlld.cmake
+++ b/cmake/Findlld.cmake
@ -8,20 +8,20 @@

 find_path(LLD_INCLUDE_DIRS NAMES lld/Common/Driver.h
    PATHS
-        /usr/lib/llvm-12/include
-        /usr/local/llvm120/include
-        /usr/local/llvm12/include
-        /usr/local/opt/llvm@12/include
-        /opt/homebrew/opt/llvm@12/include
+        /usr/lib/llvm-13/include
+        /usr/local/llvm130/include
+        /usr/local/llvm13/include
+        /usr/local/opt/llvm@13/include
+        /opt/homebrew/opt/llvm@13/include
        /mingw64/include)

-find_library(LLD_LIBRARY NAMES lld-12.0 lld120 lld
+find_library(LLD_LIBRARY NAMES lld-13.0 lld130 lld
    PATHS
-        /usr/lib/llvm-12/lib
-        /usr/local/llvm120/lib
-        /usr/local/llvm12/lib
-        /usr/local/opt/llvm@12/lib
-        /opt/homebrew/opt/llvm@12/lib
+        /usr/lib/llvm-13/lib
+        /usr/local/llvm130/lib
+        /usr/local/llvm13/lib
+        /usr/local/opt/llvm@13/lib
+        /opt/homebrew/opt/llvm@13/lib
 )
 if(EXISTS ${LLD_LIBRARY})
    set(LLD_LIBRARIES ${LLD_LIBRARY})
@ -31,11 +31,11 @@ else()
        find_library(LLD_${_prettylibname_}_LIB NAMES ${_libname_}
            PATHS
                ${LLD_LIBDIRS}
-                /usr/lib/llvm-12/lib
-                /usr/local/llvm120/lib
-                /usr/local/llvm12/lib
-                /usr/local/opt/llvm@12/lib
-                /opt/homebrew/opt/llvm@12/lib
+                /usr/lib/llvm-13/lib
+                /usr/local/llvm130/lib
+                /usr/local/llvm13/lib
+                /usr/local/opt/llvm@13/lib
+                /opt/homebrew/opt/llvm@13/lib
                /mingw64/lib
                /c/msys64/mingw64/lib
                c:/msys64/mingw64/lib)
--- a/cmake/Findllvm.cmake
+++ b/cmake/Findllvm.cmake
@ -9,41 +9,41 @@

 find_path(LLVM_INCLUDE_DIRS NAMES llvm/IR/IRBuilder.h
  PATHS
-    /usr/lib/llvm/12/include
-    /usr/lib/llvm-12/include
-    /usr/lib/llvm-12.0/include
-    /usr/local/llvm12/include
-    /usr/local/llvm120/include
-    /usr/local/opt/llvm@12/include
-    /opt/homebrew/opt/llvm@12/include
+    /usr/lib/llvm/13/include
+    /usr/lib/llvm-13/include
+    /usr/lib/llvm-13.0/include
+    /usr/local/llvm13/include
+    /usr/local/llvm130/include
+    /usr/local/opt/llvm@13/include
+    /opt/homebrew/opt/llvm@13/include
    /mingw64/include
 )

 if(ZIG_PREFER_CLANG_CPP_DYLIB)
  find_library(LLVM_LIBRARIES
    NAMES
-      LLVM-12.0
-      LLVM-12
-      LLVM-120
+      LLVM-13.0
+      LLVM-13
+      LLVM-130
      LLVM
    PATHS
      ${LLVM_LIBDIRS}
-      /usr/lib/llvm/12/lib
-      /usr/lib/llvm/12/lib64
-      /usr/lib/llvm-12/lib
-      /usr/local/llvm12/lib
-      /usr/local/llvm120/lib
-      /usr/local/opt/llvm@12/lib
-      /opt/homebrew/opt/llvm@12/lib
+      /usr/lib/llvm/13/lib
+      /usr/lib/llvm/13/lib64
+      /usr/lib/llvm-13/lib
+      /usr/local/llvm13/lib
+      /usr/local/llvm130/lib
+      /usr/local/opt/llvm@13/lib
+      /opt/homebrew/opt/llvm@13/lib
  )

  find_program(LLVM_CONFIG_EXE
-      NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config
+      NAMES llvm-config-13 llvm-config-13.0 llvm-config130 llvm-config13 llvm-config
      PATHS
          "/mingw64/bin"
          "/c/msys64/mingw64/bin"
          "c:/msys64/mingw64/bin"
-          "C:/Libraries/llvm-12.0.0/bin")
+          "C:/Libraries/llvm-13.0.0/bin")

  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
    message(FATAL_ERROR "unable to find llvm-config")
@ -58,23 +58,23 @@ if(ZIG_PREFER_CLANG_CPP_DYLIB)
    OUTPUT_VARIABLE LLVM_CONFIG_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)

-  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12)
-    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 13)
+    message(FATAL_ERROR "expected LLVM 13.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13)
-    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 14)
+    message(FATAL_ERROR "expected LLVM 13.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13)
-    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 14)
+    message(FATAL_ERROR "expected LLVM 13.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
 elseif(ZIG_USE_LLVM_CONFIG)
  find_program(LLVM_CONFIG_EXE
-      NAMES llvm-config-12 llvm-config-12.0 llvm-config120 llvm-config12 llvm-config
+      NAMES llvm-config-13 llvm-config-13.0 llvm-config130 llvm-config13 llvm-config
      PATHS
          "/mingw64/bin"
          "/c/msys64/mingw64/bin"
          "c:/msys64/mingw64/bin"
-          "C:/Libraries/llvm-12.0.0/bin")
+          "C:/Libraries/llvm-13.0.0/bin")

  if ("${LLVM_CONFIG_EXE}" STREQUAL "LLVM_CONFIG_EXE-NOTFOUND")
    message(FATAL_ERROR "unable to find llvm-config")
@ -89,14 +89,14 @@ elseif(ZIG_USE_LLVM_CONFIG)
    OUTPUT_VARIABLE LLVM_CONFIG_VERSION
    OUTPUT_STRIP_TRAILING_WHITESPACE)

-  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 12)
-    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_LESS 13)
+    message(FATAL_ERROR "expected LLVM 13.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 13)
-    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_EQUAL 14)
+    message(FATAL_ERROR "expected LLVM 13.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()
-  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 13)
-    message(FATAL_ERROR "expected LLVM 12.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
+  if("${LLVM_CONFIG_VERSION}" VERSION_GREATER 14)
+    message(FATAL_ERROR "expected LLVM 13.x but found ${LLVM_CONFIG_VERSION} using ${LLVM_CONFIG_EXE}")
  endif()

  execute_process(
@ -170,7 +170,7 @@ elseif(ZIG_USE_LLVM_CONFIG)
  set(LLVM_LIBRARIES ${LLVM_LIBRARIES} ${LLVM_SYSTEM_LIBS})

  if(NOT LLVM_LIBRARIES)
-    find_library(LLVM_LIBRARIES NAMES LLVM LLVM-12 LLVM-12.0)
+    find_library(LLVM_LIBRARIES NAMES LLVM LLVM-13 LLVM-13.0)
  endif()

  link_directories("${CMAKE_PREFIX_PATH}/lib")
@ -184,13 +184,13 @@ else()
    find_library(LLVM_${_prettylibname_}_LIB NAMES ${_libname_}
      PATHS
      ${LLVM_LIBDIRS}
-      /usr/lib/llvm/12/lib
-      /usr/lib/llvm-12/lib
-      /usr/lib/llvm-12.0/lib
-      /usr/local/llvm120/lib
-      /usr/local/llvm12/lib
-      /usr/local/opt/llvm@12/lib
-      /opt/homebrew/opt/llvm@12/lib
+      /usr/lib/llvm/13/lib
+      /usr/lib/llvm-13/lib
+      /usr/lib/llvm-13.0/lib
+      /usr/local/llvm130/lib
+      /usr/local/llvm13/lib
+      /usr/local/opt/llvm@13/lib
+      /opt/homebrew/opt/llvm@13/lib
      /mingw64/lib
      /c/msys64/mingw64/lib
      c:\\msys64\\mingw64\\lib)
@ -219,6 +219,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyAsmParser)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyCodeGen)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyDesc)
+  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyUtils)
  FIND_AND_ADD_LLVM_LIB(LLVMWebAssemblyInfo)
  FIND_AND_ADD_LLVM_LIB(LLVMSystemZDisassembler)
  FIND_AND_ADD_LLVM_LIB(LLVMSystemZAsmParser)
@ -294,11 +295,12 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMOrcJIT)
  FIND_AND_ADD_LLVM_LIB(LLVMMCJIT)
  FIND_AND_ADD_LLVM_LIB(LLVMJITLink)
-  FIND_AND_ADD_LLVM_LIB(LLVMOrcTargetProcess)
-  FIND_AND_ADD_LLVM_LIB(LLVMOrcShared)
  FIND_AND_ADD_LLVM_LIB(LLVMInterpreter)
  FIND_AND_ADD_LLVM_LIB(LLVMExecutionEngine)
  FIND_AND_ADD_LLVM_LIB(LLVMRuntimeDyld)
+  FIND_AND_ADD_LLVM_LIB(LLVMOrcTargetProcess)
+  FIND_AND_ADD_LLVM_LIB(LLVMOrcShared)
+  FIND_AND_ADD_LLVM_LIB(LLVMDWP)
  FIND_AND_ADD_LLVM_LIB(LLVMSymbolize)
  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoPDB)
  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoGSYM)
@ -311,7 +313,6 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMCFGuard)
  FIND_AND_ADD_LLVM_LIB(LLVMCoroutines)
  FIND_AND_ADD_LLVM_LIB(LLVMObjCARCOpts)
-  FIND_AND_ADD_LLVM_LIB(LLVMHelloNew)
  FIND_AND_ADD_LLVM_LIB(LLVMipo)
  FIND_AND_ADD_LLVM_LIB(LLVMVectorize)
  FIND_AND_ADD_LLVM_LIB(LLVMLinker)
@ -323,6 +324,7 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMGlobalISel)
  FIND_AND_ADD_LLVM_LIB(LLVMMIRParser)
  FIND_AND_ADD_LLVM_LIB(LLVMAsmPrinter)
+  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoMSF)
  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoDWARF)
  FIND_AND_ADD_LLVM_LIB(LLVMSelectionDAG)
  FIND_AND_ADD_LLVM_LIB(LLVMCodeGen)
@ -344,7 +346,6 @@ else()
  FIND_AND_ADD_LLVM_LIB(LLVMMCParser)
  FIND_AND_ADD_LLVM_LIB(LLVMMC)
  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoCodeView)
-  FIND_AND_ADD_LLVM_LIB(LLVMDebugInfoMSF)
  FIND_AND_ADD_LLVM_LIB(LLVMBitReader)
  FIND_AND_ADD_LLVM_LIB(LLVMCore)
  FIND_AND_ADD_LLVM_LIB(LLVMRemarks)
--- a/lib/include/__clang_cuda_device_functions.h
+++ b/lib/include/__clang_cuda_device_functions.h
@ -34,10 +34,12 @@ __DEVICE__ unsigned long long __brevll(unsigned long long __a) {
  return __nv_brevll(__a);
 }
 #if defined(__cplusplus)
-__DEVICE__ void __brkpt() { asm volatile("brkpt;"); }
+__DEVICE__ void __brkpt() { __asm__ __volatile__("brkpt;"); }
 __DEVICE__ void __brkpt(int __a) { __brkpt(); }
 #else
-__DEVICE__ void __attribute__((overloadable)) __brkpt(void) { asm volatile("brkpt;"); }
+__DEVICE__ void __attribute__((overloadable)) __brkpt(void) {
+  __asm__ __volatile__("brkpt;");
+}
 __DEVICE__ void __attribute__((overloadable)) __brkpt(int __a) { __brkpt(); }
 #endif
 __DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b,
@ -507,7 +509,7 @@ __DEVICE__ float __powf(float __a, float __b) {
 }

 // Parameter must have a known integer value.
-#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a))
+#define __prof_trigger(__a) __asm__ __volatile__("pmevent \t%0;" ::"i"(__a))
 __DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); }
 __DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) {
  return __nv_sad(__a, __b, __c);
@ -526,7 +528,7 @@ __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); }
 __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); }
 __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); };
 __DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); };
-__DEVICE__ void __trap(void) { asm volatile("trap;"); }
+__DEVICE__ void __trap(void) { __asm__ __volatile__("trap;"); }
 __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) {
  return __nvvm_atom_add_gen_i((int *)__p, __v);
 }
@ -1051,122 +1053,136 @@ __DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) {
 }
 __DEVICE__ unsigned int __vabs2(unsigned int __a) {
  unsigned int r;
-  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vabs4(unsigned int __a) {
  unsigned int r;
-  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }

 __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vabsss2(unsigned int __a) {
  unsigned int r;
-  asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vabsss4(unsigned int __a) {
  unsigned int r;
-  asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(0), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(0), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd2.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vadd4.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vavrg4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.eq %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
@ -1174,7 +1190,9 @@ __DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.eq %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
@ -1182,7 +1200,9 @@ __DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
@ -1190,7 +1210,9 @@ __DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
@ -1198,7 +1220,9 @@ __DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
@ -1206,7 +1230,9 @@ __DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.ge %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
@ -1214,7 +1240,9 @@ __DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
@ -1222,7 +1250,9 @@ __DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
@ -1230,7 +1260,9 @@ __DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
@ -1238,7 +1270,9 @@ __DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.gt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
@ -1246,7 +1280,9 @@ __DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
@ -1254,7 +1290,9 @@ __DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
@ -1262,7 +1300,9 @@ __DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
@ -1270,7 +1310,9 @@ __DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.le %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
@ -1278,7 +1320,9 @@ __DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.s32.s32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
@ -1286,7 +1330,9 @@ __DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.s32.s32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
@ -1294,7 +1340,9 @@ __DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
@ -1302,7 +1350,9 @@ __DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.lt %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
@ -1310,7 +1360,9 @@ __DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset2.u32.u32.ne %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
@ -1318,7 +1370,9 @@ __DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) {
 }
 __DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vset4.u32.u32.ne %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) {
@ -1345,94 +1399,112 @@ __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) {
    unsigned mask = __vcmpgts2(__a, __b);
    r = (__a & mask) | (__b & ~mask);
  } else {
-    asm("vmax2.s32.s32.s32 %0,%1,%2,%3;"
-        : "=r"(r)
-        : "r"(__a), "r"(__b), "r"(0));
+    __asm__("vmax2.s32.s32.s32 %0,%1,%2,%3;"
+            : "=r"(r)
+            : "r"(__a), "r"(__b), "r"(0));
  }
  return r;
 }
 __DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmax4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin2.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin4.s32.s32.s32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vmin4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }

 __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); }

 __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); }
 __DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
@ -1440,9 +1512,9 @@ __DEVICE__ unsigned int __vnegss2(unsigned int __a) {
 }
 __DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.s32.s32.s32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vnegss4(unsigned int __a) {
@ -1450,16 +1522,16 @@ __DEVICE__ unsigned int __vnegss4(unsigned int __a) {
 }
 __DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub2.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) {
  unsigned int r;
-  asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
-      : "=r"(r)
-      : "r"(__a), "r"(__b), "r"(0));
+  __asm__("vsub4.u32.u32.u32.sat %0,%1,%2,%3;"
+          : "=r"(r)
+          : "r"(__a), "r"(__b), "r"(0));
  return r;
 }
 #endif // CUDA_VERSION >= 9020
--- a/lib/include/__clang_cuda_math.h
+++ b/lib/include/__clang_cuda_math.h
@ -166,6 +166,8 @@ __DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
 __DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
 __DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
 __DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
+__DEVICE__ double round(double __a) { return __nv_round(__a); }
+__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
 __DEVICE__ double log(double __a) { return __nv_log(__a); }
 __DEVICE__ double log10(double __a) { return __nv_log10(__a); }
 __DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
@ -270,8 +272,6 @@ __DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
 __DEVICE__ float rnormf(int __dim, const float *__t) {
  return __nv_rnormf(__dim, __t);
 }
-__DEVICE__ double round(double __a) { return __nv_round(__a); }
-__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
 __DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
 __DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
 __DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
--- a/lib/include/__clang_cuda_runtime_wrapper.h
+++ b/lib/include/__clang_cuda_runtime_wrapper.h
@ -349,9 +349,14 @@ extern "C" {
 __device__ int vprintf(const char *, const char *);
 __device__ void free(void *) __attribute((nothrow));
 __device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
+
+// __assertfail() used to have a `noreturn` attribute. Unfortunately that
+// contributed to triggering a longstanding bug in ptxas when assert was used
+// in sufficiently convoluted code. See
+// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
 __device__ void __assertfail(const char *__message, const char *__file,
                             unsigned __line, const char *__function,
-                             size_t __charSize) __attribute__((noreturn));
+                             size_t __charSize);

 // In order for standard assert() macro on linux to work we need to
 // provide device-side __assert_fail()
--- a/lib/include/__clang_hip_cmath.h
+++ b/lib/include/__clang_hip_cmath.h
@ -10,10 +10,11 @@
 #ifndef __CLANG_HIP_CMATH_H__
 #define __CLANG_HIP_CMATH_H__

-#if !defined(__HIP__)
+#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
 #error "This file is for HIP and OpenMP AMDGCN device compilation only."
 #endif

+#if !defined(__HIPCC_RTC__)
 #if defined(__cplusplus)
 #include <limits>
 #include <type_traits>
@ -21,102 +22,162 @@
 #endif
 #include <limits.h>
 #include <stdint.h>
+#endif // !defined(__HIPCC_RTC__)

 #pragma push_macro("__DEVICE__")
+#pragma push_macro("__CONSTEXPR__")
+#ifdef __OPENMP_AMDGCN__
+#define __DEVICE__ static __attribute__((always_inline, nothrow))
+#define __CONSTEXPR__ constexpr
+#else
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#define __CONSTEXPR__
+#endif // __OPENMP_AMDGCN__

 // Start with functions that cannot be defined by DEF macros below.
 #if defined(__cplusplus)
-__DEVICE__ double abs(double __x) { return ::fabs(__x); }
-__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
-__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
-__DEVICE__ long abs(long __n) { return ::labs(__n); }
-__DEVICE__ float fma(float __x, float __y, float __z) {
+#if defined __OPENMP_AMDGCN__
+__DEVICE__ __CONSTEXPR__ float fabs(float __x) { return ::fabsf(__x); }
+__DEVICE__ __CONSTEXPR__ float sin(float __x) { return ::sinf(__x); }
+__DEVICE__ __CONSTEXPR__ float cos(float __x) { return ::cosf(__x); }
+#endif
+__DEVICE__ __CONSTEXPR__ double abs(double __x) { return ::fabs(__x); }
+__DEVICE__ __CONSTEXPR__ float abs(float __x) { return ::fabsf(__x); }
+__DEVICE__ __CONSTEXPR__ long long abs(long long __n) { return ::llabs(__n); }
+__DEVICE__ __CONSTEXPR__ long abs(long __n) { return ::labs(__n); }
+__DEVICE__ __CONSTEXPR__ float fma(float __x, float __y, float __z) {
  return ::fmaf(__x, __y, __z);
 }
-__DEVICE__ int fpclassify(float __x) {
+#if !defined(__HIPCC_RTC__)
+// The value returned by fpclassify is platform dependent, therefore it is not
+// supported by hipRTC.
+__DEVICE__ __CONSTEXPR__ int fpclassify(float __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
 }
-__DEVICE__ int fpclassify(double __x) {
+__DEVICE__ __CONSTEXPR__ int fpclassify(double __x) {
  return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
                              FP_ZERO, __x);
 }
-__DEVICE__ float frexp(float __arg, int *__exp) {
+#endif // !defined(__HIPCC_RTC__)
+
+__DEVICE__ __CONSTEXPR__ float frexp(float __arg, int *__exp) {
  return ::frexpf(__arg, __exp);
 }
-__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
-__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
-__DEVICE__ bool isgreater(float __x, float __y) {
+
+#if defined(__OPENMP_AMDGCN__)
+// For OpenMP we work around some old system headers that have non-conforming
+// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
+// this by providing two versions of these functions, differing only in the
+// return type. To avoid conflicting definitions we disable implicit base
+// function generation. That means we will end up with two specializations, one
+// per type, but only one has a base function defined by the system header.
+#pragma omp begin declare variant match(                                       \
+    implementation = {extension(disable_implicit_base)})
+
+// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
+//        add a suffix. This means we would clash with the names of the variants
+//        (note that we do not create implicit base functions here). To avoid
+//        this clash we add a new trait to some of them that is always true
+//        (this is LLVM after all ;)). It will only influence the mangled name
+//        of the variants inside the inner region and avoid the clash.
+#pragma omp begin declare variant match(implementation = {vendor(llvm)})
+
+__DEVICE__ __CONSTEXPR__ int isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ __CONSTEXPR__ int isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ __CONSTEXPR__ int isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ __CONSTEXPR__ int isfinite(double __x) { return ::__finite(__x); }
+__DEVICE__ __CONSTEXPR__ int isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ __CONSTEXPR__ int isnan(double __x) { return ::__isnan(__x); }
+
+#pragma omp end declare variant
+#endif // defined(__OPENMP_AMDGCN__)
+
+__DEVICE__ __CONSTEXPR__ bool isinf(float __x) { return ::__isinff(__x); }
+__DEVICE__ __CONSTEXPR__ bool isinf(double __x) { return ::__isinf(__x); }
+__DEVICE__ __CONSTEXPR__ bool isfinite(float __x) { return ::__finitef(__x); }
+__DEVICE__ __CONSTEXPR__ bool isfinite(double __x) { return ::__finite(__x); }
+__DEVICE__ __CONSTEXPR__ bool isnan(float __x) { return ::__isnanf(__x); }
+__DEVICE__ __CONSTEXPR__ bool isnan(double __x) { return ::__isnan(__x); }
+
+#if defined(__OPENMP_AMDGCN__)
+#pragma omp end declare variant
+#endif // defined(__OPENMP_AMDGCN__)
+
+__DEVICE__ __CONSTEXPR__ bool isgreater(float __x, float __y) {
  return __builtin_isgreater(__x, __y);
 }
-__DEVICE__ bool isgreater(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreater(double __x, double __y) {
  return __builtin_isgreater(__x, __y);
 }
-__DEVICE__ bool isgreaterequal(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreaterequal(float __x, float __y) {
  return __builtin_isgreaterequal(__x, __y);
 }
-__DEVICE__ bool isgreaterequal(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isgreaterequal(double __x, double __y) {
  return __builtin_isgreaterequal(__x, __y);
 }
-__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
-__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
-__DEVICE__ bool isless(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isless(float __x, float __y) {
  return __builtin_isless(__x, __y);
 }
-__DEVICE__ bool isless(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isless(double __x, double __y) {
  return __builtin_isless(__x, __y);
 }
-__DEVICE__ bool islessequal(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool islessequal(float __x, float __y) {
  return __builtin_islessequal(__x, __y);
 }
-__DEVICE__ bool islessequal(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool islessequal(double __x, double __y) {
  return __builtin_islessequal(__x, __y);
 }
-__DEVICE__ bool islessgreater(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool islessgreater(float __x, float __y) {
  return __builtin_islessgreater(__x, __y);
 }
-__DEVICE__ bool islessgreater(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool islessgreater(double __x, double __y) {
  return __builtin_islessgreater(__x, __y);
 }
-__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
-__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
-__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
-__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
-__DEVICE__ bool isunordered(float __x, float __y) {
+__DEVICE__ __CONSTEXPR__ bool isnormal(float __x) {
+  return __builtin_isnormal(__x);
+}
+__DEVICE__ __CONSTEXPR__ bool isnormal(double __x) {
+  return __builtin_isnormal(__x);
+}
+__DEVICE__ __CONSTEXPR__ bool isunordered(float __x, float __y) {
  return __builtin_isunordered(__x, __y);
 }
-__DEVICE__ bool isunordered(double __x, double __y) {
+__DEVICE__ __CONSTEXPR__ bool isunordered(double __x, double __y) {
  return __builtin_isunordered(__x, __y);
 }
-__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
-__DEVICE__ float pow(float __base, int __iexp) {
+__DEVICE__ __CONSTEXPR__ float modf(float __x, float *__iptr) {
+  return ::modff(__x, __iptr);
+}
+__DEVICE__ __CONSTEXPR__ float pow(float __base, int __iexp) {
  return ::powif(__base, __iexp);
 }
-__DEVICE__ double pow(double __base, int __iexp) {
+__DEVICE__ __CONSTEXPR__ double pow(double __base, int __iexp) {
  return ::powi(__base, __iexp);
 }
-__DEVICE__ float remquo(float __x, float __y, int *__quo) {
+__DEVICE__ __CONSTEXPR__ float remquo(float __x, float __y, int *__quo) {
  return ::remquof(__x, __y, __quo);
 }
-__DEVICE__ float scalbln(float __x, long int __n) {
+__DEVICE__ __CONSTEXPR__ float scalbln(float __x, long int __n) {
  return ::scalblnf(__x, __n);
 }
-__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
-__DEVICE__ bool signbit(double __x) { return ::__signbit(__x); }
+__DEVICE__ __CONSTEXPR__ bool signbit(float __x) { return ::__signbitf(__x); }
+__DEVICE__ __CONSTEXPR__ bool signbit(double __x) { return ::__signbit(__x); }

 // Notably missing above is nexttoward.  We omit it because
 // ocml doesn't provide an implementation, and we don't want to be in the
 // business of implementing tricky libm functions in this header.

 // Other functions.
-__DEVICE__ _Float16 fma(_Float16 __x, _Float16 __y, _Float16 __z) {
+__DEVICE__ __CONSTEXPR__ _Float16 fma(_Float16 __x, _Float16 __y,
+                                      _Float16 __z) {
  return __ocml_fma_f16(__x, __y, __z);
 }
-__DEVICE__ _Float16 pow(_Float16 __base, int __iexp) {
+__DEVICE__ __CONSTEXPR__ _Float16 pow(_Float16 __base, int __iexp) {
  return __ocml_pown_f16(__base, __iexp);
 }

+#ifndef __OPENMP_AMDGCN__
 // BEGIN DEF_FUN and HIP_OVERLOAD

 // BEGIN DEF_FUN
@ -127,18 +188,19 @@ __DEVICE__ _Float16 pow(_Float16 __base, int __iexp) {

 // Define cmath functions with float argument and returns __retty.
 #define __DEF_FUN1(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  __retty __func(float __x) { return __func##f(__x); }
+  __DEVICE__ __CONSTEXPR__ __retty __func(float __x) { return __func##f(__x); }

 // Define cmath functions with two float arguments and returns __retty.
 #define __DEF_FUN2(__retty, __func)                                            \
-  __DEVICE__                                                                   \
-  __retty __func(float __x, float __y) { return __func##f(__x, __y); }
+  __DEVICE__ __CONSTEXPR__ __retty __func(float __x, float __y) {              \
+    return __func##f(__x, __y);                                                \
+  }

 // Define cmath functions with a float and an int argument and returns __retty.
 #define __DEF_FUN2_FI(__retty, __func)                                         \
-  __DEVICE__                                                                   \
-  __retty __func(float __x, int __y) { return __func##f(__x, __y); }
+  __DEVICE__ __CONSTEXPR__ __retty __func(float __x, int __y) {                \
+    return __func##f(__x, __y);                                                \
+  }

 __DEF_FUN1(float, acos)
 __DEF_FUN1(float, acosh)
@ -207,11 +269,117 @@ template <bool __B, class __T = void> struct __hip_enable_if {};

 template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };

+namespace __hip {
+template <class _Tp> struct is_integral {
+  enum { value = 0 };
+};
+template <> struct is_integral<bool> {
+  enum { value = 1 };
+};
+template <> struct is_integral<char> {
+  enum { value = 1 };
+};
+template <> struct is_integral<signed char> {
+  enum { value = 1 };
+};
+template <> struct is_integral<unsigned char> {
+  enum { value = 1 };
+};
+template <> struct is_integral<wchar_t> {
+  enum { value = 1 };
+};
+template <> struct is_integral<short> {
+  enum { value = 1 };
+};
+template <> struct is_integral<unsigned short> {
+  enum { value = 1 };
+};
+template <> struct is_integral<int> {
+  enum { value = 1 };
+};
+template <> struct is_integral<unsigned int> {
+  enum { value = 1 };
+};
+template <> struct is_integral<long> {
+  enum { value = 1 };
+};
+template <> struct is_integral<unsigned long> {
+  enum { value = 1 };
+};
+template <> struct is_integral<long long> {
+  enum { value = 1 };
+};
+template <> struct is_integral<unsigned long long> {
+  enum { value = 1 };
+};
+
+// TODO: specialize is_arithmetic<_Float16>
+template <class _Tp> struct is_arithmetic {
+  enum { value = 0 };
+};
+template <> struct is_arithmetic<bool> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<char> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<signed char> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned char> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<wchar_t> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<short> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned short> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<int> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned int> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<long> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned long> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<long long> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<unsigned long long> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<float> {
+  enum { value = 1 };
+};
+template <> struct is_arithmetic<double> {
+  enum { value = 1 };
+};
+
+struct true_type {
+  static const __constant__ bool value = true;
+};
+struct false_type {
+  static const __constant__ bool value = false;
+};
+
+template <typename __T, typename __U> struct is_same : public false_type {};
+template <typename __T> struct is_same<__T, __T> : public true_type {};
+
+template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
+
+template <typename __T> typename add_rvalue_reference<__T>::type declval();
+
 // decltype is only available in C++11 and above.
 #if __cplusplus >= 201103L
 // __hip_promote
-namespace __hip {
-
 template <class _Tp> struct __numeric_type {
  static void __test(...);
  static _Float16 __test(_Float16);
@ -227,8 +395,8 @@ template <class _Tp> struct __numeric_type {
  // No support for long double, use double instead.
  static double __test(long double);

-  typedef decltype(__test(std::declval<_Tp>())) type;
-  static const bool value = !std::is_same<type, void>::value;
+  typedef decltype(__test(declval<_Tp>())) type;
+  static const bool value = !is_same<type, void>::value;
 };

 template <> struct __numeric_type<void> { static const bool value = true; };
@ -271,18 +439,17 @@ public:

 template <class _A1, class _A2 = void, class _A3 = void>
 class __promote : public __promote_imp<_A1, _A2, _A3> {};
-
-} // namespace __hip
 #endif //__cplusplus >= 201103L
+} // namespace __hip

 // __HIP_OVERLOAD1 is used to resolve function calls with integer argument to
 // avoid compilation error due to ambiguity. e.g. floor(5) is resolved with
 // floor(double).
 #define __HIP_OVERLOAD1(__retty, __fn)                                         \
  template <typename __T>                                                      \
-  __DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer,    \
-                                      __retty>::type                           \
-  __fn(__T __x) {                                                              \
+  __DEVICE__ __CONSTEXPR__                                                     \
+      typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type  \
+      __fn(__T __x) {                                                          \
    return ::__fn((double)__x);                                                \
  }

@ -292,9 +459,8 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
 #if __cplusplus >= 201103L
 #define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
-  __DEVICE__ typename __hip_enable_if<                                         \
-      std::numeric_limits<__T1>::is_specialized &&                             \
-          std::numeric_limits<__T2>::is_specialized,                           \
+  __DEVICE__ __CONSTEXPR__ typename __hip_enable_if<                           \
+      __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value,  \
      typename __hip::__promote<__T1, __T2>::type>::type                       \
  __fn(__T1 __x, __T2 __y) {                                                   \
    typedef typename __hip::__promote<__T1, __T2>::type __result_type;         \
@ -303,16 +469,15 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
 #else
 #define __HIP_OVERLOAD2(__retty, __fn)                                         \
  template <typename __T1, typename __T2>                                      \
-  __DEVICE__                                                                   \
-      typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&    \
-                                   std::numeric_limits<__T2>::is_specialized,  \
+  __DEVICE__ __CONSTEXPR__                                                     \
+      typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&            \
+                                   __hip::is_arithmetic<__T2>::value,          \
                               __retty>::type                                  \
      __fn(__T1 __x, __T2 __y) {                                               \
    return __fn((double)__x, (double)__y);                                     \
  }
 #endif

-__HIP_OVERLOAD1(double, abs)
 __HIP_OVERLOAD1(double, acos)
 __HIP_OVERLOAD1(double, acosh)
 __HIP_OVERLOAD1(double, asin)
@ -336,7 +501,9 @@ __HIP_OVERLOAD1(double, floor)
 __HIP_OVERLOAD2(double, fmax)
 __HIP_OVERLOAD2(double, fmin)
 __HIP_OVERLOAD2(double, fmod)
+#if !defined(__HIPCC_RTC__)
 __HIP_OVERLOAD1(int, fpclassify)
+#endif // !defined(__HIPCC_RTC__)
 __HIP_OVERLOAD2(double, hypot)
 __HIP_OVERLOAD1(int, ilogb)
 __HIP_OVERLOAD1(bool, isfinite)
@ -381,10 +548,9 @@ __HIP_OVERLOAD2(double, min)
 // Additional Overloads that don't quite match HIP_OVERLOAD.
 #if __cplusplus >= 201103L
 template <typename __T1, typename __T2, typename __T3>
-__DEVICE__ typename __hip_enable_if<
-    std::numeric_limits<__T1>::is_specialized &&
-        std::numeric_limits<__T2>::is_specialized &&
-        std::numeric_limits<__T3>::is_specialized,
+__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<
+    __hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
+        __hip::is_arithmetic<__T3>::value,
    typename __hip::__promote<__T1, __T2, __T3>::type>::type
 fma(__T1 __x, __T2 __y, __T3 __z) {
  typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
@ -392,10 +558,10 @@ fma(__T1 __x, __T2 __y, __T3 __z) {
 }
 #else
 template <typename __T1, typename __T2, typename __T3>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
-                                 std::numeric_limits<__T2>::is_specialized &&
-                                 std::numeric_limits<__T3>::is_specialized,
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
+                                 __hip::is_arithmetic<__T2>::value &&
+                                 __hip::is_arithmetic<__T3>::value,
                             double>::type
    fma(__T1 __x, __T2 __y, __T3 __z) {
  return ::fma((double)__x, (double)__y, (double)__z);
@ -403,31 +569,31 @@ __DEVICE__
 #endif

 template <typename __T>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    frexp(__T __x, int *__exp) {
  return ::frexp((double)__x, __exp);
 }

 template <typename __T>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    ldexp(__T __x, int __exp) {
  return ::ldexp((double)__x, __exp);
 }

 template <typename __T>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    modf(__T __x, double *__exp) {
  return ::modf((double)__x, __exp);
 }

 #if __cplusplus >= 201103L
 template <typename __T1, typename __T2>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
-                                 std::numeric_limits<__T2>::is_specialized,
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
+                                 __hip::is_arithmetic<__T2>::value,
                             typename __hip::__promote<__T1, __T2>::type>::type
    remquo(__T1 __x, __T2 __y, int *__quo) {
  typedef typename __hip::__promote<__T1, __T2>::type __result_type;
@ -435,9 +601,9 @@ __DEVICE__
 }
 #else
 template <typename __T1, typename __T2>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
-                                 std::numeric_limits<__T2>::is_specialized,
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
+                                 __hip::is_arithmetic<__T2>::value,
                             double>::type
    remquo(__T1 __x, __T2 __y, int *__quo) {
  return ::remquo((double)__x, (double)__y, __quo);
@ -445,15 +611,15 @@ __DEVICE__
 #endif

 template <typename __T>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    scalbln(__T __x, long int __exp) {
  return ::scalbln((double)__x, __exp);
 }

 template <typename __T>
-__DEVICE__
-    typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
+__DEVICE__ __CONSTEXPR__
+    typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
    scalbn(__T __x, int __exp) {
  return ::scalbn((double)__x, __exp);
 }
@ -465,17 +631,20 @@ __DEVICE__

 // END DEF_FUN and HIP_OVERLOAD

+#endif // ifndef __OPENMP_AMDGCN__
 #endif // defined(__cplusplus)

+#ifndef __OPENMP_AMDGCN__
 // Define these overloads inside the namespace our standard library uses.
+#if !defined(__HIPCC_RTC__)
 #ifdef _LIBCPP_BEGIN_NAMESPACE_STD
 _LIBCPP_BEGIN_NAMESPACE_STD
 #else
 namespace std {
 #ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
-#endif
-#endif
+#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
+#endif // _LIBCPP_BEGIN_NAMESPACE_STD

 // Pull the new overloads we defined above into namespace std.
 // using ::abs; - This may be considered for C++.
@ -620,11 +789,13 @@ _LIBCPP_END_NAMESPACE_STD
 #else
 #ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _GLIBCXX_END_NAMESPACE_VERSION
-#endif
+#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
 } // namespace std
-#endif
+#endif // _LIBCPP_END_NAMESPACE_STD
+#endif // !defined(__HIPCC_RTC__)

 // Define device-side math functions from <ymath.h> on MSVC.
+#if !defined(__HIPCC_RTC__)
 #if defined(_MSC_VER)

 // Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
@ -636,29 +807,36 @@ _GLIBCXX_END_NAMESPACE_VERSION
 #if defined(__cplusplus)
 extern "C" {
 #endif // defined(__cplusplus)
-__DEVICE__ __attribute__((overloadable)) double _Cosh(double x, double y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Cosh(double x,
+                                                                    double y) {
  return cosh(x) * y;
 }
-__DEVICE__ __attribute__((overloadable)) float _FCosh(float x, float y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FCosh(float x,
+                                                                    float y) {
  return coshf(x) * y;
 }
-__DEVICE__ __attribute__((overloadable)) short _Dtest(double *p) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _Dtest(double *p) {
  return fpclassify(*p);
 }
-__DEVICE__ __attribute__((overloadable)) short _FDtest(float *p) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _FDtest(float *p) {
  return fpclassify(*p);
 }
-__DEVICE__ __attribute__((overloadable)) double _Sinh(double x, double y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Sinh(double x,
+                                                                    double y) {
  return sinh(x) * y;
 }
-__DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
+__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FSinh(float x,
+                                                                    float y) {
  return sinhf(x) * y;
 }
 #if defined(__cplusplus)
 }
 #endif // defined(__cplusplus)
 #endif // defined(_MSC_VER)
+#endif // !defined(__HIPCC_RTC__)
+#endif // ifndef __OPENMP_AMDGCN__

 #pragma pop_macro("__DEVICE__")
+#pragma pop_macro("__CONSTEXPR__")

 #endif // __CLANG_HIP_CMATH_H__
--- a/lib/include/__clang_hip_libdevice_declares.h
+++ b/lib/include/__clang_hip_libdevice_declares.h
@ -138,14 +138,22 @@ __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
 __device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
 __device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);

-__device__ __attribute__((const)) float
-__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
-__device__ __attribute__((const)) float
-__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
-__device__ __attribute__((const)) float
-__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
-__device__ __attribute__((const)) float
-__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
+__device__ inline __attribute__((const)) float
+__llvm_amdgcn_cos_f32(float __x) {
+  return __builtin_amdgcn_cosf(__x);
+}
+__device__ inline __attribute__((const)) float
+__llvm_amdgcn_rcp_f32(float __x) {
+  return __builtin_amdgcn_rcpf(__x);
+}
+__device__ inline __attribute__((const)) float
+__llvm_amdgcn_rsq_f32(float __x) {
+  return __builtin_amdgcn_rsqf(__x);
+}
+__device__ inline __attribute__((const)) float
+__llvm_amdgcn_sin_f32(float __x) {
+  return __builtin_amdgcn_sinf(__x);
+}
 // END INTRINSICS
 // END FLOAT

@ -269,10 +277,14 @@ __device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
 __device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
                                                            double);

-__device__ __attribute__((const)) double
-__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
-__device__ __attribute__((const)) double
-__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
+__device__ inline __attribute__((const)) double
+__llvm_amdgcn_rcp_f64(double __x) {
+  return __builtin_amdgcn_rcp(__x);
+}
+__device__ inline __attribute__((const)) double
+__llvm_amdgcn_rsq_f64(double __x) {
+  return __builtin_amdgcn_rsq(__x);
+}

 __device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
 __device__ _Float16 __ocml_cos_f16(_Float16);
--- a/lib/include/__clang_hip_math.h
+++ b/lib/include/__clang_hip_math.h
@ -9,26 +9,40 @@
 #ifndef __CLANG_HIP_MATH_H__
 #define __CLANG_HIP_MATH_H__

-#if !defined(__HIP__)
+#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
 #error "This file is for HIP and OpenMP AMDGCN device compilation only."
 #endif

+#if !defined(__HIPCC_RTC__)
 #if defined(__cplusplus)
 #include <algorithm>
 #endif
 #include <limits.h>
 #include <stdint.h>
+#ifdef __OPENMP_AMDGCN__
+#include <omp.h>
+#endif
+#endif // !defined(__HIPCC_RTC__)

 #pragma push_macro("__DEVICE__")
+
+#ifdef __OPENMP_AMDGCN__
+#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
+#else
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#endif

 // A few functions return bool type starting only in C++11.
 #pragma push_macro("__RETURN_TYPE")
+#ifdef __OPENMP_AMDGCN__
+#define __RETURN_TYPE int
+#else
 #if defined(__cplusplus)
 #define __RETURN_TYPE bool
 #else
 #define __RETURN_TYPE int
 #endif
+#endif // __OPENMP_AMDGCN__

 #if defined (__cplusplus) && __cplusplus < 201103L
 // emulate static_assert on type sizes
@ -36,7 +50,7 @@ template<bool>
 struct __compare_result{};
 template<>
 struct __compare_result<true> {
-  static const bool valid;
+  static const __device__ bool valid;
 };

 __DEVICE__
@ -247,6 +261,9 @@ float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); }
 __DEVICE__
 float frexpf(float __x, int *__nptr) {
  int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  float __r =
      __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp);
  *__nptr = __tmp;
@ -332,6 +349,9 @@ long int lroundf(float __x) { return __ocml_round_f32(__x); }
 __DEVICE__
 float modff(float __x, float *__iptr) {
  float __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  float __r =
      __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
  *__iptr = __tmp;
@ -412,6 +432,9 @@ float remainderf(float __x, float __y) {
 __DEVICE__
 float remquof(float __x, float __y, int *__quo) {
  int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  float __r = __ocml_remquo_f32(
      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
  *__quo = __tmp;
@ -468,6 +491,9 @@ __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); }
 __DEVICE__
 void sincosf(float __x, float *__sinptr, float *__cosptr) {
  float __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  *__sinptr =
      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
  *__cosptr = __tmp;
@ -476,6 +502,9 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) {
 __DEVICE__
 void sincospif(float __x, float *__sinptr, float *__cosptr) {
  float __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  *__sinptr = __ocml_sincospi_f32(
      __x, (__attribute__((address_space(5))) float *)&__tmp);
  *__cosptr = __tmp;
@ -788,6 +817,9 @@ double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); }
 __DEVICE__
 double frexp(double __x, int *__nptr) {
  int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  double __r =
      __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp);
  *__nptr = __tmp;
@ -872,6 +904,9 @@ long int lround(double __x) { return __ocml_round_f64(__x); }
 __DEVICE__
 double modf(double __x, double *__iptr) {
  double __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  double __r =
      __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp);
  *__iptr = __tmp;
@ -960,6 +995,9 @@ double remainder(double __x, double __y) {
 __DEVICE__
 double remquo(double __x, double __y, int *__quo) {
  int __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  double __r = __ocml_remquo_f64(
      __x, __y, (__attribute__((address_space(5))) int *)&__tmp);
  *__quo = __tmp;
@ -1018,6 +1056,9 @@ double sin(double __x) { return __ocml_sin_f64(__x); }
 __DEVICE__
 void sincos(double __x, double *__sinptr, double *__cosptr) {
  double __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  *__sinptr = __ocml_sincos_f64(
      __x, (__attribute__((address_space(5))) double *)&__tmp);
  *__cosptr = __tmp;
@ -1026,6 +1067,9 @@ void sincos(double __x, double *__sinptr, double *__cosptr) {
 __DEVICE__
 void sincospi(double __x, double *__sinptr, double *__cosptr) {
  double __tmp;
+#ifdef __OPENMP_AMDGCN__
+#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
+#endif
  *__sinptr = __ocml_sincospi_f64(
      __x, (__attribute__((address_space(5))) double *)&__tmp);
  *__cosptr = __tmp;
@ -1260,6 +1304,7 @@ float min(float __x, float __y) { return fminf(__x, __y); }
 __DEVICE__
 double min(double __x, double __y) { return fmin(__x, __y); }

+#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 __host__ inline static int min(int __arg1, int __arg2) {
  return std::min(__arg1, __arg2);
 }
@ -1267,6 +1312,7 @@ __host__ inline static int min(int __arg1, int __arg2) {
 __host__ inline static int max(int __arg1, int __arg2) {
  return std::max(__arg1, __arg2);
 }
+#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 #endif

 #pragma pop_macro("__DEVICE__")
--- a/lib/include/__clang_hip_runtime_wrapper.h
+++ b/lib/include/__clang_hip_runtime_wrapper.h
@ -18,52 +18,107 @@

 #if __HIP__

-#include <cmath>
-#include <cstdlib>
-#include <stdlib.h>
-
 #define __host__ __attribute__((host))
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
 #define __shared__ __attribute__((shared))
 #define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))

 #if !defined(__cplusplus) || __cplusplus < 201103L
 // Fallback for C and pre-C++11: map nullptr onto NULL. The macro must not end
 // in a semicolon, otherwise it cannot be used inside an expression such as
 // `p == nullptr` or `f(nullptr)`.
  #define nullptr NULL
 #endif

+#ifdef __cplusplus
+extern "C" {
+  __attribute__((__visibility__("default")))
+  __attribute__((weak))
+  __attribute__((noreturn))
+  __device__ void __cxa_pure_virtual(void) {
+    __builtin_trap();
+  }
+  __attribute__((__visibility__("default")))
+  __attribute__((weak))
+  __attribute__((noreturn))
+  __device__ void __cxa_deleted_virtual(void) {
+    __builtin_trap();
+  }
+}
+#endif //__cplusplus
+
+#if !defined(__HIPCC_RTC__)
+#include <cmath>
+#include <cstdlib>
+#include <stdlib.h>
+#else
+typedef __SIZE_TYPE__ size_t;
+// Define macros which are needed to declare HIP device API's without standard
+// C/C++ headers. This is for readability so that these API's can be written
+// the same way as non-hipRTC use case. These macros need to be popped so that
+// they do not pollute users' name space.
+#pragma push_macro("NULL")
+#pragma push_macro("uint32_t")
+#pragma push_macro("uint64_t")
+#pragma push_macro("CHAR_BIT")
+#pragma push_macro("INT_MAX")
+#define NULL (void *)0
+#define uint32_t __UINT32_TYPE__
+#define uint64_t __UINT64_TYPE__
+#define CHAR_BIT __CHAR_BIT__
+// INT_MAX is the largest value of `int`, which the compiler predefines as
+// __INT_MAX__. __INTMAX_MAX__ is the maximum of the wider intmax_t and would
+// give INT_MAX both the wrong value and the wrong type.
+#define INT_MAX __INT_MAX__
+#endif // __HIPCC_RTC__
+
+typedef __SIZE_TYPE__ __hip_size_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif //__cplusplus
+
 #if __HIP_ENABLE_DEVICE_MALLOC__
-extern "C" __device__ void *__hip_malloc(size_t __size);
-extern "C" __device__ void *__hip_free(void *__ptr);
-static inline __device__ void *malloc(size_t __size) {
+__device__ void *__hip_malloc(__hip_size_t __size);
+__device__ void *__hip_free(void *__ptr);
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
  return __hip_malloc(__size);
 }
-static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
-#else
-static inline __device__ void *malloc(size_t __size) {
-  __builtin_trap();
-  return nullptr;
+__attribute__((weak)) inline __device__ void *free(void *__ptr) {
+  return __hip_free(__ptr);
 }
-static inline __device__ void *free(void *__ptr) {
+#else
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
  __builtin_trap();
-  return nullptr;
+  return (void *)0;
+}
+__attribute__((weak)) inline __device__ void *free(void *__ptr) {
+  __builtin_trap();
+  return (void *)0;
 }
 #endif

+#ifdef __cplusplus
+} // extern "C"
+#endif //__cplusplus
+
 #include <__clang_hip_libdevice_declares.h>
 #include <__clang_hip_math.h>

-#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#if defined(__HIPCC_RTC__)
+#include <__clang_hip_cmath.h>
+#else
 #include <__clang_cuda_math_forward_declares.h>
 #include <__clang_hip_cmath.h>
 #include <__clang_cuda_complex_builtins.h>
-
 #include <algorithm>
 #include <complex>
 #include <new>
-#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
+#endif // __HIPCC_RTC__

 #define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
-
+#if defined(__HIPCC_RTC__)
+#pragma pop_macro("NULL")
+#pragma pop_macro("uint32_t")
+#pragma pop_macro("uint64_t")
+#pragma pop_macro("CHAR_BIT")
+#pragma pop_macro("INT_MAX")
+#endif // __HIPCC_RTC__
 #endif // __HIP__
 #endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
--- a/lib/include/altivec.h
+++ b/lib/include/altivec.h
--- a/lib/include/amxintrin.h
+++ b/lib/include/amxintrin.h
@ -15,8 +15,13 @@
 #define __AMXINTRIN_H
 #ifdef __x86_64__

+/* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS_TILE                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
+#define __DEFAULT_FN_ATTRS_INT8                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
+#define __DEFAULT_FN_ATTRS_BF16                                                \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))

 /// Load tile configuration from a 64-byte memory location specified by
 /// "mem_addr". The tile configuration includes the tile type palette, the
@ -25,7 +30,7 @@
 /// config and the tile data, and the tiles are zeroed. Any invalid
 /// configurations will result in #GP fault.
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
 ///
@ -41,7 +46,7 @@ _tile_loadconfig(const void *__config) {
 /// palette, the number of bytes per row, and the number of rows. If tiles
 /// are not configured, all zeroes will be stored to memory.
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
 ///
@ -55,7 +60,7 @@ _tile_storeconfig(void *__config) {
 /// Release the tile configuration to return to the init state, which
 /// releases all storage it currently holds.
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
 static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
@ -66,7 +71,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// destination tile "dst" using the tile configuration previously configured
 /// via "_tile_loadconfig".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
 ///
@ -86,7 +91,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// that the data will likely not be reused in the near future and the data
 /// caching can be optimized accordingly.
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
 ///
@ -104,7 +109,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// "stride" using the tile configuration previously configured via
 /// "_tile_loadconfig".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 ///
@ -119,7 +124,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {

 /// Zero the tile specified by "tdest".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
 ///
@ -133,7 +138,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 /// and store the 32-bit result back to tile "dst".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
 ///
@ -152,7 +157,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
 /// in "dst", and store the 32-bit result back to tile "dst".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
 ///
@ -171,7 +176,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
 /// and store the 32-bit result back to tile "dst".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
 ///
@ -190,7 +195,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
 /// "dst", and store the 32-bit result back to tile "dst".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
 ///
@ -208,7 +213,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 /// elements with elements in "dst", and store the 32-bit result back to tile
 /// "dst".
 ///
-/// \headerfile <x86intrin.h>
+/// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
 ///
@ -221,10 +226,12 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 #define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))

-#define __DEFAULT_FN_ATTRS_INT8                                                \
-  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
-
+/// The AMX tile register size can be configured; the maximum size is
+/// 16x64=1024 bytes. Since there is no 2D type in LLVM IR, we use a vector type
+/// to represent a 2D tile, and its fixed size is the maximum AMX tile register
+/// size.
 typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly.
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                     __SIZE_TYPE__ stride) {
@ -232,12 +239,43 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
                                             (__SIZE_TYPE__)(stride));
 }

+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly; __tile_stream_loadd is the user-facing wrapper and passes the
+/// destination tile's "row" and "col" fields as "m" and "n".
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
+                       __SIZE_TYPE__ stride) {
+  return __builtin_ia32_tileloaddt164_internal(m, n, base,
+                                               (__SIZE_TYPE__)(stride));
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly; __tile_dpbssd is the user-facing wrapper and passes src0.row,
+/// src1.col and src0.col as "m", "n" and "k".
 static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
 }

+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly; __tile_dpbsud is the user-facing wrapper and passes src0.row,
+/// src1.col and src0.col as "m", "n" and "k".
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly; __tile_dpbusd is the user-facing wrapper and passes src0.row,
+/// src1.col and src0.col as "m", "n" and "k".
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly; __tile_dpbuud is the user-facing wrapper and passes src0.row,
+/// src1.col and src0.col as "m", "n" and "k".
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
+                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly.
 static __inline__ void __DEFAULT_FN_ATTRS_INT8
 _tile_stored_internal(unsigned short m, unsigned short n, void *base,
                      __SIZE_TYPE__ stride, _tile1024i tile) {
@ -245,34 +283,211 @@ _tile_stored_internal(unsigned short m, unsigned short n, void *base,
                                              (__SIZE_TYPE__)(stride), tile);
 }

+/// This is an internal intrinsic. C/C++ users should avoid calling it
+/// directly; __tile_dpbf16ps is the user-facing wrapper and passes src0.row,
+/// src1.col and src0.col as "m", "n" and "k".
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// This struct packs the shape and tile data together for the user. We
+/// suggest initializing the struct as early as possible, because the compiler
+/// depends on the shape information to do the tile configuration. A constant
+/// value is preferred for the shape, as it helps the compiler optimize.
 typedef struct __tile1024i_str {
   const unsigned short row;
   const unsigned short col;
   _tile1024i tile;
 } __tile1024i;

+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
 __DEFAULT_FN_ATTRS_TILE
 static void __tile_loadd(__tile1024i *dst, const void *base,
                          __SIZE_TYPE__ stride) {
   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
 }

-__DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
-                          __tile1024i src2) {
-  dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
-                                    src1.tile, src2.tile);
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst". This intrinsic provides a hint to the implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+///    A destination tile. Max size is 1024 Bytes.
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TILE
+static void __tile_stream_loadd(__tile1024i *dst, const void *base,
+                                __SIZE_TYPE__ stride) {
+  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
 }

+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
+                          __tile1024i src1) {
+  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
+                                    src0.tile, src1.tile);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
+                          __tile1024i src1) {
+  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
+                                    src0.tile, src1.tile);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
+                          __tile1024i src1) {
+  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
+                                    src0.tile, src1.tile);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
+/// "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
+                          __tile1024i src1) {
+  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
+                                    src0.tile, src1.tile);
+}
+
+/// Store the tile specified by "src" to memory specified by "base" address and
+/// "stride".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
+///
+/// \param base
+///    A pointer to base address.
+/// \param stride
+///    The stride between the rows' data to be stored in memory.
+/// \param src
+///    The source tile to be stored. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_TILE
 static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 }

+/// Zero the tile specified by "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
+///
+/// \param dst
+///    The destination tile to be zeroed. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_TILE
 static void __tile_zero(__tile1024i *dst) {
   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
 }

+/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_BF16
+static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
+                            __tile1024i src1) {
+  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+                                      src0.tile, src1.tile);
+}
+
+#undef __DEFAULT_FN_ATTRS_TILE
+#undef __DEFAULT_FN_ATTRS_INT8
+#undef __DEFAULT_FN_ATTRS_BF16
+
 #endif /* __x86_64__ */
 #endif /* __AMXINTRIN_H */
--- a/lib/include/arm_acle.h
+++ b/lib/include/arm_acle.h
@ -639,6 +639,49 @@ __jcvt(double __a) {
 }
 #endif

+/* Armv8.5-A FP rounding intrinsics */
+#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+__frint32zf(float __a) {
+  return __builtin_arm_frint32zf(__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+__frint32z(double __a) {
+  return __builtin_arm_frint32z(__a);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+__frint64zf(float __a) {
+  return __builtin_arm_frint64zf(__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+__frint64z(double __a) {
+  return __builtin_arm_frint64z(__a);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+__frint32xf(float __a) {
+  return __builtin_arm_frint32xf(__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+__frint32x(double __a) {
+  return __builtin_arm_frint32x(__a);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+__frint64xf(float __a) {
+  return __builtin_arm_frint64xf(__a);
+}
+
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+__frint64x(double __a) {
+  return __builtin_arm_frint64x(__a);
+}
+#endif
+
 /* Armv8.7-A load/store 64-byte intrinsics */
 #if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
 typedef struct {
@ -709,6 +752,18 @@ __arm_st64bv0(void *__addr, data512_t __value) {

 #endif /* __ARM_FEATURE_TME */

+/* Armv8.5-A Random number generation intrinsics */
+#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+__rndr(uint64_t *__p) {
+  return __builtin_arm_rndr(__p);
+}
+static __inline__ int __attribute__((__always_inline__, __nodebug__))
+__rndrrs(uint64_t *__p) {
+  return __builtin_arm_rndrrs(__p);
+}
+#endif
+
 #if defined(__cplusplus)
 }
 #endif
--- a/lib/include/arm_neon.h
+++ b/lib/include/arm_neon.h
--- a/lib/include/arm_sve.h
+++ b/lib/include/arm_sve.h
--- a/lib/include/avx512fintrin.h
+++ b/lib/include/avx512fintrin.h
@ -9300,8 +9300,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
 * computations. In vector-reduction arithmetic, the evaluation order is
 * independent of the order of the input elements of V.

- * For floating point types, we always assume the elements are reassociable even
- * if -fast-math is off.
+ * For floating-point intrinsics:
+ * 1. When using fadd/fmul intrinsics, the order of operations within the
+ * vector is unspecified (associative math).
+ * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
+ * produce unspecified results.

 * Used bisection method. At each step, we partition the vector with previous
 * step in half, and the operation is performed on its two halves.
@ -9524,75 +9527,49 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
  return __builtin_ia32_reduce_umin_d512((__v16si)__V);
 }

-#define _mm512_mask_reduce_operator(op) \
-  __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
-  __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
-  __m256d __t3 = _mm256_##op(__t1, __t2); \
-  __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
-  __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
-  __m128d __t6 = _mm_##op(__t4, __t5); \
-  __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
-  __m128d __t8 = _mm_##op(__t6, __t7); \
-  return __t8[0]
-
 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_pd(__m512d __V) {
-  _mm512_mask_reduce_operator(max_pd);
+  return __builtin_ia32_reduce_fmax_pd512(__V);
 }

 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_pd(__m512d __V) {
-  _mm512_mask_reduce_operator(min_pd);
+  return __builtin_ia32_reduce_fmin_pd512(__V);
 }

 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
-  _mm512_mask_reduce_operator(max_pd);
+  return __builtin_ia32_reduce_fmax_pd512(__V);
 }

 static __inline__ double __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
-  _mm512_mask_reduce_operator(min_pd);
+  return __builtin_ia32_reduce_fmin_pd512(__V);
 }
-#undef _mm512_mask_reduce_operator
-
-#define _mm512_mask_reduce_operator(op) \
-  __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
-  __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
-  __m256 __t3 = _mm256_##op(__t1, __t2); \
-  __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
-  __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
-  __m128 __t6 = _mm_##op(__t4, __t5); \
-  __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
-  __m128 __t8 = _mm_##op(__t6, __t7); \
-  __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
-  __m128 __t10 = _mm_##op(__t8, __t9); \
-  return __t10[0]

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_max_ps(__m512 __V) {
-  _mm512_mask_reduce_operator(max_ps);
+  return __builtin_ia32_reduce_fmax_ps512(__V);
 }

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_reduce_min_ps(__m512 __V) {
-  _mm512_mask_reduce_operator(min_ps);
+  return __builtin_ia32_reduce_fmin_ps512(__V);
 }

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
-  _mm512_mask_reduce_operator(max_ps);
+  return __builtin_ia32_reduce_fmax_ps512(__V);
 }

 static __inline__ float __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
-  _mm512_mask_reduce_operator(min_ps);
+  return __builtin_ia32_reduce_fmin_ps512(__V);
 }
-#undef _mm512_mask_reduce_operator

 /// Moves the least significant 32 bits of a vector of [16 x i32] to a
 ///    32-bit signed integer value.
@ -9611,6 +9588,169 @@ _mm512_cvtsi512_si32(__m512i __A) {
  return __b[0];
 }

+/// Loads 8 double-precision (64-bit) floating-point elements from memory
+/// starting at location \a base_addr at packed 32-bit integer indices stored in
+/// the lower half of \a vindex scaled by \a scale and stores them in dst.
+///
+/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///   dst[i+63:i] := MEM[addr+63:addr]
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_i32logather_pd(vindex, base_addr, scale)                        \
+  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory
+/// starting at location \a base_addr at packed 32-bit integer indices stored in
+/// the lower half of \a vindex scaled by \a scale into dst using writemask
+/// \a mask (elements are copied from \a src when the corresponding mask bit is
+/// not set).
+///
+/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   IF mask[j]
+///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///     dst[i+63:i] := MEM[addr+63:addr]
+///   ELSE
+///     dst[i+63:i] := src[i+63:i]
+///   FI
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale)        \
+  _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex),      \
+                           (base_addr), (scale))
+
+/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
+/// at packed 32-bit integer indices stored in the lower half of \a vindex
+/// scaled by \a scale and stores them in dst.
+///
+/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///   dst[i+63:i] := MEM[addr+63:addr]
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_i32logather_epi64(vindex, base_addr, scale)                     \
+  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
+
+/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
+/// at packed 32-bit integer indices stored in the lower half of \a vindex
+/// scaled by \a scale and stores them in dst using writemask \a mask (elements
+/// are copied from \a src when the corresponding mask bit is not set).
+///
+/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   IF mask[j]
+///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///     dst[i+63:i] := MEM[addr+63:addr]
+///   ELSE
+///     dst[i+63:i] := src[i+63:i]
+///   FI
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale)     \
+  _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex),   \
+                              (base_addr), (scale))
+
+/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
+/// to memory locations starting at location \a base_addr at packed 32-bit
+/// integer indices stored in \a vindex scaled by \a scale.
+///
+/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///   MEM[addr+63:addr] := v1[i+63:i]
+/// ENDFOR
+/// \endoperation
+#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale)                   \
+  _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
+
+/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
+/// to memory locations starting at location \a base_addr at packed 32-bit
+/// integer indices stored in \a vindex scaled by \a scale. Only those elements
+/// whose corresponding mask bit is set in writemask \a mask are written to
+/// memory.
+///
+/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   IF mask[j]
+///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///     MEM[addr+63:addr] := v1[i+63:i]
+///   FI
+/// ENDFOR
+/// \endoperation
+#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale)        \
+  _mm512_mask_i32scatter_pd((base_addr), (mask),                               \
+                            _mm512_castsi512_si256(vindex), (v1), (scale))
+
+/// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in
+/// memory locations starting at location \a base_addr at packed 32-bit integer
+/// indices stored in \a vindex scaled by \a scale.
+///
+/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///   MEM[addr+63:addr] := v1[i+63:i]
+/// ENDFOR
+/// \endoperation
+#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale)                \
+  _mm512_i32scatter_epi64((base_addr),                                         \
+                          _mm512_castsi512_si256(vindex), (v1), (scale))
+
+/// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in
+/// memory locations starting at location \a base_addr at packed 32-bit integer
+/// indices stored in \a vindex scaled by \a scale using writemask \a mask
+/// (elements whose corresponding mask bit is not set are not written to memory).
+///
+/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+///   i := j*64
+///   m := j*32
+///   IF mask[j]
+///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+///     MEM[addr+63:addr] := v1[i+63:i]
+///   FI
+/// ENDFOR
+/// \endoperation
+#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale)     \
+  _mm512_mask_i32scatter_epi64((base_addr), (mask),                            \
+                               _mm512_castsi512_si256(vindex), (v1), (scale))
+
 #undef __DEFAULT_FN_ATTRS512
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS
--- a/lib/include/builtins.h
+++ b/lib/include/builtins.h
@ -0,0 +1,16 @@
+/*===---- builtins.h - Standard header for extra builtins -----------------===*\
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+\*===----------------------------------------------------------------------===*/
+
+/// Some legacy compilers have builtin definitions in a file named builtins.h.
+/// This header file has been added to allow compatibility with code that was
+/// written for those compilers. Code may have an include line for this file
+/// and to avoid an error an empty file with this name is provided.
+#ifndef __BUILTINS_H
+#define __BUILTINS_H
+
+#endif /* __BUILTINS_H */
--- a/lib/include/cuda_wrappers/complex
+++ b/lib/include/cuda_wrappers/complex
@ -72,8 +72,16 @@
 #define _GLIBCXX_USE_C99_COMPLEX 0
 #define _GLIBCXX_USE_C99_COMPLEX_TR1 0

+// Work around a compatibility issue with libstdc++ 11.1.0
+// https://bugs.llvm.org/show_bug.cgi?id=50383
+#pragma push_macro("__failed_assertion")
+#if _GLIBCXX_RELEASE == 11
+#define __failed_assertion __cuda_failed_assertion
+#endif
+
 #include_next <complex>

+#pragma pop_macro("__failed_assertion")
 #pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
 #pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")

--- a/lib/include/hexagon_circ_brev_intrinsics.h
+++ b/lib/include/hexagon_circ_brev_intrinsics.h
@ -0,0 +1,298 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_
+#define _HEXAGON_CIRC_BREV_INTRINSICS_H_ 1
+
+#include <hexagon_protos.h>
+#include <stdint.h>
+
+/* Circular Load */
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_load_update_D(Word64 dst, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_load_update_D(dest,ptr,incr,bufsize,K)  \
+    { ptr = (int64_t *) HEXAGON_circ_ldd (ptr, &(dest), ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_load_update_W(Word32 dst, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_load_update_W(dest,ptr,incr,bufsize,K)  \
+    { ptr = (int *) HEXAGON_circ_ldw (ptr, &(dest), (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_load_update_H(Word16 dst, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_load_update_H(dest,ptr,incr,bufsize,K)  \
+    { ptr = (int16_t *) HEXAGON_circ_ldh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_load_update_UH( UWord16 dst,  UWord16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_load_update_UH(dest,ptr,incr,bufsize,K) \
+    { ptr = (uint16_t *) HEXAGON_circ_lduh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_load_update_B(Word8 dst, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_load_update_B(dest,ptr,incr,bufsize,K)  \
+    { ptr = (int8_t *) HEXAGON_circ_ldb (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void  Q6_circ_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_load_update_UB(dest,ptr,incr,bufsize,K) \
+    { ptr = (uint8_t *) HEXAGON_circ_ldub (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
+
+/* Circular Store */
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_store_update_D(Word64 *src, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_store_update_D(src,ptr,incr,bufsize,K)  \
+    { ptr = (int64_t *) HEXAGON_circ_std (ptr, src, ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_store_update_W(Word32 *src, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_store_update_W(src,ptr,incr,bufsize,K)  \
+    { ptr = (int *) HEXAGON_circ_stw (ptr, src, (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_store_update_HL(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_store_update_HL(src,ptr,incr,bufsize,K) \
+    { ptr = (int16_t *) HEXAGON_circ_sth (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_store_update_HH(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_store_update_HH(src,ptr,incr,bufsize,K) \
+    { ptr = (int16_t *) HEXAGON_circ_sthhi (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_circ_store_update_B(Word8 *src, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord64 K)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_circ_store_update_B(src,ptr,incr,bufsize,K)  \
+    { ptr = (int8_t *) HEXAGON_circ_stb (ptr, src, ((((K)-2)<<24)|(bufsize)), incr); }
+
+
+/* Bit Reverse Load */
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_load_update_D(Word64 dst, Word64 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_load_update_D(dest,ptr,log2bufsize) \
+    { ptr = (int64_t *) HEXAGON_brev_ldd (ptr, &(dest), (1<<(16-((log2bufsize) + 3)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_load_update_W(Word32 dst, Word32 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_load_update_W(dest,ptr,log2bufsize) \
+    { ptr = (int *) HEXAGON_brev_ldw (ptr, &(dest), (1<<(16-((log2bufsize) + 2)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_load_update_H(Word16 dst, Word16 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_load_update_H(dest,ptr,log2bufsize) \
+    { ptr = (int16_t *) HEXAGON_brev_ldh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_load_update_UH(UWord16 dst,  UWord16 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_load_update_UH(dest,ptr,log2bufsize) \
+    { ptr = (uint16_t *) HEXAGON_brev_lduh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_load_update_B(Word8 dst, Word8 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_load_update_B(dest,ptr,log2bufsize) \
+    { ptr = (int8_t *) HEXAGON_brev_ldb (ptr, &(dest), (1<<(16-((log2bufsize))))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_load_update_UB(dest,ptr,log2bufsize) \
+    { ptr = (uint8_t *) HEXAGON_brev_ldub (ptr, &(dest), (1<<(16-((log2bufsize))))); }
+
+/* Bit Reverse Store */
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_store_update_D(Word64 *src, Word64 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_store_update_D(src,ptr,log2bufsize)   \
+    { ptr = (int64_t *) HEXAGON_brev_std (ptr, src, (1<<(16-((log2bufsize) + 3)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_store_update_W(Word32 *src, Word32 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_store_update_W(src,ptr,log2bufsize)   \
+    { ptr = (int *) HEXAGON_brev_stw (ptr, src, (1<<(16-((log2bufsize) + 2)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_store_update_HL(Word16 *src, Word16 *ptr, Word32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_store_update_HL(src,ptr,log2bufsize)   \
+    { ptr = (int16_t *) HEXAGON_brev_sth (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_store_update_HH(Word16 *src, Word16 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_store_update_HH(src,ptr,log2bufsize)   \
+    { ptr = (int16_t *) HEXAGON_brev_sthhi (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
+
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: void Q6_bitrev_store_update_B(Word8 *src, Word8 *ptr, UWord32 Iu4)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#define Q6_bitrev_store_update_B(src,ptr,log2bufsize)   \
+    { ptr = (int8_t *) HEXAGON_brev_stb (ptr, src, (1<<(16-((log2bufsize))))); }
+
+
+#define HEXAGON_circ_ldd  __builtin_circ_ldd
+#define HEXAGON_circ_ldw  __builtin_circ_ldw
+#define HEXAGON_circ_ldh  __builtin_circ_ldh
+#define HEXAGON_circ_lduh __builtin_circ_lduh
+#define HEXAGON_circ_ldb  __builtin_circ_ldb
+#define HEXAGON_circ_ldub __builtin_circ_ldub
+
+
+#define HEXAGON_circ_std  __builtin_circ_std
+#define HEXAGON_circ_stw  __builtin_circ_stw
+#define HEXAGON_circ_sth  __builtin_circ_sth
+#define HEXAGON_circ_sthhi __builtin_circ_sthhi
+#define HEXAGON_circ_stb  __builtin_circ_stb
+
+
+#define HEXAGON_brev_ldd  __builtin_brev_ldd
+#define HEXAGON_brev_ldw  __builtin_brev_ldw
+#define HEXAGON_brev_ldh  __builtin_brev_ldh
+#define HEXAGON_brev_lduh __builtin_brev_lduh
+#define HEXAGON_brev_ldb  __builtin_brev_ldb
+#define HEXAGON_brev_ldub __builtin_brev_ldub
+
+#define HEXAGON_brev_std  __builtin_brev_std
+#define HEXAGON_brev_stw  __builtin_brev_stw
+#define HEXAGON_brev_sth  __builtin_brev_sth
+#define HEXAGON_brev_sthhi __builtin_brev_sthhi
+#define HEXAGON_brev_stb  __builtin_brev_stb
+
+#ifdef __HVX__
+/* ==========================================================================
+   Assembly Syntax:       if (Qt) vmem(Rt+#0) = Vs
+   C Intrinsic Prototype: void Q6_vmaskedstoreq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
+   Instruction Type:      COPROC_VMEM
+   Execution Slots:       SLOT0
+   ========================================================================== */
+
+#define Q6_vmaskedstoreq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstoreq)
+
+/* ==========================================================================
+   Assembly Syntax:       if (!Qt) vmem(Rt+#0) = Vs
+   C Intrinsic Prototype: void Q6_vmaskedstorenq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
+   Instruction Type:      COPROC_VMEM
+   Execution Slots:       SLOT0
+   ========================================================================== */
+
+#define Q6_vmaskedstorenq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorenq)
+
+/* ==========================================================================
+   Assembly Syntax:       if (Qt) vmem(Rt+#0):nt = Vs
+   C Intrinsic Prototype: void Q6_vmaskedstorentq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
+   Instruction Type:      COPROC_VMEM
+   Execution Slots:       SLOT0
+   ========================================================================== */
+
+#define Q6_vmaskedstorentq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentq)
+
+/* ==========================================================================
+   Assembly Syntax:       if (!Qt) vmem(Rt+#0):nt = Vs
+   C Intrinsic Prototype: void Q6_vmaskedstorentnq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
+   Instruction Type:      COPROC_VMEM
+   Execution Slots:       SLOT0
+   ========================================================================== */
+
+#define Q6_vmaskedstorentnq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentnq)
+
+#endif
+
+
+#endif  /* #ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_ */
+
+#ifdef __NOT_DEFINED__
+/*** comment block template  ***/
+/* ==========================================================================
+   Assembly Syntax:       Return=instruction()
+   C Intrinsic Prototype: ReturnType Intrinsic(ParamType Rs, ParamType Rt)
+   Instruction Type:      InstructionType
+   Execution Slots:       SLOT0123
+   ========================================================================== */
+#endif /***  __NOT_DEFINED__  ***/
--- a/lib/include/hexagon_protos.h
+++ b/lib/include/hexagon_protos.h
--- a/lib/include/hexagon_types.h
+++ b/lib/include/hexagon_types.h
--- a/lib/include/hvx_hexagon_protos.h
+++ b/lib/include/hvx_hexagon_protos.h
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@ -72,11 +72,6 @@
 #include <f16cintrin.h>
 #endif

-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__VPCLMULQDQ__)
-#include <vpclmulqdqintrin.h>
-#endif
-
 /* No feature check desired due to internal checks */
 #include <bmiintrin.h>

@ -230,6 +225,11 @@
 #include <pkuintrin.h>
 #endif

+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__VPCLMULQDQ__)
+#include <vpclmulqdqintrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__VAES__)
 #include <vaesintrin.h>
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@ -451,24 +451,47 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
 static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
                                                  unsigned char const *__src,
                                                  size_t __n) {
-  __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
-                       : : "memory");
+#if defined(__x86_64__)
+  __asm__ __volatile__("rep movsb"
+                       : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       :
+                       : "memory");
+#else
+  __asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
+                       : "+D"(__dst), "+r"(__src), "+c"(__n)
+                       :
+                       : "memory");
+#endif
 }
 static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
                                                  unsigned long const *__src,
                                                  size_t __n) {
+#if defined(__x86_64__)
  __asm__ __volatile__("rep movsl"
                       : "+D"(__dst), "+S"(__src), "+c"(__n)
                       :
                       : "memory");
+#else
+  __asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
+                       : "+D"(__dst), "+r"(__src), "+c"(__n)
+                       :
+                       : "memory");
+#endif
 }
 static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
                                                  unsigned short const *__src,
                                                  size_t __n) {
+#if defined(__x86_64__)
  __asm__ __volatile__("rep movsw"
                       : "+D"(__dst), "+S"(__src), "+c"(__n)
                       :
                       : "memory");
+#else
+  __asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
+                       : "+D"(__dst), "+r"(__src), "+c"(__n)
+                       :
+                       : "memory");
+#endif
 }
 static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
                                                  unsigned long __x,
@ -507,16 +530,26 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
 |* Misc
 \*----------------------------------------------------------------------------*/
 #if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx)             \
+  __asm("cpuid"                                                                \
+        : "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx)                   \
+        : "0"(__leaf), "2"(__count))
+#else
+/* x86-64 uses %rbx as the base register, so preserve it. */
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx)             \
+  __asm("xchgq %%rbx,%q1\n"                                                    \
+        "cpuid\n"                                                              \
+        "xchgq %%rbx,%q1"                                                      \
+        : "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx)                   \
+        : "0"(__leaf), "2"(__count))
+#endif
 static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
-  __asm__("cpuid"
-          : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
-          : "a"(__level), "c"(0));
+  __cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
 }
 static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
                                                    int __ecx) {
-  __asm__("cpuid"
-          : "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
-          : "a"(__level), "c"(__ecx));
+  __cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
 }
 static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
  __asm__ volatile("hlt");
@ -541,6 +574,9 @@ void _WriteStatusReg(int, __int64);
 unsigned short __cdecl _byteswap_ushort(unsigned short val);
 unsigned long __cdecl _byteswap_ulong (unsigned long val);
 unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val);
+
+__int64 __mulh(__int64 __a, __int64 __b);
+unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b);
 #endif

 /*----------------------------------------------------------------------------*\
--- a/lib/include/keylockerintrin.h
+++ b/lib/include/keylockerintrin.h
@ -230,10 +230,12 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   MEM[__odata+127:__odata] := 0
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF (Authentic == 0)
 ///     ZF := 1
+///     MEM[__odata+127:__odata] := 0
 ///   ELSE
 ///     MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
 ///     ZF := 0
@ -267,10 +269,12 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                  HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   MEM[__odata+127:__odata] := 0
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
 ///   IF (Authentic == 0)
 ///     ZF := 1
+///     MEM[__odata+127:__odata] := 0
 ///   ELSE
 ///     MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
 ///     ZF := 0
@ -304,10 +308,12 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                   HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   MEM[__odata+127:__odata] := 0
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF (Authentic == 0)
 ///     ZF := 1
+///     MEM[__odata+127:__odata] := 0
 ///   ELSE
 ///     MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
 ///     ZF := 0
@ -354,10 +360,16 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
 ///                    HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
@ -394,10 +406,16 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 ///                    HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
@ -434,10 +452,16 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 ///                    HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
@ -474,10 +498,16 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
 ///                   HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 )
 /// IF (IllegalHandle)
 ///   ZF := 1
+///   FOR i := 0 to 7
+///     __odata[i] := 0
+///   ENDFOR
 /// ELSE
 ///   (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
 ///   IF Authentic == 0
 ///     ZF := 1
+///     FOR i := 0 to 7
+///       __odata[i] := 0
+///     ENDFOR
 ///   ELSE
 ///     FOR i := 0 to 7
 ///       __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
--- a/lib/include/opencl-c-base.h
+++ b/lib/include/opencl-c-base.h
@ -21,9 +21,37 @@
 #define cl_khr_subgroup_shuffle 1
 #define cl_khr_subgroup_shuffle_relative 1
 #define cl_khr_subgroup_clustered_reduce 1
+#define cl_khr_extended_bit_ops 1
+#define cl_khr_integer_dot_product 1
+#define __opencl_c_integer_dot_product_input_4x8bit 1
+#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
+
 #endif // defined(__SPIR__)
 #endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)

+// Define feature macros for OpenCL C 2.0
+#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
+#define __opencl_c_pipes 1
+#define __opencl_c_generic_address_space 1
+#define __opencl_c_work_group_collective_functions 1
+#define __opencl_c_atomic_order_acq_rel 1
+#define __opencl_c_atomic_order_seq_cst 1
+#define __opencl_c_atomic_scope_device 1
+#define __opencl_c_atomic_scope_all_devices 1
+#define __opencl_c_device_enqueue 1
+#define __opencl_c_read_write_images 1
+#define __opencl_c_program_scope_global_variables 1
+#define __opencl_c_images 1
+#endif
+
+// Define header-only feature macros for OpenCL C 3.0.
+#if (__OPENCL_C_VERSION__ == 300)
+// For the SPIR target all features are supported.
+#if defined(__SPIR__)
+#define __opencl_c_atomic_scope_all_devices 1
+#endif // defined(__SPIR__)
+#endif // (__OPENCL_C_VERSION__ == 300)
+
 // built-in scalar data types:

 /**
@ -141,7 +169,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
 typedef double double16 __attribute__((ext_vector_type(16)));
 #endif

-#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
+#if defined(__OPENCL_CPP_VERSION__)
+#define NULL nullptr
+#elif defined(__OPENCL_C_VERSION__)
 #define NULL ((void*)0)
 #endif

@ -297,7 +327,12 @@ typedef enum memory_scope {
  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+#if defined(__opencl_c_atomic_scope_all_devices)
  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
+  memory_scope_all_devices = memory_scope_all_svm_devices,
+#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
+#endif // defined(__opencl_c_atomic_scope_all_devices)
 #if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
 #endif
@ -322,7 +357,9 @@ typedef enum memory_order
  memory_order_acquire = __ATOMIC_ACQUIRE,
  memory_order_release = __ATOMIC_RELEASE,
  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+#if defined(__opencl_c_atomic_order_seq_cst)
  memory_order_seq_cst = __ATOMIC_SEQ_CST
+#endif
 } memory_order;

 #endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@ -445,8 +482,113 @@ typedef struct {

 #endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)

+/**
+ * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
+ * Reinterprets a data type as another data type of the same size
+ */
+#define as_char(x) __builtin_astype((x), char)
+#define as_char2(x) __builtin_astype((x), char2)
+#define as_char3(x) __builtin_astype((x), char3)
+#define as_char4(x) __builtin_astype((x), char4)
+#define as_char8(x) __builtin_astype((x), char8)
+#define as_char16(x) __builtin_astype((x), char16)
+
+#define as_uchar(x) __builtin_astype((x), uchar)
+#define as_uchar2(x) __builtin_astype((x), uchar2)
+#define as_uchar3(x) __builtin_astype((x), uchar3)
+#define as_uchar4(x) __builtin_astype((x), uchar4)
+#define as_uchar8(x) __builtin_astype((x), uchar8)
+#define as_uchar16(x) __builtin_astype((x), uchar16)
+
+#define as_short(x) __builtin_astype((x), short)
+#define as_short2(x) __builtin_astype((x), short2)
+#define as_short3(x) __builtin_astype((x), short3)
+#define as_short4(x) __builtin_astype((x), short4)
+#define as_short8(x) __builtin_astype((x), short8)
+#define as_short16(x) __builtin_astype((x), short16)
+
+#define as_ushort(x) __builtin_astype((x), ushort)
+#define as_ushort2(x) __builtin_astype((x), ushort2)
+#define as_ushort3(x) __builtin_astype((x), ushort3)
+#define as_ushort4(x) __builtin_astype((x), ushort4)
+#define as_ushort8(x) __builtin_astype((x), ushort8)
+#define as_ushort16(x) __builtin_astype((x), ushort16)
+
+#define as_int(x) __builtin_astype((x), int)
+#define as_int2(x) __builtin_astype((x), int2)
+#define as_int3(x) __builtin_astype((x), int3)
+#define as_int4(x) __builtin_astype((x), int4)
+#define as_int8(x) __builtin_astype((x), int8)
+#define as_int16(x) __builtin_astype((x), int16)
+
+#define as_uint(x) __builtin_astype((x), uint)
+#define as_uint2(x) __builtin_astype((x), uint2)
+#define as_uint3(x) __builtin_astype((x), uint3)
+#define as_uint4(x) __builtin_astype((x), uint4)
+#define as_uint8(x) __builtin_astype((x), uint8)
+#define as_uint16(x) __builtin_astype((x), uint16)
+
+#define as_long(x) __builtin_astype((x), long)
+#define as_long2(x) __builtin_astype((x), long2)
+#define as_long3(x) __builtin_astype((x), long3)
+#define as_long4(x) __builtin_astype((x), long4)
+#define as_long8(x) __builtin_astype((x), long8)
+#define as_long16(x) __builtin_astype((x), long16)
+
+#define as_ulong(x) __builtin_astype((x), ulong)
+#define as_ulong2(x) __builtin_astype((x), ulong2)
+#define as_ulong3(x) __builtin_astype((x), ulong3)
+#define as_ulong4(x) __builtin_astype((x), ulong4)
+#define as_ulong8(x) __builtin_astype((x), ulong8)
+#define as_ulong16(x) __builtin_astype((x), ulong16)
+
+#define as_float(x) __builtin_astype((x), float)
+#define as_float2(x) __builtin_astype((x), float2)
+#define as_float3(x) __builtin_astype((x), float3)
+#define as_float4(x) __builtin_astype((x), float4)
+#define as_float8(x) __builtin_astype((x), float8)
+#define as_float16(x) __builtin_astype((x), float16)
+
+#ifdef cl_khr_fp64
+#define as_double(x) __builtin_astype((x), double)
+#define as_double2(x) __builtin_astype((x), double2)
+#define as_double3(x) __builtin_astype((x), double3)
+#define as_double4(x) __builtin_astype((x), double4)
+#define as_double8(x) __builtin_astype((x), double8)
+#define as_double16(x) __builtin_astype((x), double16)
+#endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+#define as_half(x) __builtin_astype((x), half)
+#define as_half2(x) __builtin_astype((x), half2)
+#define as_half3(x) __builtin_astype((x), half3)
+#define as_half4(x) __builtin_astype((x), half4)
+#define as_half8(x) __builtin_astype((x), half8)
+#define as_half16(x) __builtin_astype((x), half16)
+#endif // cl_khr_fp16
+
+#define as_size_t(x) __builtin_astype((x), size_t)
+#define as_ptrdiff_t(x) __builtin_astype((x), ptrdiff_t)
+#define as_intptr_t(x) __builtin_astype((x), intptr_t)
+#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
+
+// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
+
+#define __kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+#define kernel_exec(X, typen) __kernel \
+	__attribute__((work_group_size_hint(X, 1, 1))) \
+	__attribute__((vec_type_hint(typen)))
+
+#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
+// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
+
+int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
+#endif
+
 #ifdef cl_intel_device_side_avc_motion_estimation
-#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin

 #define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
 #define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
@ -580,7 +722,6 @@ typedef struct {
 #define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
 #define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0

-#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
 #endif // cl_intel_device_side_avc_motion_estimation

 // Disable any extensions we may have enabled previously.
--- a/lib/include/opencl-c.h
+++ b/lib/include/opencl-c.h
--- a/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/lib/include/openmp_wrappers/__clang_openmp_device_functions.h
@ -14,13 +14,13 @@
 #error "This file is for OpenMP compilation only."
 #endif

-#pragma omp begin declare variant match(                                       \
-    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
-
 #ifdef __cplusplus
 extern "C" {
 #endif

+#pragma omp begin declare variant match(                                       \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
 #define __CUDA__
 #define __OPENMP_NVPTX__

@ -33,10 +33,74 @@ extern "C" {
 #undef __OPENMP_NVPTX__
 #undef __CUDA__

+#pragma omp end declare variant
+
+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+// Import types which will be used by __clang_hip_libdevice_declares.h
+#ifndef __cplusplus
+#include <stdbool.h>
+#include <stdint.h>
+#endif
+
+#define __OPENMP_AMDGCN__
+#pragma push_macro("__device__")
+#define __device__
+
+/// Include declarations for libdevice functions.
+#include <__clang_hip_libdevice_declares.h>
+
+#pragma pop_macro("__device__")
+#undef __OPENMP_AMDGCN__
+
+#pragma omp end declare variant
+#endif
+
 #ifdef __cplusplus
 } // extern "C"
 #endif

-#pragma omp end declare variant
+// Ensure we make `_ZdlPv`, aka. `operator delete(void*)` available without the
+// need to `include <new>` in C++ mode.
+#ifdef __cplusplus
+
+// We require malloc/free.
+#include <cstdlib>
+
+#pragma push_macro("OPENMP_NOEXCEPT")
+#if __cplusplus >= 201103L
+#define OPENMP_NOEXCEPT noexcept
+#else
+#define OPENMP_NOEXCEPT
+#endif
+
+// Device overrides for non-placement new and delete.
+inline void *operator new(__SIZE_TYPE__ size) {
+  if (size == 0)
+    size = 1;
+  return ::malloc(size);
+}
+
+inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
+
+inline void operator delete(void *ptr)OPENMP_NOEXCEPT { ::free(ptr); }
+
+inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+
+// Sized delete, C++14 only.
+#if __cplusplus >= 201402L
+inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
+  ::operator delete(ptr);
+}
+#endif
+
+#pragma pop_macro("OPENMP_NOEXCEPT")
+#endif

 #endif
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@ -75,4 +75,58 @@ __DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }

 #pragma omp end declare variant

+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+#pragma push_macro("__constant__")
+#define __constant__ __attribute__((constant))
+#define __OPENMP_AMDGCN__
+
+#include <__clang_hip_cmath.h>
+
+#pragma pop_macro("__constant__")
+#undef __OPENMP_AMDGCN__
+
+// Define the overloads which are otherwise absent
+#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
+
+__DEVICE__ float acos(float __x) { return ::acosf(__x); }
+__DEVICE__ float acosh(float __x) { return ::acoshf(__x); }
+__DEVICE__ float asin(float __x) { return ::asinf(__x); }
+__DEVICE__ float asinh(float __x) { return ::asinhf(__x); }
+__DEVICE__ float atan(float __x) { return ::atanf(__x); }
+__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
+__DEVICE__ float atanh(float __x) { return ::atanhf(__x); }
+__DEVICE__ float cbrt(float __x) { return ::cbrtf(__x); }
+__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
+__DEVICE__ float erf(float __x) { return ::erff(__x); }
+__DEVICE__ float erfc(float __x) { return ::erfcf(__x); }
+__DEVICE__ float exp2(float __x) { return ::exp2f(__x); }
+__DEVICE__ float expm1(float __x) { return ::expm1f(__x); }
+__DEVICE__ float fdim(float __x, float __y) { return ::fdimf(__x, __y); }
+__DEVICE__ float hypot(float __x, float __y) { return ::hypotf(__x, __y); }
+__DEVICE__ int ilogb(float __x) { return ::ilogbf(__x); }
+__DEVICE__ float ldexp(float __arg, int __exp) {
+  return ::ldexpf(__arg, __exp);
+}
+__DEVICE__ float lgamma(float __x) { return ::lgammaf(__x); }
+__DEVICE__ float log1p(float __x) { return ::log1pf(__x); }
+__DEVICE__ float logb(float __x) { return ::logbf(__x); }
+__DEVICE__ float nextafter(float __x, float __y) {
+  return ::nextafterf(__x, __y);
+}
+__DEVICE__ float remainder(float __x, float __y) {
+  return ::remainderf(__x, __y);
+}
+__DEVICE__ float scalbn(float __x, int __y) { return ::scalbnf(__x, __y); }
+__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
+__DEVICE__ float tan(float __x) { return ::tanf(__x); }
+__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
+__DEVICE__ float tgamma(float __x) { return ::tgammaf(__x); }
+
+#undef __DEVICE__
+
+#pragma omp end declare variant
+#endif // __AMDGCN__
+
 #endif
--- a/lib/include/openmp_wrappers/complex
+++ b/lib/include/openmp_wrappers/complex
@ -17,7 +17,6 @@
 // We require std::math functions in the complex builtins below.
 #include <cmath>

-#define __CUDA__
 #define __OPENMP_NVPTX__
 #include <__clang_cuda_complex_builtins.h>
 #undef __OPENMP_NVPTX__
@ -26,9 +25,6 @@
 // Grab the host header too.
 #include_next <complex>

-
-#ifdef __cplusplus
-
 // If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
 // after including <cmath> above. Since the complex header we use is a
 // simplified version of the libc++, we don't need it in this case. If we
@ -48,5 +44,3 @@
 #pragma omp end declare variant

 #endif
-
-#endif
--- a/lib/include/openmp_wrappers/complex.h
+++ b/lib/include/openmp_wrappers/complex.h
@ -17,7 +17,6 @@
 // We require math functions in the complex builtins below.
 #include <math.h>

-#define __CUDA__
 #define __OPENMP_NVPTX__
 #include <__clang_cuda_complex_builtins.h>
 #undef __OPENMP_NVPTX__
--- a/lib/include/openmp_wrappers/math.h
+++ b/lib/include/openmp_wrappers/math.h
@ -48,4 +48,14 @@

 #pragma omp end declare variant

+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+#define __OPENMP_AMDGCN__
+#include <__clang_hip_math.h>
+#undef __OPENMP_AMDGCN__
+
+#pragma omp end declare variant
+#endif
+
 #endif
--- a/lib/include/openmp_wrappers/new
+++ b/lib/include/openmp_wrappers/new
@ -9,6 +9,8 @@
 #ifndef __CLANG_OPENMP_WRAPPERS_NEW
 #define __CLANG_OPENMP_WRAPPERS_NEW

+// We need the system <new> for the std::nothrow_t. The new/delete operators
+// which do not use nothrow_t are provided without the <new> header.
 #include_next <new>

 #if defined(__NVPTX__) && defined(_OPENMP)
@ -22,48 +24,24 @@
 #define OPENMP_NOEXCEPT
 #endif

-// Device overrides for non-placement new and delete.
-inline void *operator new(__SIZE_TYPE__ size) {
-  if (size == 0)
-    size = 1;
-  return ::malloc(size);
-}
 inline void *operator new(__SIZE_TYPE__ size,
                          const std::nothrow_t &) OPENMP_NOEXCEPT {
  return ::operator new(size);
 }

-inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
 inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) {
  return ::operator new(size);
 }

-inline void operator delete(void *ptr)OPENMP_NOEXCEPT {
-  if (ptr)
-    ::free(ptr);
-}
 inline void operator delete(void *ptr, const std::nothrow_t &)OPENMP_NOEXCEPT {
  ::operator delete(ptr);
 }

-inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
-  ::operator delete(ptr);
-}
 inline void operator delete[](void *ptr,
                              const std::nothrow_t &) OPENMP_NOEXCEPT {
  ::operator delete(ptr);
 }

-// Sized delete, C++14 only.
-#if __cplusplus >= 201402L
-inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
-  ::operator delete(ptr);
-}
-inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
-  ::operator delete(ptr);
-}
-#endif
-
 #pragma pop_macro("OPENMP_NOEXCEPT")
 #endif

--- a/lib/include/ppc_wrappers/xmmintrin.h
+++ b/lib/include/ppc_wrappers/xmmintrin.h
@ -28,7 +28,7 @@
   Most SSE scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations. We recommend this for new applications. */
-#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
 #endif

 #ifndef _XMMINTRIN_H_INCLUDED
@ -62,14 +62,13 @@

 /* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
-typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef vector float __m128 __attribute__((__may_alias__));

 /* Unaligned version of the same type.  */
-typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
-				       __aligned__ (1)));
+typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));

 /* Internal data types for implementing the intrinsics.  */
-typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+typedef vector float __v4sf;

 /* Create an undefined vector.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
--- a/lib/include/riscv_vector.h
+++ b/lib/include/riscv_vector.h
--- a/lib/include/uintrintrin.h
+++ b/lib/include/uintrintrin.h
@ -20,6 +20,13 @@

 #ifdef __x86_64__

+struct __uintr_frame
+{
+  unsigned long long rip;
+  unsigned long long rflags;
+  unsigned long long rsp;
+};
+
 /// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
 ///    user interrupt cannot be delivered on the instruction boundary following
 ///    CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
--- a/lib/include/vaesintrin.h
+++ b/lib/include/vaesintrin.h
@ -28,13 +28,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
              (__v4di) __B);
 }

-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesenc_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
-              (__v8di) __B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_aesdec_epi128(__m256i __A, __m256i __B)
 {
@ -42,13 +35,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
              (__v4di) __B);
 }

-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesdec_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
-              (__v8di) __B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_aesenclast_epi128(__m256i __A, __m256i __B)
 {
@ -56,13 +42,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
              (__v4di) __B);
 }

-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesenclast_epi128(__m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
-              (__v8di) __B);
-}
-
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
 {
@ -70,13 +49,35 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
              (__v4di) __B);
 }

+#ifdef __AVX512FINTRIN_H
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesenc_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
+              (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesdec_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
+              (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesenclast_epi128(__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
+              (__v8di) __B);
+}
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS_F
 _mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
 {
  return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
              (__v8di) __B);
 }
-
+#endif // __AVX512FINTRIN_H

 #undef __DEFAULT_FN_ATTRS
 #undef __DEFAULT_FN_ATTRS_F
--- a/lib/include/vecintrin.h
+++ b/lib/include/vecintrin.h
@ -1016,64 +1016,84 @@ vec_scatter_element(__vector double __vec,

 static inline __ATTRS_o_ai __vector signed char
 vec_xl(long __offset, const signed char *__ptr) {
-  return *(const __vector signed char *)
-          ((const char *)__ptr + __offset);
+  __vector signed char V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed char));
+  return V;
 }

 static inline __ATTRS_o_ai __vector unsigned char
 vec_xl(long __offset, const unsigned char *__ptr) {
-  return *(const __vector unsigned char *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned char V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned char));
+  return V;
 }

 static inline __ATTRS_o_ai __vector signed short
 vec_xl(long __offset, const signed short *__ptr) {
-  return *(const __vector signed short *)
-          ((const char *)__ptr + __offset);
+  __vector signed short V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed short));
+  return V;
 }

 static inline __ATTRS_o_ai __vector unsigned short
 vec_xl(long __offset, const unsigned short *__ptr) {
-  return *(const __vector unsigned short *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned short V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned short));
+  return V;
 }

 static inline __ATTRS_o_ai __vector signed int
 vec_xl(long __offset, const signed int *__ptr) {
-  return *(const __vector signed int *)
-          ((const char *)__ptr + __offset);
+  __vector signed int V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed int));
+  return V;
 }

 static inline __ATTRS_o_ai __vector unsigned int
 vec_xl(long __offset, const unsigned int *__ptr) {
-  return *(const __vector unsigned int *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned int V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned int));
+  return V;
 }

 static inline __ATTRS_o_ai __vector signed long long
 vec_xl(long __offset, const signed long long *__ptr) {
-  return *(const __vector signed long long *)
-          ((const char *)__ptr + __offset);
+  __vector signed long long V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed long long));
+  return V;
 }

 static inline __ATTRS_o_ai __vector unsigned long long
 vec_xl(long __offset, const unsigned long long *__ptr) {
-  return *(const __vector unsigned long long *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned long long V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned long long));
+  return V;
 }

 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai __vector float
 vec_xl(long __offset, const float *__ptr) {
-  return *(const __vector float *)
-          ((const char *)__ptr + __offset);
+  __vector float V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector float));
+  return V;
 }
 #endif

 static inline __ATTRS_o_ai __vector double
 vec_xl(long __offset, const double *__ptr) {
-  return *(const __vector double *)
-          ((const char *)__ptr + __offset);
+  __vector double V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector double));
+  return V;
 }

 /*-- vec_xld2 ---------------------------------------------------------------*/
@ -1081,64 +1101,82 @@ vec_xl(long __offset, const double *__ptr) {
 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed char
 vec_xld2(long __offset, const signed char *__ptr) {
-  return *(const __vector signed char *)
-          ((const char *)__ptr + __offset);
+  __vector signed char V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed char));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned char
 vec_xld2(long __offset, const unsigned char *__ptr) {
-  return *(const __vector unsigned char *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned char V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned char));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_xld2(long __offset, const signed short *__ptr) {
-  return *(const __vector signed short *)
-          ((const char *)__ptr + __offset);
+  __vector signed short V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed short));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_xld2(long __offset, const unsigned short *__ptr) {
-  return *(const __vector unsigned short *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned short V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned short));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_xld2(long __offset, const signed int *__ptr) {
-  return *(const __vector signed int *)
-          ((const char *)__ptr + __offset);
+  __vector signed int V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed int));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_xld2(long __offset, const unsigned int *__ptr) {
-  return *(const __vector unsigned int *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned int V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned int));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed long long
 vec_xld2(long __offset, const signed long long *__ptr) {
-  return *(const __vector signed long long *)
-          ((const char *)__ptr + __offset);
+  __vector signed long long V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed long long));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned long long
 vec_xld2(long __offset, const unsigned long long *__ptr) {
-  return *(const __vector unsigned long long *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned long long V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned long long));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector double
 vec_xld2(long __offset, const double *__ptr) {
-  return *(const __vector double *)
-          ((const char *)__ptr + __offset);
+  __vector double V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector double));
+  return V;
 }

 /*-- vec_xlw4 ---------------------------------------------------------------*/
@ -1146,99 +1184,128 @@ vec_xld2(long __offset, const double *__ptr) {
 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed char
 vec_xlw4(long __offset, const signed char *__ptr) {
-  return *(const __vector signed char *)
-          ((const char *)__ptr + __offset);
+  __vector signed char V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed char));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned char
 vec_xlw4(long __offset, const unsigned char *__ptr) {
-  return *(const __vector unsigned char *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned char V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned char));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed short
 vec_xlw4(long __offset, const signed short *__ptr) {
-  return *(const __vector signed short *)
-          ((const char *)__ptr + __offset);
+  __vector signed short V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed short));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned short
 vec_xlw4(long __offset, const unsigned short *__ptr) {
-  return *(const __vector unsigned short *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned short V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned short));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector signed int
 vec_xlw4(long __offset, const signed int *__ptr) {
-  return *(const __vector signed int *)
-          ((const char *)__ptr + __offset);
+  __vector signed int V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector signed int));
+  return V;
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai __vector unsigned int
 vec_xlw4(long __offset, const unsigned int *__ptr) {
-  return *(const __vector unsigned int *)
-          ((const char *)__ptr + __offset);
+  __vector unsigned int V;
+  __builtin_memcpy(&V, ((const char *)__ptr + __offset),
+                   sizeof(__vector unsigned int));
+  return V;
 }

 /*-- vec_xst ----------------------------------------------------------------*/

 static inline __ATTRS_o_ai void
 vec_xst(__vector signed char __vec, long __offset, signed char *__ptr) {
-  *(__vector signed char *)((char *)__ptr + __offset) = __vec;
+  __vector signed char V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed char));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
-  *(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned char V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned char));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector signed short __vec, long __offset, signed short *__ptr) {
-  *(__vector signed short *)((char *)__ptr + __offset) = __vec;
+  __vector signed short V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed short));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
-  *(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned short V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned short));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector signed int __vec, long __offset, signed int *__ptr) {
-  *(__vector signed int *)((char *)__ptr + __offset) = __vec;
+  __vector signed int V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
-  *(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned int V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned int));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector signed long long __vec, long __offset,
-          signed long long *__ptr) {
-  *(__vector signed long long *)((char *)__ptr + __offset) = __vec;
+        signed long long *__ptr) {
+  __vector signed long long V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed long long));
 }

 static inline __ATTRS_o_ai void
 vec_xst(__vector unsigned long long __vec, long __offset,
-          unsigned long long *__ptr) {
-  *(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
+        unsigned long long *__ptr) {
+  __vector unsigned long long V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned long long));
 }

 #if __ARCH__ >= 12
 static inline __ATTRS_o_ai void
 vec_xst(__vector float __vec, long __offset, float *__ptr) {
-  *(__vector float *)((char *)__ptr + __offset) = __vec;
+  __vector float V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector float));
 }
 #endif

 static inline __ATTRS_o_ai void
 vec_xst(__vector double __vec, long __offset, double *__ptr) {
-  *(__vector double *)((char *)__ptr + __offset) = __vec;
+  __vector double V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
 }

 /*-- vec_xstd2 --------------------------------------------------------------*/
@ -1246,57 +1313,73 @@ vec_xst(__vector double __vec, long __offset, double *__ptr) {
 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector signed char __vec, long __offset, signed char *__ptr) {
-  *(__vector signed char *)((char *)__ptr + __offset) = __vec;
+  __vector signed char V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed char));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
-  *(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned char V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned char));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector signed short __vec, long __offset, signed short *__ptr) {
-  *(__vector signed short *)((char *)__ptr + __offset) = __vec;
+  __vector signed short V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed short));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
-  *(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned short V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned short));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector signed int __vec, long __offset, signed int *__ptr) {
-  *(__vector signed int *)((char *)__ptr + __offset) = __vec;
+  __vector signed int V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
-  *(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned int V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned int));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector signed long long __vec, long __offset,
          signed long long *__ptr) {
-  *(__vector signed long long *)((char *)__ptr + __offset) = __vec;
+  __vector signed long long V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed long long));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector unsigned long long __vec, long __offset,
          unsigned long long *__ptr) {
-  *(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned long long V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned long long));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
-  *(__vector double *)((char *)__ptr + __offset) = __vec;
+  __vector double V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
 }

 /*-- vec_xstw4 --------------------------------------------------------------*/
@ -1304,37 +1387,48 @@ vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(__vector signed char __vec, long __offset, signed char *__ptr) {
-  *(__vector signed char *)((char *)__ptr + __offset) = __vec;
+  __vector signed char V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed char));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
-  *(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned char V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned char));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(__vector signed short __vec, long __offset, signed short *__ptr) {
-  *(__vector signed short *)((char *)__ptr + __offset) = __vec;
+  __vector signed short V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector signed short));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
-  *(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned short V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned short));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(__vector signed int __vec, long __offset, signed int *__ptr) {
-  *(__vector signed int *)((char *)__ptr + __offset) = __vec;
+  __vector signed int V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
 }

 // This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
-  *(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
+  __vector unsigned int V = __vec;
+  __builtin_memcpy(((char *)__ptr + __offset), &V,
+                   sizeof(__vector unsigned int));
 }

 /*-- vec_load_bndry ---------------------------------------------------------*/
@ -9259,6 +9353,41 @@ vec_fp_test_data_class(__vector double __a, int __b, int *__c)
                                   __VEC_CLASS_FP_ZERO | \
                                   __VEC_CLASS_FP_INFINITY)

+/*-- vec_extend_to_fp32_hi --------------------------------------------------*/
+
+#if __ARCH__ >= 14
+#define vec_extend_to_fp32_hi(X, W) \
+  ((__vector float)__builtin_s390_vclfnhs((X), (W)))
+#endif
+
+/*-- vec_extend_to_fp32_lo --------------------------------------------------*/
+
+#if __ARCH__ >= 14
+#define vec_extend_to_fp32_lo(X, W) \
+  ((__vector float)__builtin_s390_vclfnls((X), (W)))
+#endif
+
+/*-- vec_round_from_fp32 ----------------------------------------------------*/
+
+#if __ARCH__ >= 14
+#define vec_round_from_fp32(X, Y, W) \
+  ((__vector unsigned short)__builtin_s390_vcrnfs((X), (Y), (W)))
+#endif
+
+/*-- vec_convert_to_fp16 ----------------------------------------------------*/
+
+#if __ARCH__ >= 14
+#define vec_convert_to_fp16(X, W) \
+  ((__vector unsigned short)__builtin_s390_vcfn((X), (W)))
+#endif
+
+/*-- vec_convert_from_fp16 --------------------------------------------------*/
+
+#if __ARCH__ >= 14
+#define vec_convert_from_fp16(X, W) \
+  ((__vector unsigned short)__builtin_s390_vcnf((X), (W)))
+#endif
+
 /*-- vec_cp_until_zero ------------------------------------------------------*/

 static inline __ATTRS_o_ai __vector signed char
--- a/lib/include/vpclmulqdqintrin.h
+++ b/lib/include/vpclmulqdqintrin.h
@ -19,10 +19,12 @@
                                       (__v4di)(__m256i)(B),  \
                                       (char)(I))

+#ifdef __AVX512FINTRIN_H
 #define _mm512_clmulepi64_epi128(A, B, I) \
  (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A),  \
                                       (__v8di)(__m512i)(B),  \
                                       (char)(I))
+#endif // __AVX512FINTRIN_H

 #endif /* __VPCLMULQDQINTRIN_H */

--- a/lib/include/wasm_simd128.h
+++ b/lib/include/wasm_simd128.h
--- a/lib/libc/include/aarch64-linux-gnu/bits/floatn.h
+++ b/lib/libc/include/aarch64-linux-gnu/bits/floatn.h
@ -0,0 +1,97 @@
+/* Macros to control TS 18661-3 glibc features on ldbl-128 platforms.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_FLOATN_H
+#define _BITS_FLOATN_H
+
+#include <features.h>
+#include <bits/long-double.h>
+
+/* Defined to 1 if the current compiler invocation provides a
+   floating-point type with the IEEE 754 binary128 format, and this
+   glibc includes corresponding *f128 interfaces for it.  */
+#ifndef __NO_LONG_DOUBLE_MATH
+# define __HAVE_FLOAT128 1
+#else
+/* glibc does not support _Float128 for platforms where long double is
+   normally binary128 when building with long double as binary64.
+   GCC's default for supported scalar modes does not support it either
+   in that case.  */
+# define __HAVE_FLOAT128 0
+#endif
+
+/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
+   from the default float, double and long double types in this glibc.  */
+#define __HAVE_DISTINCT_FLOAT128 0
+
+/* Defined to 1 if the current compiler invocation provides a
+   floating-point type with the right format for _Float64x, and this
+   glibc includes corresponding *f64x interfaces for it.  */
+#define __HAVE_FLOAT64X __HAVE_FLOAT128
+
+/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
+   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
+   the format of _Float128, which must be different from that of long
+   double.  */
+#define __HAVE_FLOAT64X_LONG_DOUBLE __HAVE_FLOAT128
+
+#ifndef __ASSEMBLER__
+
+/* Defined to concatenate the literal suffix to be used with _Float128
+   types, if __HAVE_FLOAT128 is 1. */
+# if __HAVE_FLOAT128
+#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
+/* The literal suffix f128 exists only since GCC 7.0.  */
+#   define __f128(x) x##l
+#  else
+#   define __f128(x) x##f128
+#  endif
+# endif
+
+/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.  */
+# if __HAVE_FLOAT128
+#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
+#   define __CFLOAT128 _Complex long double
+#  else
+#   define __CFLOAT128 _Complex _Float128
+#  endif
+# endif
+
+/* The remainder of this file provides support for older compilers.  */
+# if __HAVE_FLOAT128
+
+/* The type _Float128 exists only since GCC 7.0.  */
+#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
+typedef long double _Float128;
+#  endif
+
+/* Various built-in functions do not exist before GCC 7.0.  */
+#  if !__GNUC_PREREQ (7, 0)
+#   define __builtin_huge_valf128() (__builtin_huge_vall ())
+#   define __builtin_inff128() (__builtin_infl ())
+#   define __builtin_nanf128(x) (__builtin_nanl (x))
+#   define __builtin_nansf128(x) (__builtin_nansl (x))
+#  endif
+
+# endif
+
+#endif /* !__ASSEMBLER__.  */
+
+#include <bits/floatn-common.h>
+
+#endif /* _BITS_FLOATN_H */
--- a/lib/libc/include/aarch64-linux-musl/bits/posix.h
+++ b/lib/libc/include/aarch64-linux-musl/bits/posix.h
@ -0,0 +1,2 @@
+#define _POSIX_V6_LP64_OFF64  1
+#define _POSIX_V7_LP64_OFF64  1
--- a/lib/libc/include/aarch64_be-linux-gnu/bits/floatn.h
+++ b/lib/libc/include/aarch64_be-linux-gnu/bits/floatn.h
@ -0,0 +1,97 @@
+/* Macros to control TS 18661-3 glibc features on ldbl-128 platforms.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_FLOATN_H
+#define _BITS_FLOATN_H
+
+#include <features.h>
+#include <bits/long-double.h>
+
+/* Defined to 1 if the current compiler invocation provides a
+   floating-point type with the IEEE 754 binary128 format, and this
+   glibc includes corresponding *f128 interfaces for it.  */
+#ifndef __NO_LONG_DOUBLE_MATH
+# define __HAVE_FLOAT128 1
+#else
+/* glibc does not support _Float128 for platforms where long double is
+   normally binary128 when building with long double as binary64.
+   GCC's default for supported scalar modes does not support it either
+   in that case.  */
+# define __HAVE_FLOAT128 0
+#endif
+
+/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
+   from the default float, double and long double types in this glibc.  */
+#define __HAVE_DISTINCT_FLOAT128 0
+
+/* Defined to 1 if the current compiler invocation provides a
+   floating-point type with the right format for _Float64x, and this
+   glibc includes corresponding *f64x interfaces for it.  */
+#define __HAVE_FLOAT64X __HAVE_FLOAT128
+
+/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
+   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
+   the format of _Float128, which must be different from that of long
+   double.  */
+#define __HAVE_FLOAT64X_LONG_DOUBLE __HAVE_FLOAT128
+
+#ifndef __ASSEMBLER__
+
+/* Defined to concatenate the literal suffix to be used with _Float128
+   types, if __HAVE_FLOAT128 is 1. */
+# if __HAVE_FLOAT128
+#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
+/* The literal suffix f128 exists only since GCC 7.0.  */
+#   define __f128(x) x##l
+#  else
+#   define __f128(x) x##f128
+#  endif
+# endif
+
+/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.  */
+# if __HAVE_FLOAT128
+#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
+#   define __CFLOAT128 _Complex long double
+#  else
+#   define __CFLOAT128 _Complex _Float128
+#  endif
+# endif
+
+/* The remainder of this file provides support for older compilers.  */
+# if __HAVE_FLOAT128
+
+/* The type _Float128 exists only since GCC 7.0.  */
+#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
+typedef long double _Float128;
+#  endif
+
+/* Various built-in functions do not exist before GCC 7.0.  */
+#  if !__GNUC_PREREQ (7, 0)
+#   define __builtin_huge_valf128() (__builtin_huge_vall ())
+#   define __builtin_inff128() (__builtin_infl ())
+#   define __builtin_nanf128(x) (__builtin_nanl (x))
+#   define __builtin_nansf128(x) (__builtin_nansl (x))
+#  endif
+
+# endif
+
+#endif /* !__ASSEMBLER__.  */
+
+#include <bits/floatn-common.h>
+
+#endif /* _BITS_FLOATN_H */
--- a/lib/libc/include/arm-linux-gnueabi/bits/floatn.h
+++ b/lib/libc/include/arm-linux-gnueabi/bits/floatn.h
@ -1,52 +0,0 @@
-/* Macros to control TS 18661-3 glibc features.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this glibc
-   includes corresponding *f128 interfaces for it.  */
-#define __HAVE_FLOAT128 0
-
-/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
-   from the default float, double and long double types in this glibc.  */
-#define __HAVE_DISTINCT_FLOAT128 0
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the right format for _Float64x, and this
-   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X 0
-
-/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
-   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
-   the format of _Float128, which must be different from that of long
-   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE 0
-
-#ifndef __ASSEMBLER__
-
-/* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1.
-   E.g.: #define __f128(x) x##f128.  */
-# undef __f128
-
-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
-   E.g.: #define __CFLOAT128 _Complex _Float128.  */
-# undef __CFLOAT128
-
-#endif /* !__ASSEMBLER__.  */
-
-#include <bits/floatn-common.h>
--- a/lib/libc/include/arm-linux-gnueabihf/bits/floatn.h
+++ b/lib/libc/include/arm-linux-gnueabihf/bits/floatn.h
@ -1,52 +0,0 @@
-/* Macros to control TS 18661-3 glibc features.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this glibc
-   includes corresponding *f128 interfaces for it.  */
-#define __HAVE_FLOAT128 0
-
-/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
-   from the default float, double and long double types in this glibc.  */
-#define __HAVE_DISTINCT_FLOAT128 0
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the right format for _Float64x, and this
-   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X 0
-
-/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
-   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
-   the format of _Float128, which must be different from that of long
-   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE 0
-
-#ifndef __ASSEMBLER__
-
-/* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1.
-   E.g.: #define __f128(x) x##f128.  */
-# undef __f128
-
-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
-   E.g.: #define __CFLOAT128 _Complex _Float128.  */
-# undef __CFLOAT128
-
-#endif /* !__ASSEMBLER__.  */
-
-#include <bits/floatn-common.h>
--- a/lib/libc/include/arm-linux-musl/bits/posix.h
+++ b/lib/libc/include/arm-linux-musl/bits/posix.h
@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
--- a/lib/libc/include/armeb-linux-gnueabi/bits/floatn.h
+++ b/lib/libc/include/armeb-linux-gnueabi/bits/floatn.h
@ -1,52 +0,0 @@
-/* Macros to control TS 18661-3 glibc features.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this glibc
-   includes corresponding *f128 interfaces for it.  */
-#define __HAVE_FLOAT128 0
-
-/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
-   from the default float, double and long double types in this glibc.  */
-#define __HAVE_DISTINCT_FLOAT128 0
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the right format for _Float64x, and this
-   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X 0
-
-/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
-   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
-   the format of _Float128, which must be different from that of long
-   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE 0
-
-#ifndef __ASSEMBLER__
-
-/* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1.
-   E.g.: #define __f128(x) x##f128.  */
-# undef __f128
-
-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
-   E.g.: #define __CFLOAT128 _Complex _Float128.  */
-# undef __CFLOAT128
-
-#endif /* !__ASSEMBLER__.  */
-
-#include <bits/floatn-common.h>
--- a/lib/libc/include/armeb-linux-gnueabihf/bits/floatn.h
+++ b/lib/libc/include/armeb-linux-gnueabihf/bits/floatn.h
@ -1,52 +0,0 @@
-/* Macros to control TS 18661-3 glibc features.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this glibc
-   includes corresponding *f128 interfaces for it.  */
-#define __HAVE_FLOAT128 0
-
-/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
-   from the default float, double and long double types in this glibc.  */
-#define __HAVE_DISTINCT_FLOAT128 0
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the right format for _Float64x, and this
-   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X 0
-
-/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
-   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
-   the format of _Float128, which must be different from that of long
-   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE 0
-
-#ifndef __ASSEMBLER__
-
-/* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1.
-   E.g.: #define __f128(x) x##f128.  */
-# undef __f128
-
-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
-   E.g.: #define __CFLOAT128 _Complex _Float128.  */
-# undef __CFLOAT128
-
-#endif /* !__ASSEMBLER__.  */
-
-#include <bits/floatn-common.h>
--- a/lib/libc/include/csky-linux-gnueabi/bits/floatn.h
+++ b/lib/libc/include/csky-linux-gnueabi/bits/floatn.h
@ -1,52 +0,0 @@
-/* Macros to control TS 18661-3 glibc features.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this glibc
-   includes corresponding *f128 interfaces for it.  */
-#define __HAVE_FLOAT128 0
-
-/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
-   from the default float, double and long double types in this glibc.  */
-#define __HAVE_DISTINCT_FLOAT128 0
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the right format for _Float64x, and this
-   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X 0
-
-/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
-   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
-   the format of _Float128, which must be different from that of long
-   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE 0
-
-#ifndef __ASSEMBLER__
-
-/* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1.
-   E.g.: #define __f128(x) x##f128.  */
-# undef __f128
-
-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
-   E.g.: #define __CFLOAT128 _Complex _Float128.  */
-# undef __CFLOAT128
-
-#endif /* !__ASSEMBLER__.  */
-
-#include <bits/floatn-common.h>
--- a/lib/libc/include/csky-linux-gnueabi/bits/struct_rwlock.h
+++ b/lib/libc/include/csky-linux-gnueabi/bits/struct_rwlock.h
@ -1,61 +0,0 @@
-/* Default read-write lock implementation struct definitions.
-   Copyright (C) 2019-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef __RWLOCK_INTERNAL_H
-#define __RWLOCK_INTERNAL_H
-
-#include <bits/endian.h>
-
-/* Generic struct for both POSIX read-write lock.  New ports are expected
-   to use the default layout, however archictetures can redefine it to add
-   arch-specific extensions (such as lock-elision).  The struct have a size
-   of 32 bytes on both LP32 and LP64 architectures.  */
-
-struct __pthread_rwlock_arch_t
-{
-  unsigned int __readers;
-  unsigned int __writers;
-  unsigned int __wrphase_futex;
-  unsigned int __writers_futex;
-  unsigned int __pad3;
-  unsigned int __pad4;
-  /* FLAGS must stay at its position in the structure to maintain
-     binary compatibility.  */
-#if __BYTE_ORDER == __BIG_ENDIAN
-  unsigned char __pad1;
-  unsigned char __pad2;
-  unsigned char __shared;
-  unsigned char __flags;
-#else
-  unsigned char __flags;
-  unsigned char __shared;
-  unsigned char __pad1;
-  unsigned char __pad2;
-#endif
-  int __cur_writer;
-};
-
-#if __BYTE_ORDER == __BIG_ENDIAN
-# define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
-  0, 0, 0, 0, 0, 0, 0, 0, 0, __flags, 0
-#else
-# define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
-  0, 0, 0, 0, 0, 0, __flags, 0, 0, 0, 0
-#endif
-
-#endif
--- a/lib/libc/include/csky-linux-gnueabi/bits/wordsize.h
+++ b/lib/libc/include/csky-linux-gnueabi/bits/wordsize.h
@ -1,21 +0,0 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define __WORDSIZE			32
-#define __WORDSIZE_TIME64_COMPAT32	0
-#define __WORDSIZE32_SIZE_ULONG		0
-#define __WORDSIZE32_PTRDIFF_LONG	0
--- a/lib/libc/include/csky-linux-gnueabihf/bits/floatn.h
+++ b/lib/libc/include/csky-linux-gnueabihf/bits/floatn.h
@ -1,52 +0,0 @@
-/* Macros to control TS 18661-3 glibc features.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this glibc
-   includes corresponding *f128 interfaces for it.  */
-#define __HAVE_FLOAT128 0
-
-/* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
-   from the default float, double and long double types in this glibc.  */
-#define __HAVE_DISTINCT_FLOAT128 0
-
-/* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the right format for _Float64x, and this
-   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X 0
-
-/* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
-   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
-   the format of _Float128, which must be different from that of long
-   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE 0
-
-#ifndef __ASSEMBLER__
-
-/* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1.
-   E.g.: #define __f128(x) x##f128.  */
-# undef __f128
-
-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
-   E.g.: #define __CFLOAT128 _Complex _Float128.  */
-# undef __CFLOAT128
-
-#endif /* !__ASSEMBLER__.  */
-
-#include <bits/floatn-common.h>
--- a/lib/libc/include/csky-linux-gnueabihf/bits/struct_rwlock.h
+++ b/lib/libc/include/csky-linux-gnueabihf/bits/struct_rwlock.h
@ -1,61 +0,0 @@
-/* Default read-write lock implementation struct definitions.
-   Copyright (C) 2019-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef __RWLOCK_INTERNAL_H
-#define __RWLOCK_INTERNAL_H
-
-#include <bits/endian.h>
-
-/* Generic struct for both POSIX read-write lock.  New ports are expected
-   to use the default layout, however archictetures can redefine it to add
-   arch-specific extensions (such as lock-elision).  The struct have a size
-   of 32 bytes on both LP32 and LP64 architectures.  */
-
-struct __pthread_rwlock_arch_t
-{
-  unsigned int __readers;
-  unsigned int __writers;
-  unsigned int __wrphase_futex;
-  unsigned int __writers_futex;
-  unsigned int __pad3;
-  unsigned int __pad4;
-  /* FLAGS must stay at its position in the structure to maintain
-     binary compatibility.  */
-#if __BYTE_ORDER == __BIG_ENDIAN
-  unsigned char __pad1;
-  unsigned char __pad2;
-  unsigned char __shared;
-  unsigned char __flags;
-#else
-  unsigned char __flags;
-  unsigned char __shared;
-  unsigned char __pad1;
-  unsigned char __pad2;
-#endif
-  int __cur_writer;
-};
-
-#if __BYTE_ORDER == __BIG_ENDIAN
-# define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
-  0, 0, 0, 0, 0, 0, 0, 0, 0, __flags, 0
-#else
-# define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
-  0, 0, 0, 0, 0, 0, __flags, 0, 0, 0, 0
-#endif
-
-#endif
--- a/lib/libc/include/csky-linux-gnueabihf/bits/wordsize.h
+++ b/lib/libc/include/csky-linux-gnueabihf/bits/wordsize.h
@ -1,21 +0,0 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#define __WORDSIZE			32
-#define __WORDSIZE_TIME64_COMPAT32	0
-#define __WORDSIZE32_SIZE_ULONG		0
-#define __WORDSIZE32_PTRDIFF_LONG	0
--- a/lib/libc/include/generic-glibc/bits/floatn.h
+++ b/lib/libc/include/generic-glibc/bits/floatn.h
@ -1,4 +1,4 @@
-/* Macros to control TS 18661-3 glibc features on ldbl-128 platforms.
+/* Macros to control TS 18661-3 glibc features.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

@ -16,24 +16,10 @@
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

-#ifndef _BITS_FLOATN_H
-#define _BITS_FLOATN_H
-
-#include <features.h>
-#include <bits/long-double.h>
-
 /* Defined to 1 if the current compiler invocation provides a
-   floating-point type with the IEEE 754 binary128 format, and this
-   glibc includes corresponding *f128 interfaces for it.  */
-#ifndef __NO_LONG_DOUBLE_MATH
-# define __HAVE_FLOAT128 1
-#else
-/* glibc does not support _Float128 for platforms where long double is
-   normally binary128 when building with long double as binary64.
-   GCC's default for supported scalar modes does not support it either
-   in that case.  */
-# define __HAVE_FLOAT128 0
-#endif
+   floating-point type with the IEEE 754 binary128 format, and this glibc
+   includes corresponding *f128 interfaces for it.  */
+#define __HAVE_FLOAT128 0

 /* Defined to 1 if __HAVE_FLOAT128 is 1 and the type is ABI-distinct
   from the default float, double and long double types in this glibc.  */
@ -42,56 +28,25 @@
 /* Defined to 1 if the current compiler invocation provides a
   floating-point type with the right format for _Float64x, and this
   glibc includes corresponding *f64x interfaces for it.  */
-#define __HAVE_FLOAT64X __HAVE_FLOAT128
+#define __HAVE_FLOAT64X 0

 /* Defined to 1 if __HAVE_FLOAT64X is 1 and _Float64x has the format
   of long double.  Otherwise, if __HAVE_FLOAT64X is 1, _Float64x has
   the format of _Float128, which must be different from that of long
   double.  */
-#define __HAVE_FLOAT64X_LONG_DOUBLE __HAVE_FLOAT128
+#define __HAVE_FLOAT64X_LONG_DOUBLE 0

 #ifndef __ASSEMBLER__

 /* Defined to concatenate the literal suffix to be used with _Float128
-   types, if __HAVE_FLOAT128 is 1. */
-# if __HAVE_FLOAT128
-#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
-/* The literal suffix f128 exists only since GCC 7.0.  */
-#   define __f128(x) x##l
-#  else
-#   define __f128(x) x##f128
-#  endif
-# endif
+   types, if __HAVE_FLOAT128 is 1.
+   E.g.: #define __f128(x) x##f128.  */
+# undef __f128

-/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.  */
-# if __HAVE_FLOAT128
-#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
-#   define __CFLOAT128 _Complex long double
-#  else
-#   define __CFLOAT128 _Complex _Float128
-#  endif
-# endif
-
-/* The remaining of this file provides support for older compilers.  */
-# if __HAVE_FLOAT128
-
-/* The type _Float128 exists only since GCC 7.0.  */
-#  if !__GNUC_PREREQ (7, 0) || defined __cplusplus
-typedef long double _Float128;
-#  endif
-
-/* Various built-in functions do not exist before GCC 7.0.  */
-#  if !__GNUC_PREREQ (7, 0)
-#   define __builtin_huge_valf128() (__builtin_huge_vall ())
-#   define __builtin_inff128() (__builtin_infl ())
-#   define __builtin_nanf128(x) (__builtin_nanl (x))
-#   define __builtin_nansf128(x) (__builtin_nansl (x))
-#  endif
-
-# endif
+/* Defined to a complex binary128 type if __HAVE_FLOAT128 is 1.
+   E.g.: #define __CFLOAT128 _Complex _Float128.  */
+# undef __CFLOAT128

 #endif /* !__ASSEMBLER__.  */

-#include <bits/floatn-common.h>
-
-#endif /* _BITS_FLOATN_H */
+#include <bits/floatn-common.h>
--- a/lib/libc/include/generic-glibc/bits/struct_rwlock.h
+++ b/lib/libc/include/generic-glibc/bits/struct_rwlock.h
@ -1,4 +1,4 @@
-/* MIPS internal rwlock struct definitions.
+/* Default read-write lock implementation struct definitions.
   Copyright (C) 2019-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

@ -16,8 +16,15 @@
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

-#ifndef _RWLOCK_INTERNAL_H
-#define _RWLOCK_INTERNAL_H
+#ifndef __RWLOCK_INTERNAL_H
+#define __RWLOCK_INTERNAL_H
+
+#include <bits/endian.h>
+
+/* Generic struct for POSIX read-write locks.  New ports are expected
+   to use the default layout; however, architectures can redefine it to add
+   arch-specific extensions (such as lock-elision).  The struct has a size
+   of 32 bytes on both LP32 and LP64 architectures.  */

 struct __pthread_rwlock_arch_t
 {
@ -27,45 +34,28 @@ struct __pthread_rwlock_arch_t
  unsigned int __writers_futex;
  unsigned int __pad3;
  unsigned int __pad4;
-#if _MIPS_SIM == _ABI64
-  int __cur_writer;
-  int __shared;
-  unsigned long int __pad1;
-  unsigned long int __pad2;
-  /* FLAGS must stay at this position in the structure to maintain
+  /* FLAGS must stay at its position in the structure to maintain
     binary compatibility.  */
-  unsigned int __flags;
-# else
-# if __BYTE_ORDER == __BIG_ENDIAN
+#if __BYTE_ORDER == __BIG_ENDIAN
  unsigned char __pad1;
  unsigned char __pad2;
  unsigned char __shared;
-  /* FLAGS must stay at this position in the structure to maintain
-     binary compatibility.  */
  unsigned char __flags;
-# else
-  /* FLAGS must stay at this position in the structure to maintain
-     binary compatibility.  */
+#else
  unsigned char __flags;
  unsigned char __shared;
  unsigned char __pad1;
  unsigned char __pad2;
-# endif
-  int __cur_writer;
 #endif
+  int __cur_writer;
 };

-#if _MIPS_SIM == _ABI64
+#if __BYTE_ORDER == __BIG_ENDIAN
 # define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, __flags
-#else
-# if __BYTE_ORDER == __BIG_ENDIAN
-#  define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
  0, 0, 0, 0, 0, 0, 0, 0, 0, __flags, 0
-# else
-#  define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
+#else
+# define __PTHREAD_RWLOCK_INITIALIZER(__flags) \
  0, 0, 0, 0, 0, 0, __flags, 0, 0, 0, 0
-# endif
 #endif

 #endif
--- a/lib/libc/include/generic-glibc/bits/wordsize.h
+++ b/lib/libc/include/generic-glibc/bits/wordsize.h
@ -1,4 +1,4 @@
-/* Copyright (C) 2002-2021 Free Software Foundation, Inc.
+/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
@ -12,20 +12,10 @@
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
+   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

-#include <sgidefs.h>
-
-#define __WORDSIZE			_MIPS_SZPTR
-
-#if _MIPS_SIM == _ABI64
-# define __WORDSIZE_TIME64_COMPAT32	1
-#else
-# define __WORDSIZE_TIME64_COMPAT32	0
-#endif
-
-#if __WORDSIZE == 32
+#define __WORDSIZE			32
+#define __WORDSIZE_TIME64_COMPAT32	0
 #define __WORDSIZE32_SIZE_ULONG		0
-#define __WORDSIZE32_PTRDIFF_LONG	0
-#endif
+#define __WORDSIZE32_PTRDIFF_LONG	0
--- a/lib/libc/include/generic-musl/bits/posix.h
+++ b/lib/libc/include/generic-musl/bits/posix.h
@ -1,2 +1,2 @@
-#define _POSIX_V6_LP64_OFF64  1
-#define _POSIX_V7_LP64_OFF64  1
+#define _POSIX_V6_ILP32_OFFBIG  1
+#define _POSIX_V7_ILP32_OFFBIG  1
--- a/lib/libc/include/i386-linux-musl/bits/posix.h
+++ b/lib/libc/include/i386-linux-musl/bits/posix.h
@ -1,2 +0,0 @@
-#define _POSIX_V6_ILP32_OFFBIG  1
-#define _POSIX_V7_ILP32_OFFBIG  1
--- a/lib/libc/include/m68k-linux-gnu/bits/a.out.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/a.out.h
@ -0,0 +1,3 @@
+#ifndef __A_OUT_GNU_H__
+# error "Never use <bits/a.out.h> directly; include <a.out.h> instead."
+#endif
--- a/lib/libc/include/m68k-linux-gnu/bits/endianness.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/endianness.h
@ -0,0 +1,11 @@
+#ifndef _BITS_ENDIANNESS_H
+#define _BITS_ENDIANNESS_H 1
+
+#ifndef _BITS_ENDIAN_H
+# error "Never use <bits/endianness.h> directly; include <endian.h> instead."
+#endif
+
+/* m68k is big-endian.  */
+#define __BYTE_ORDER __BIG_ENDIAN
+
+#endif /* bits/endianness.h */
--- a/lib/libc/include/m68k-linux-gnu/bits/fcntl.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/fcntl.h
@ -0,0 +1,54 @@
+/* O_*, F_*, FD_* bit values for Linux.
+   Copyright (C) 2000-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef	_FCNTL_H
+# error "Never use <bits/fcntl.h> directly; include <fcntl.h> instead."
+#endif
+
+#define __O_DIRECTORY	 040000	/* Must be a directory.	 */
+#define __O_NOFOLLOW	0100000	/* Do not follow links.	 */
+#define __O_DIRECT	0200000	/* Direct disk access.	*/
+#define __O_LARGEFILE	0400000
+
+struct flock
+  {
+    short int l_type;	/* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK.	*/
+    short int l_whence;	/* Where `l_start' is relative to (like `lseek').  */
+#ifndef __USE_FILE_OFFSET64
+    __off_t l_start;	/* Offset where the lock begins.  */
+    __off_t l_len;	/* Size of the locked area; zero means until EOF.  */
+#else
+    __off64_t l_start;	/* Offset where the lock begins.  */
+    __off64_t l_len;	/* Size of the locked area; zero means until EOF.  */
+#endif
+    __pid_t l_pid;	/* Process holding the lock.  */
+  };
+
+#ifdef __USE_LARGEFILE64
+struct flock64
+  {
+    short int l_type;	/* Type of lock: F_RDLCK, F_WRLCK, or F_UNLCK.	*/
+    short int l_whence;	/* Where `l_start' is relative to (like `lseek').  */
+    __off64_t l_start;	/* Offset where the lock begins.  */
+    __off64_t l_len;	/* Size of the locked area; zero means until EOF.  */
+    __pid_t l_pid;	/* Process holding the lock.  */
+  };
+#endif
+
+/* Include generic Linux declarations.  */
+#include <bits/fcntl-linux.h>
--- a/lib/libc/include/m68k-linux-gnu/bits/fenv.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/fenv.h
@ -0,0 +1,131 @@
+/* Copyright (C) 1997-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _FENV_H
+# error "Never use <bits/fenv.h> directly; include <fenv.h> instead."
+#endif
+
+
+#if defined __HAVE_68881__ || defined __HAVE_FPU__ || defined __mcffpu__
+
+/* Define bits representing the exception.  We use the bit positions of
+   the appropriate bits in the FPSR Accrued Exception Byte.  */
+enum
+  {
+    FE_INEXACT =
+# define FE_INEXACT	(1 << 3)
+      FE_INEXACT,
+    FE_DIVBYZERO =
+# define FE_DIVBYZERO	(1 << 4)
+      FE_DIVBYZERO,
+    FE_UNDERFLOW =
+# define FE_UNDERFLOW	(1 << 5)
+      FE_UNDERFLOW,
+    FE_OVERFLOW =
+# define FE_OVERFLOW	(1 << 6)
+      FE_OVERFLOW,
+    FE_INVALID =
+# define FE_INVALID	(1 << 7)
+      FE_INVALID
+  };
+
+# define FE_ALL_EXCEPT \
+	(FE_INEXACT | FE_DIVBYZERO | FE_UNDERFLOW | FE_OVERFLOW | FE_INVALID)
+
+/* The m68k FPU supports all of the four defined rounding modes.  We use
+   the bit positions in the FPCR Mode Control Byte as the values for the
+   appropriate macros.  */
+enum
+  {
+    FE_TONEAREST =
+# define FE_TONEAREST	0
+      FE_TONEAREST,
+    FE_TOWARDZERO =
+# define FE_TOWARDZERO	(1 << 4)
+      FE_TOWARDZERO,
+    FE_DOWNWARD =
+# define FE_DOWNWARD	(2 << 4)
+      FE_DOWNWARD,
+    FE_UPWARD =
+# define FE_UPWARD	(3 << 4)
+      FE_UPWARD
+  };
+
+#else
+
+/* In the soft-float case, only rounding to nearest is supported, with
+   no exceptions.  */
+
+# define FE_ALL_EXCEPT 0
+
+enum
+  {
+    __FE_UNDEFINED = -1,
+
+    FE_TONEAREST =
+# define FE_TONEAREST	0
+      FE_TONEAREST
+  };
+
+#endif
+
+
+/* Type representing exception flags.  */
+typedef unsigned int fexcept_t;
+
+
+#if defined __HAVE_68881__ || defined __HAVE_FPU__ || defined __mcffpu__
+
+/* Type representing floating-point environment.  This structure
+   corresponds to the layout of the block written by `fmovem'.  */
+typedef struct
+  {
+    unsigned int __control_register;
+    unsigned int __status_register;
+    unsigned int __instruction_address;
+  }
+fenv_t;
+
+#else
+
+/* Keep ABI compatibility with the type used in the generic
+   bits/fenv.h, formerly used for no-FPU ColdFire.  */
+typedef struct
+  {
+    fexcept_t __excepts;
+  }
+fenv_t;
+
+#endif
+
+/* If the default argument is used we use this value.  */
+#define FE_DFL_ENV	((const fenv_t *) -1)
+
+#if defined __USE_GNU && (defined __HAVE_68881__	\
+			  || defined __HAVE_FPU__	\
+			  || defined __mcffpu__)
+/* Floating-point environment where none of the exceptions are masked.  */
+# define FE_NOMASK_ENV	((const fenv_t *) -2)
+#endif
+
+#if __GLIBC_USE (IEC_60559_BFP_EXT_C2X)
+/* Type representing floating-point control modes.  */
+typedef unsigned int femode_t;
+
+/* Default floating-point control modes.  */
+# define FE_DFL_MODE	((const femode_t *) -1L)
+#endif
--- a/lib/libc/include/m68k-linux-gnu/bits/flt-eval-method.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/flt-eval-method.h
@ -0,0 +1,25 @@
+/* Define __GLIBC_FLT_EVAL_METHOD.  M68K version.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _MATH_H
+# error "Never use <bits/flt-eval-method.h> directly; include <math.h> instead."
+#endif
+
+/* The m68k FPUs evaluate all values in the 96-bit floating-point
+   format which is also available for the user as 'long double'.  */
+#define __GLIBC_FLT_EVAL_METHOD	2
--- a/lib/libc/include/armeb-linux-gnueabi/bits/wordsize.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/fp-logb.h
@ -1,4 +1,5 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
+/* Define __FP_LOGB0_IS_MIN and __FP_LOGBNAN_IS_MIN.  M68K version.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
@ -15,7 +16,9 @@
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

-#define __WORDSIZE			32
-#define __WORDSIZE_TIME64_COMPAT32	0
-#define __WORDSIZE32_SIZE_ULONG		0
-#define __WORDSIZE32_PTRDIFF_LONG	0
+#ifndef _MATH_H
+# error "Never use <bits/fp-logb.h> directly; include <math.h> instead."
+#endif
+
+#define __FP_LOGB0_IS_MIN	1
+#define __FP_LOGBNAN_IS_MIN	0
--- a/lib/libc/include/m68k-linux-gnu/bits/iscanonical.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/iscanonical.h
@ -0,0 +1,54 @@
+/* Define iscanonical macro.  ldbl-96 version.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _MATH_H
+# error "Never use <bits/iscanonical.h> directly; include <math.h> instead."
+#endif
+
+extern int __iscanonicall (long double __x)
+     __THROW __attribute__ ((__const__));
+#define __iscanonicalf(x) ((void) (__typeof (x)) (x), 1)
+#define __iscanonical(x) ((void) (__typeof (x)) (x), 1)
+#if __HAVE_DISTINCT_FLOAT128
+# define __iscanonicalf128(x) ((void) (__typeof (x)) (x), 1)
+#endif
+
+/* Return nonzero value if X is canonical.  In IEEE interchange binary
+   formats, all values are canonical, but the argument must still be
+   converted to its semantic type for any exceptions arising from the
+   conversion, before being discarded; in extended precision, there
+   are encodings that are not consistently handled as corresponding to
+   any particular value of the type, and we return 0 for those.  */
+#ifndef __cplusplus
+# define iscanonical(x) __MATH_TG ((x), __iscanonical, (x))
+#else
+/* In C++ mode, __MATH_TG cannot be used, because it relies on
+   __builtin_types_compatible_p, which is a C-only builtin.  On the
+   other hand, overloading provides the means to distinguish between
+   the floating-point types.  The overloading resolution will match
+   the correct parameter (regardless of type qualifiers (i.e.: const
+   and volatile)).  */
+extern "C++" {
+inline int iscanonical (float __val) { return __iscanonicalf (__val); }
+inline int iscanonical (double __val) { return __iscanonical (__val); }
+inline int iscanonical (long double __val) { return __iscanonicall (__val); }
+# if __HAVE_DISTINCT_FLOAT128
+inline int iscanonical (_Float128 __val) { return __iscanonicalf128 (__val); }
+# endif
+}
+#endif /* __cplusplus */
--- a/lib/libc/include/m68k-linux-gnu/bits/link.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/link.h
@ -0,0 +1,57 @@
+/* Copyright (C) 2005-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef	_LINK_H
+# error "Never include <bits/link.h> directly; use <link.h> instead."
+#endif
+
+
+/* Registers for entry into PLT on M68K (see la_m68k_gnu_pltenter below).  */
+typedef struct La_m68k_regs
+{
+  uint32_t lr_a0;	/* Presumably address register %a0 (inferred from the field name).  */
+  uint32_t lr_a1;	/* Presumably address register %a1.  */
+  uint32_t lr_sp;	/* Presumably the stack pointer.  */
+} La_m68k_regs;
+
+/* Return values for calls from PLT on M68K (the __outregs of la_m68k_gnu_pltexit).  */
+typedef struct La_m68k_retval
+{
+  uint32_t lrv_d0;	/* Presumably data register %d0 (inferred from the field name).  */
+  uint32_t lrv_d1;	/* Presumably data register %d1.  */
+  uint32_t lrv_a0;	/* Presumably address register %a0.  */
+  long double lrv_fp0;	/* Presumably FP register %fp0; the only non-32-bit member.  */
+} La_m68k_retval;
+
+
+__BEGIN_DECLS
+
+extern Elf32_Addr la_m68k_gnu_pltenter (Elf32_Sym *__sym, unsigned int __ndx,
+					uintptr_t *__refcook,
+					uintptr_t *__defcook,
+					La_m68k_regs *__regs,
+					unsigned int *__flags,
+					const char *__symname,
+					long int *__framesizep);	/* PLT-entry hook: gets the registers via writable __regs.  */
+extern unsigned int la_m68k_gnu_pltexit (Elf32_Sym *__sym, unsigned int __ndx,
+					 uintptr_t *__refcook,
+					 uintptr_t *__defcook,
+					 const La_m68k_regs *__inregs,
+					 La_m68k_retval *__outregs,
+					 const char *__symname);	/* PLT-exit hook: __inregs is read-only, __outregs is writable.  */
+
+__END_DECLS
--- a/lib/libc/include/armeb-linux-gnueabihf/bits/wordsize.h
+++ b/lib/libc/include/armeb-linux-gnueabihf/bits/wordsize.h
@ -1,9 +1,10 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
+/* Properties of long double type.  ldbl-96 version.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
+   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
@ -15,7 +16,6 @@
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

-#define __WORDSIZE			32
-#define __WORDSIZE_TIME64_COMPAT32	0
-#define __WORDSIZE32_SIZE_ULONG		0
-#define __WORDSIZE32_PTRDIFF_LONG	0
+/* long double is distinct from double, so there is nothing to
+   define here beyond setting the redirect macro below to 0.  */
+#define __LDOUBLE_REDIRECTS_TO_FLOAT128_ABI 0
--- a/lib/libc/include/m68k-linux-gnu/bits/poll.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/poll.h
@ -0,0 +1,49 @@
+/* Copyright (C) 1997-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_POLL_H
+# error "Never use <bits/poll.h> directly; include <sys/poll.h> instead."
+#endif
+
+/* Event types that can be polled for.  These bits may be set in `events'
+   to indicate the interesting event types; they will appear in `revents'
+   to indicate the status of the file descriptor.  */
+#define POLLIN		0x001		/* There is data to read.  */
+#define POLLPRI		0x002		/* There is urgent data to read.  */
+#define POLLOUT		0x004		/* Writing now will not block.  */
+
+#if defined __USE_XOPEN || defined __USE_XOPEN2K8
+/* These values are defined in XPG4.2 (exposed under either feature macro above).  */
+# define POLLRDNORM	0x040		/* Normal data may be read.  */
+# define POLLRDBAND	0x080		/* Priority data may be read.  */
+# define POLLWRNORM	POLLOUT		/* Writing now will not block (same bit as POLLOUT).  */
+# define POLLWRBAND	0x100		/* Priority data may be written.  */
+#endif
+
+#ifdef __USE_GNU
+/* These are extensions for Linux (only exposed under __USE_GNU).  */
+# define POLLMSG	0x400	/* Note: 0x200 and 0x800 are left unassigned in this header.  */
+# define POLLREMOVE	0x1000
+# define POLLRDHUP	0x2000
+#endif
+
+/* Event types always implicitly polled for.  These bits need not be set in
+   `events', but they will appear in `revents' to indicate the status of
+   the file descriptor.  */
+#define POLLERR		0x008		/* Error condition.  */
+#define POLLHUP		0x010		/* Hung up.  */
+#define POLLNVAL	0x020		/* Invalid polling request.  */
--- a/lib/libc/include/arm-linux-gnueabihf/bits/wordsize.h
+++ b/lib/libc/include/arm-linux-gnueabihf/bits/wordsize.h
@ -1,4 +1,6 @@
-/* Copyright (C) 1999-2021 Free Software Foundation, Inc.
+/* Types of pr_uid and pr_gid in struct elf_prpsinfo.  M68K version.
+   Copyright (C) 2018-2021 Free Software Foundation, Inc.
+
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
@ -15,7 +17,9 @@
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

-#define __WORDSIZE			32
-#define __WORDSIZE_TIME64_COMPAT32	0
-#define __WORDSIZE32_SIZE_ULONG		0
-#define __WORDSIZE32_PTRDIFF_LONG	0
+#ifndef _SYS_PROCFS_H
+# error "Never include <bits/procfs-id.h> directly; use <sys/procfs.h> instead."
+#endif
+
+typedef unsigned short int __pr_uid_t;	/* Type of pr_uid in struct elf_prpsinfo.  */
+typedef unsigned short int __pr_gid_t;	/* Type of pr_gid in struct elf_prpsinfo.  */
--- a/lib/libc/include/m68k-linux-gnu/bits/procfs.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/procfs.h
@ -0,0 +1,34 @@
+/* Types for registers for sys/procfs.h.  M68K version.
+   Copyright (C) 1996-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_PROCFS_H
+# error "Never include <bits/procfs.h> directly; use <sys/procfs.h> instead."
+#endif
+
+/* Type for a general-purpose register.  */
+typedef unsigned long elf_greg_t;
+
+/* And the whole bunch of them.  We could have used `struct
+   user_regs_struct' directly in the typedef, but tradition says that
+   the register set is an array, which does have some peculiar
+   semantics, so leave it that way.  */
+#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof (elf_greg_t))	/* Needs the complete struct, which is declared in sys/user.h.  */
+typedef elf_greg_t elf_gregset_t[ELF_NGREG];
+
+/* Register set for the floating-point registers (struct declared in sys/user.h).  */
+typedef struct user_m68kfp_struct elf_fpregset_t;
--- a/lib/libc/include/m68k-linux-gnu/bits/pthreadtypes-arch.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/pthreadtypes-arch.h
@ -0,0 +1,37 @@
+/* Copyright (C) 2010-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Maxim Kuvyrkov <maxim@codesourcery.com>, 2010.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _BITS_PTHREADTYPES_ARCH_H
+#define _BITS_PTHREADTYPES_ARCH_H	1
+
+#include <bits/endian.h>
+
+#define __SIZEOF_PTHREAD_ATTR_T 36	/* Sizes in this list are presumably in bytes -- TODO confirm against the users of these macros.  */
+#define __SIZEOF_PTHREAD_MUTEX_T 24
+#define __SIZEOF_PTHREAD_MUTEXATTR_T 4
+#define __SIZEOF_PTHREAD_COND_T 48
+#define __SIZEOF_PTHREAD_CONDATTR_T 4
+#define __SIZEOF_PTHREAD_RWLOCK_T 32
+#define __SIZEOF_PTHREAD_RWLOCKATTR_T 8
+#define __SIZEOF_PTHREAD_BARRIER_T 20
+#define __SIZEOF_PTHREAD_BARRIERATTR_T 4
+
+#define __LOCK_ALIGNMENT __attribute__ ((__aligned__ (4)))	/* Expands to a 4-byte alignment attribute.  */
+#define __ONCE_ALIGNMENT __attribute__ ((__aligned__ (4)))	/* Expands to a 4-byte alignment attribute.  */
+
+#endif	/* bits/pthreadtypes-arch.h */
--- a/lib/libc/include/m68k-linux-gnu/bits/semaphore.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/semaphore.h
@ -0,0 +1,35 @@
+/* Copyright (C) 2010-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Maxim Kuvyrkov <maxim@codesourcery.com>, 2010.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SEMAPHORE_H
+# error "Never use <bits/semaphore.h> directly; include <semaphore.h> instead."
+#endif
+
+
+#define __SIZEOF_SEM_T	16
+
+
+/* Value returned if `sem_open' failed (a null sem_t pointer).  */
+#define SEM_FAILED      ((sem_t *) 0)
+
+
+typedef union
+{
+  char __size[__SIZEOF_SEM_T];	/* Opaque storage of __SIZEOF_SEM_T (16) chars.  */
+  long int __align __attribute__ ((__aligned__ (4)));	/* Gives the union at least 4-byte alignment.  */
+} sem_t;
--- a/lib/libc/include/m68k-linux-gnu/bits/setjmp.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/setjmp.h
@ -0,0 +1,46 @@
+/* Copyright (C) 1997-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define the machine-dependent type `jmp_buf'.  m68k version.  */
+#ifndef _BITS_SETJMP_H
+#define _BITS_SETJMP_H	1
+
+#if !defined _SETJMP_H && !defined _PTHREAD_H
+# error "Never include <bits/setjmp.h> directly; use <setjmp.h> instead."
+#endif
+
+typedef struct __jmp_buf_internal_tag
+  {
+    /* There are eight 4-byte data registers, but D0 is not saved.  */
+    long int __dregs[7];
+
+    /* There are six 4-byte address registers, plus the FP and SP.  */
+    int *__aregs[6];
+    int *__fp;
+    int *__sp;
+
+#if defined __HAVE_68881__ || defined __HAVE_FPU__
+    /* There are eight floating point registers which
+       are saved in IEEE 96-bit extended format (96 bytes in total).  */
+    char __fpregs[8 * (96 / 8)];
+#elif defined __mcffpu__
+    char __fpregs[8 * (64 / 8)];	/* Eight 64-bit slots (64 bytes); with neither branch taken there is no __fpregs member.  */
+#endif
+
+  } __jmp_buf[1];	/* The typedef names an array of one such struct.  */
+
+#endif	/* bits/setjmp.h */
--- a/lib/libc/include/m68k-linux-gnu/bits/sockaddr.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/sockaddr.h
@ -0,0 +1,42 @@
+/* Definition of struct sockaddr_* members and sizes, Linux/m68k version.
+   Copyright (C) 1995-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/*
+ * Never include this file directly; use <sys/socket.h> instead.
+ */
+
+#ifndef _BITS_SOCKADDR_H
+#define _BITS_SOCKADDR_H	1
+
+
+/* POSIX.1g specifies this type name for the `sa_family' member.  */
+typedef unsigned short int sa_family_t;
+
+/* This macro is used to declare the initial common members
+   of the data types used for socket addresses, `struct sockaddr',
+   `struct sockaddr_in', `struct sockaddr_un', etc.  */
+
+#define	__SOCKADDR_COMMON(sa_prefix) \
+  sa_family_t sa_prefix##family
+
+#define __SOCKADDR_COMMON_SIZE	(sizeof (unsigned short int))	/* Must stay equal to sizeof (sa_family_t).  */
+
+/* Size of struct sockaddr_storage in bytes; 128 matches the kernel's _K_SS_MAXSIZE.  */
+#define _SS_SIZE 128
+
+#endif	/* bits/sockaddr.h */
--- a/lib/libc/include/m68k-linux-gnu/bits/struct_stat.h
+++ b/lib/libc/include/m68k-linux-gnu/bits/struct_stat.h
@ -0,0 +1,127 @@
+/* Definition for struct stat.
+   Copyright (C) 2020-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if !defined _SYS_STAT_H && !defined _FCNTL_H
+# error "Never include <bits/struct_stat.h> directly; use <sys/stat.h> instead."
+#endif
+
+#ifndef _BITS_STRUCT_STAT_H
+#define _BITS_STRUCT_STAT_H	1
+
+struct stat
+  {
+    __dev_t st_dev;			/* Device.  */
+    unsigned short int __pad1;
+#ifndef __USE_FILE_OFFSET64
+    __ino_t st_ino;			/* File serial number.	*/
+#else
+    __ino_t __st_ino;			/* 32bit file serial number; the real st_ino is the last member.  */
+#endif
+    __mode_t st_mode;			/* File mode.  */
+    __nlink_t st_nlink;			/* Link count.  */
+    __uid_t st_uid;			/* User ID of the file's owner.	*/
+    __gid_t st_gid;			/* Group ID of the file's group.*/
+    __dev_t st_rdev;			/* Device number, if device.  */
+    unsigned short int __pad2;
+#ifndef __USE_FILE_OFFSET64
+    __off_t st_size;			/* Size of file, in bytes.  */
+#else
+    __off64_t st_size;			/* Size of file, in bytes.  */
+#endif
+    __blksize_t st_blksize;		/* Optimal block size for I/O.  */
+
+#ifndef __USE_FILE_OFFSET64
+    __blkcnt_t st_blocks;		/* Number 512-byte blocks allocated. */
+#else
+    __blkcnt64_t st_blocks;		/* Number 512-byte blocks allocated. */
+#endif
+#ifdef __USE_XOPEN2K8
+    /* Nanosecond resolution timestamps are stored in a format
+       equivalent to 'struct timespec'.  This is the type used
+       whenever possible but the Unix namespace rules do not allow the
+       identifier 'timespec' to appear in the <sys/stat.h> header.
+       Therefore we have to handle the use of this header in strictly
+       standard-compliant sources specially.  */
+    struct timespec st_atim;		/* Time of last access.  */
+    struct timespec st_mtim;		/* Time of last modification.  */
+    struct timespec st_ctim;		/* Time of last status change.  */
+# define st_atime st_atim.tv_sec	/* Backward compatibility.  */
+# define st_mtime st_mtim.tv_sec
+# define st_ctime st_ctim.tv_sec
+#else
+    __time_t st_atime;			/* Time of last access.  */
+    unsigned long int st_atimensec;	/* Nsecs of last access.  */
+    __time_t st_mtime;			/* Time of last modification.  */
+    unsigned long int st_mtimensec;	/* Nsecs of last modification.  */
+    __time_t st_ctime;			/* Time of last status change.  */
+    unsigned long int st_ctimensec;	/* Nsecs of last status change.  */
+#endif
+#ifndef __USE_FILE_OFFSET64
+    unsigned long int __glibc_reserved4;
+    unsigned long int __glibc_reserved5;
+#else
+    __ino64_t st_ino;			/* File serial number (64-bit type, placed last).  */
+#endif
+  };
+
+#ifdef __USE_LARGEFILE64
+struct stat64
+  {
+    __dev_t st_dev;			/* Device.  */
+    unsigned short int __pad1;
+
+    __ino_t __st_ino;			/* 32bit file serial number; the real st_ino is the last member.  */
+    __mode_t st_mode;			/* File mode.  */
+    __nlink_t st_nlink;			/* Link count.  */
+    __uid_t st_uid;			/* User ID of the file's owner.	*/
+    __gid_t st_gid;			/* Group ID of the file's group.*/
+    __dev_t st_rdev;			/* Device number, if device.  */
+    unsigned short int __pad2;
+    __off64_t st_size;			/* Size of file, in bytes.  */
+    __blksize_t st_blksize;		/* Optimal block size for I/O.  */
+
+    __blkcnt64_t st_blocks;		/* Number 512-byte blocks allocated. */
+# ifdef __USE_XOPEN2K8
+    /* Nanosecond resolution timestamps are stored in a format
+       equivalent to 'struct timespec'.  This is the type used
+       whenever possible but the Unix namespace rules do not allow the
+       identifier 'timespec' to appear in the <sys/stat.h> header.
+       Therefore we have to handle the use of this header in strictly
+       standard-compliant sources specially.  */
+    struct timespec st_atim;		/* Time of last access.  */
+    struct timespec st_mtim;		/* Time of last modification.  */
+    struct timespec st_ctim;		/* Time of last status change.  */
+# else
+    __time_t st_atime;			/* Time of last access.  */
+    unsigned long int st_atimensec;	/* Nsecs of last access.  */
+    __time_t st_mtime;			/* Time of last modification.  */
+    unsigned long int st_mtimensec;	/* Nsecs of last modification.  */
+    __time_t st_ctime;			/* Time of last status change.  */
+    unsigned long int st_ctimensec;	/* Nsecs of last status change.  */
+# endif
+    __ino64_t st_ino;			/* File serial number (64-bit type, placed last).  */
+  };
+#endif
+
+/* Tell code we have these members (st_blksize and st_rdev); both macros are defined empty.  */
+#define	_STATBUF_ST_BLKSIZE
+#define _STATBUF_ST_RDEV
+/* Nanosecond resolution time values are supported.  */
+#define _STATBUF_ST_NSEC
+
+#endif /* _BITS_STRUCT_STAT_H  */
--- a/lib/libc/include/m68k-linux-gnu/fpu_control.h
+++ b/lib/libc/include/m68k-linux-gnu/fpu_control.h
@ -0,0 +1,118 @@
+/* 68k FPU control word definitions.
+   Copyright (C) 1996-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _FPU_CONTROL_H
+#define _FPU_CONTROL_H
+
+/*
+ * Motorola floating point control register bits.
+ *
+ * 31-16  -> reserved (read as 0, ignored on write)
+ * 15     -> enable trap for BSUN exception
+ * 14     -> enable trap for SNAN exception
+ * 13     -> enable trap for OPERR exception
+ * 12     -> enable trap for OVFL exception
+ * 11     -> enable trap for UNFL exception
+ * 10     -> enable trap for DZ exception
+ *  9     -> enable trap for INEX2 exception (INEX on Coldfire)
+ *  8     -> enable trap for INEX1 exception (IDE on Coldfire)
+ *  7-6   -> Precision Control (only bit 6 is used on Coldfire)
+ *  5-4   -> Rounding Control
+ *  3-0   -> zero (read as 0, write as 0)
+ *
+ *
+ * Precision Control:
+ * 00 - round to extended precision
+ * 01 - round to single precision
+ * 10 - round to double precision
+ * 11 - undefined
+ *
+ * Rounding Control:
+ * 00 - rounding to nearest (RN)
+ * 01 - rounding toward zero (RZ)
+ * 10 - rounding (down) toward minus infinity (RM)
+ * 11 - rounding (up) toward plus infinity (RP)
+ *
+ * The hardware default is 0x0000. I choose 0x5400 (NOTE: _FPU_DEFAULT below is nevertheless defined as 0x00000000).
+ */
+
+#include <features.h>
+
+#if defined (__mcoldfire__) && !defined (__mcffpu__)
+
+# define _FPU_RESERVED 0xffffffff	/* Every bit is reserved in this (Coldfire without __mcffpu__) branch.  */
+# define _FPU_DEFAULT  0x00000000
+# define _FPU_GETCW(cw) ((cw) = 0)	/* No hardware access: the control word always reads as 0.  */
+# define _FPU_SETCW(cw) ((void) (cw))	/* No hardware access: the value is evaluated and discarded.  */
+
+#else
+
+/* masking of interrupts */
+# define _FPU_MASK_BSUN  0x8000
+# define _FPU_MASK_SNAN  0x4000
+# define _FPU_MASK_OPERR 0x2000
+# define _FPU_MASK_OVFL  0x1000
+# define _FPU_MASK_UNFL  0x0800
+# define _FPU_MASK_DZ    0x0400
+# define _FPU_MASK_INEX1 0x0200	/* NOTE(review): this is bit 9, which the table at the top of the file labels INEX2 -- confirm.  */
+# define _FPU_MASK_INEX2 0x0100	/* NOTE(review): this is bit 8, which the table at the top of the file labels INEX1 -- confirm.  */
+
+/* precision control */
+# ifdef __mcoldfire__
+#  define _FPU_DOUBLE   0x00
+# else
+#  define _FPU_EXTENDED 0x00   /* RECOMMENDED */
+#  define _FPU_DOUBLE   0x80
+# endif
+# define _FPU_SINGLE   0x40     /* DO NOT USE */
+
+/* rounding control */
+# define _FPU_RC_NEAREST 0x00    /* RECOMMENDED */
+# define _FPU_RC_ZERO    0x10
+# define _FPU_RC_DOWN    0x20
+# define _FPU_RC_UP      0x30
+
+# ifdef __mcoldfire__
+#  define _FPU_RESERVED 0xFFFF800F	/* Differs from the non-Coldfire mask only in bit 15.  */
+# else
+#  define _FPU_RESERVED 0xFFFF000F  /* Reserved bits in fpucr */
+# endif
+
+
+/* Now two recommended fpucr */
+
+/* The fdlibm code requires no interrupts for exceptions.  Don't
+   change the rounding mode, it would break long double I/O!  */
+# define _FPU_DEFAULT  0x00000000
+
+/* IEEE:  same as above, but exceptions.  We must make it non-zero so
+   that __setfpucw works.  This bit will be ignored.  */
+# define _FPU_IEEE     0x00000001
+
+/* Macros for accessing the hardware control word.  */
+# define _FPU_GETCW(cw) __asm__ ("fmove%.l %!, %0" : "=dm" (cw))
+# define _FPU_SETCW(cw) __asm__ volatile ("fmove%.l %0, %!" : : "dm" (cw))
+#endif
+
+/* Type of the control word (unsigned int forced to SImode).  */
+typedef unsigned int fpu_control_t __attribute__ ((__mode__ (__SI__)));
+
+/* Default control word set at startup; only declared here, defined elsewhere.  */
+extern fpu_control_t __fpu_control;
+
+#endif /* _FPU_CONTROL_H */
--- a/lib/libc/include/m68k-linux-gnu/gnu/lib-names.h
+++ b/lib/libc/include/m68k-linux-gnu/gnu/lib-names.h
@ -0,0 +1,31 @@
+/* This file is automatically generated.
+   It defines macros to allow user programs to find the shared
+   library files which come as part of GNU libc.  */
+#ifndef __GNU_LIB_NAMES_H
+#define __GNU_LIB_NAMES_H	1
+
+#define LD_SO                           "ld.so.1"
+#define LIBANL_SO                       "libanl.so.1"
+#define LIBBROKENLOCALE_SO              "libBrokenLocale.so.1"
+#define LIBCRYPT_SO                     "libcrypt.so.1"
+#define LIBC_SO                         "libc.so.6"
+#define LIBDL_SO                        "libdl.so.2"
+#define LIBGCC_S_SO                     "libgcc_s.so.2"
+#define LIBMVEC_SO                      "libmvec.so.1"
+#define LIBM_SO                         "libm.so.6"
+#define LIBNSL_SO                       "libnsl.so.1"
+#define LIBNSS_COMPAT_SO                "libnss_compat.so.2"
+#define LIBNSS_DB_SO                    "libnss_db.so.2"
+#define LIBNSS_DNS_SO                   "libnss_dns.so.2"
+#define LIBNSS_FILES_SO                 "libnss_files.so.2"
+#define LIBNSS_HESIOD_SO                "libnss_hesiod.so.2"
+#define LIBNSS_LDAP_SO                  "libnss_ldap.so.2"
+#define LIBNSS_TEST1_SO                 "libnss_test1.so.2"
+#define LIBNSS_TEST2_SO                 "libnss_test2.so.2"
+#define LIBPTHREAD_SO                   "libpthread.so.0"
+#define LIBRESOLV_SO                    "libresolv.so.2"
+#define LIBRT_SO                        "librt.so.1"
+#define LIBTHREAD_DB_SO                 "libthread_db.so.1"
+#define LIBUTIL_SO                      "libutil.so.1"
+
+#endif	/* gnu/lib-names.h */
--- a/lib/libc/include/m68k-linux-gnu/gnu/stubs.h
+++ b/lib/libc/include/m68k-linux-gnu/gnu/stubs.h
@ -0,0 +1,16 @@
+/* This file is automatically generated.
+   It defines a symbol `__stub_FUNCTION' for each function
+   in the C library which is a stub, meaning it will fail
+   every time called, usually setting errno to ENOSYS.  */
+
+#ifdef _LIBC
+ #error Applications may not define the macro _LIBC
+#endif
+
+#define __stub_chflags	/* Each __stub_* macro is defined empty, so test it with #ifdef.  */
+#define __stub_fchflags
+#define __stub_gtty
+#define __stub_revoke
+#define __stub_setlogin
+#define __stub_sigreturn
+#define __stub_stty
--- a/lib/libc/include/m68k-linux-gnu/sys/reg.h
+++ b/lib/libc/include/m68k-linux-gnu/sys/reg.h
@ -0,0 +1,101 @@
+/* Copyright (C) 1998-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_REG_H
+#define _SYS_REG_H	1
+
+/* Index into an array of 4 byte integers returned from ptrace for
+   location of the users' stored registers.  D0 sits at 14, after A6. */
+
+enum
+{
+  PT_D1 = 0,
+#define PT_D1 PT_D1
+  PT_D2 = 1,
+#define PT_D2 PT_D2
+  PT_D3 = 2,
+#define PT_D3 PT_D3
+  PT_D4 = 3,
+#define PT_D4 PT_D4
+  PT_D5 = 4,
+#define PT_D5 PT_D5
+  PT_D6 = 5,
+#define PT_D6 PT_D6
+  PT_D7 = 6,
+#define PT_D7 PT_D7
+  PT_A0 = 7,
+#define PT_A0 PT_A0
+  PT_A1 = 8,
+#define PT_A1 PT_A1
+  PT_A2 = 9,
+#define PT_A2 PT_A2
+  PT_A3 = 10,
+#define PT_A3 PT_A3
+  PT_A4 = 11,
+#define PT_A4 PT_A4
+  PT_A5 = 12,
+#define PT_A5 PT_A5
+  PT_A6 = 13,
+#define PT_A6 PT_A6
+  PT_D0 = 14,
+#define PT_D0 PT_D0
+  PT_USP = 15,
+#define PT_USP PT_USP
+  PT_ORIG_D0 = 16,
+#define PT_ORIG_D0 PT_ORIG_D0
+  PT_SR = 17,
+#define PT_SR PT_SR
+  PT_PC = 18,	/* Indices 19 and 20 are not named in this enum.  */
+#define PT_PC PT_PC
+
+#ifdef __mcoldfire__
+  PT_FP0 = 21,	/* Coldfire layout: indices advance by two per FP register.  */
+  PT_FP1 = 23,
+  PT_FP2 = 25,
+  PT_FP3 = 27,
+  PT_FP4 = 29,
+  PT_FP5 = 31,
+  PT_FP6 = 33,
+  PT_FP7 = 35,
+#else
+  PT_FP0 = 21,	/* Non-Coldfire layout: indices advance by three per FP register.  */
+  PT_FP1 = 24,
+  PT_FP2 = 27,
+  PT_FP3 = 30,
+  PT_FP4 = 33,
+  PT_FP5 = 36,
+  PT_FP6 = 39,
+  PT_FP7 = 42,
+#endif
+#define PT_FP0 PT_FP0
+#define PT_FP1 PT_FP1
+#define PT_FP2 PT_FP2
+#define PT_FP3 PT_FP3
+#define PT_FP4 PT_FP4
+#define PT_FP5 PT_FP5
+#define PT_FP6 PT_FP6
+#define PT_FP7 PT_FP7
+
+  PT_FPCR = 45,	/* The three control indices are the same in both layouts.  */
+#define PT_FPCR PT_FPCR
+  PT_FPSR = 46,
+#define PT_FPSR PT_FPSR
+  PT_FPIAR = 47
+#define PT_FPIAR PT_FPIAR
+};
+
+#endif	/* _SYS_REG_H */
--- a/lib/libc/include/m68k-linux-gnu/sys/ucontext.h
+++ b/lib/libc/include/m68k-linux-gnu/sys/ucontext.h
@ -0,0 +1,130 @@
+/* Copyright (C) 1997-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* System V/m68k ABI compliant context switching support.  */
+
+#ifndef _SYS_UCONTEXT_H
+#define _SYS_UCONTEXT_H	1
+
+#include <features.h>
+
+#include <bits/types/sigset_t.h>
+#include <bits/types/stack_t.h>
+
+
+/* Type for general register.  */
+typedef int greg_t;
+
+/* Number of general registers (the R_* indices below run from 0 to 17).  */
+#define __NGREG	18
+#ifdef __USE_MISC
+# define NGREG	__NGREG
+#endif
+
+/* Container for all general registers.  */
+typedef greg_t gregset_t[__NGREG];
+
+#ifdef __USE_MISC
+/* Number of each register in the `gregset_t' array.  */
+enum
+{
+  R_D0 = 0,
+# define R_D0	R_D0
+  R_D1 = 1,
+# define R_D1	R_D1
+  R_D2 = 2,
+# define R_D2	R_D2
+  R_D3 = 3,
+# define R_D3	R_D3
+  R_D4 = 4,
+# define R_D4	R_D4
+  R_D5 = 5,
+# define R_D5	R_D5
+  R_D6 = 6,
+# define R_D6	R_D6
+  R_D7 = 7,
+# define R_D7	R_D7
+  R_A0 = 8,
+# define R_A0	R_A0
+  R_A1 = 9,
+# define R_A1	R_A1
+  R_A2 = 10,
+# define R_A2	R_A2
+  R_A3 = 11,
+# define R_A3	R_A3
+  R_A4 = 12,
+# define R_A4	R_A4
+  R_A5 = 13,
+# define R_A5	R_A5
+  R_A6 = 14,
+# define R_A6	R_A6
+  R_A7 = 15,
+# define R_A7	R_A7
+  R_SP = 15,	/* Same index as R_A7.  */
+# define R_SP	R_SP
+  R_PC = 16,
+# define R_PC	R_PC
+  R_PS = 17
+# define R_PS	R_PS
+};
+#endif
+
+#ifdef __USE_MISC
+# define __ctx(fld) fld	/* Use the plain field name.  */
+#else
+# define __ctx(fld) __ ## fld	/* Otherwise paste a "__" prefix onto the field name.  */
+#endif
+
+/* Structure to describe FPU registers; field names go through __ctx.  */
+typedef struct
+{
+  int __ctx(f_pcr);
+  int __ctx(f_psr);
+  int __ctx(f_fpiaddr);
+#ifdef __mcoldfire__
+  int __ctx(f_fpregs)[8][2];	/* Eight registers, two ints each.  */
+#else
+  int __ctx(f_fpregs)[8][3];	/* Eight registers, three ints each.  */
+#endif
+} fpregset_t;
+
+/* Context to describe whole processor state.  */
+typedef struct
+{
+  int __ctx(version);	/* NOTE(review): presumably holds MCONTEXT_VERSION -- confirm against the code that fills it in.  */
+  gregset_t __ctx(gregs);
+  fpregset_t __ctx(fpregs);
+} mcontext_t;
+
+#ifdef __USE_MISC
+# define MCONTEXT_VERSION 2
+#endif
+
+/* Userlevel context.  */
+typedef struct ucontext_t
+{
+  unsigned long __ctx(uc_flags);	/* The only member whose name goes through __ctx.  */
+  struct ucontext_t *uc_link;
+  stack_t uc_stack;
+  mcontext_t uc_mcontext;
+  unsigned long __glibc_reserved1[80];
+  sigset_t uc_sigmask;
+} ucontext_t;
+
+#undef __ctx	/* The helper macro is not left defined after this header.  */
+
+#endif /* sys/ucontext.h */
--- a/lib/libc/include/m68k-linux-gnu/sys/user.h
+++ b/lib/libc/include/m68k-linux-gnu/sys/user.h
@ -0,0 +1,65 @@
+/* Copyright (C) 2008-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SYS_USER_H
+#define _SYS_USER_H	1
+
+/* The whole purpose of this file is for GDB and GDB only.  Don't read
+   too much into it.  Don't use it for anything other than GDB unless
+   you know what you are doing.  */
+
+struct user_m68kfp_struct {
+	unsigned long fpregs[8*3];	/* 24 words; presumably 8 registers x 3 words each -- confirm */
+	unsigned long fpcntl[3];	/* NOTE(review): presumably the FP control registers -- confirm */
+};
+
+struct user_regs_struct {
+	long d1, d2, d3, d4, d5, d6, d7;	/* d0 is not first: it is declared after a6 below */
+	long a0, a1, a2, a3, a4, a5, a6;
+	long d0;
+	long usp;
+	long orig_d0;
+	short stkadj;
+	short sr;
+	long pc;
+	short fmtvec;
+	short __fill;	/* NOTE(review): looks like trailing padding -- confirm */
+};
+
+struct user {
+	struct user_regs_struct regs;
+	int u_fpvalid;	/* NOTE(review): presumably nonzero when m68kfp is meaningful -- confirm */
+	struct user_m68kfp_struct m68kfp;
+	unsigned long int u_tsize;
+	unsigned long int u_dsize;
+	unsigned long int u_ssize;	/* multiplied by NBPG in HOST_STACK_END_ADDR, so counted in NBPG-sized units */
+	unsigned long start_code;	/* read by HOST_TEXT_START_ADDR */
+	unsigned long start_stack;	/* base term of HOST_STACK_END_ADDR */
+	long int signal;
+	int reserved;
+	unsigned long u_ar0;
+	struct user_m68kfp_struct *u_fpstate;
+	unsigned long magic;
+	char u_comm[32];
+};
+
+#define NBPG 4096	/* NOTE(review): presumably bytes per page -- confirm */
+#define UPAGES 1
+#define HOST_TEXT_START_ADDR u.start_code	/* expands to a member access on an object named u */
+#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)	/* likewise requires u in scope at the use site */
+
+#endif
--- a/lib/libc/include/m68k-linux-musl/bits/alltypes.h
+++ b/lib/libc/include/m68k-linux-musl/bits/alltypes.h
@ -0,0 +1,424 @@
+#define _REDIR_TIME64 1
+#define _Addr int
+#define _Int64 long long
+#define _Reg int
+
+#define __BYTE_ORDER 4321
+#define __LONG_MAX 0x7fffffffL
+
+#ifndef __cplusplus	/* wchar_t is only typedef'd here when not compiling as C++ */
+#ifdef __WCHAR_TYPE__
+#if defined(__NEED_wchar_t) && !defined(__DEFINED_wchar_t)
+typedef __WCHAR_TYPE__ wchar_t;
+#define __DEFINED_wchar_t
+#endif
+
+#else	/* !__WCHAR_TYPE__: fall back to long */
+#if defined(__NEED_wchar_t) && !defined(__DEFINED_wchar_t)
+typedef long wchar_t;
+#define __DEFINED_wchar_t
+#endif
+
+#endif	/* __WCHAR_TYPE__ */
+#endif	/* !__cplusplus */
+
+#if __mcffpu__	/* nonzero: float_t is float and double_t is double */
+#if defined(__NEED_float_t) && !defined(__DEFINED_float_t)
+typedef float float_t;
+#define __DEFINED_float_t
+#endif
+
+#if defined(__NEED_double_t) && !defined(__DEFINED_double_t)
+typedef double double_t;
+#define __DEFINED_double_t
+#endif
+
+#else	/* !__mcffpu__: both float_t and double_t are long double */
+#if defined(__NEED_float_t) && !defined(__DEFINED_float_t)
+typedef long double float_t;
+#define __DEFINED_float_t
+#endif
+
+#if defined(__NEED_double_t) && !defined(__DEFINED_double_t)
+typedef long double double_t;
+#define __DEFINED_double_t
+#endif
+
+#endif	/* __mcffpu__ */
+
+#if defined(__NEED_max_align_t) && !defined(__DEFINED_max_align_t)
+typedef struct { long long __ll; long double __ld; } max_align_t;
+#define __DEFINED_max_align_t
+#endif
+
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+#define __USE_TIME_BITS64 1
+
+#if defined(__NEED_size_t) && !defined(__DEFINED_size_t)
+typedef unsigned _Addr size_t;
+#define __DEFINED_size_t
+#endif
+
+#if defined(__NEED_uintptr_t) && !defined(__DEFINED_uintptr_t)
+typedef unsigned _Addr uintptr_t;
+#define __DEFINED_uintptr_t
+#endif
+
+#if defined(__NEED_ptrdiff_t) && !defined(__DEFINED_ptrdiff_t)
+typedef _Addr ptrdiff_t;
+#define __DEFINED_ptrdiff_t
+#endif
+
+#if defined(__NEED_ssize_t) && !defined(__DEFINED_ssize_t)
+typedef _Addr ssize_t;
+#define __DEFINED_ssize_t
+#endif
+
+#if defined(__NEED_intptr_t) && !defined(__DEFINED_intptr_t)
+typedef _Addr intptr_t;
+#define __DEFINED_intptr_t
+#endif
+
+#if defined(__NEED_regoff_t) && !defined(__DEFINED_regoff_t)
+typedef _Addr regoff_t;
+#define __DEFINED_regoff_t
+#endif
+
+#if defined(__NEED_register_t) && !defined(__DEFINED_register_t)
+typedef _Reg register_t;
+#define __DEFINED_register_t
+#endif
+
+#if defined(__NEED_time_t) && !defined(__DEFINED_time_t)
+typedef _Int64 time_t;
+#define __DEFINED_time_t
+#endif
+
+#if defined(__NEED_suseconds_t) && !defined(__DEFINED_suseconds_t)
+typedef _Int64 suseconds_t;
+#define __DEFINED_suseconds_t
+#endif
+
+
+#if defined(__NEED_int8_t) && !defined(__DEFINED_int8_t)
+typedef signed char     int8_t;
+#define __DEFINED_int8_t
+#endif
+
+#if defined(__NEED_int16_t) && !defined(__DEFINED_int16_t)
+typedef signed short    int16_t;
+#define __DEFINED_int16_t
+#endif
+
+#if defined(__NEED_int32_t) && !defined(__DEFINED_int32_t)
+typedef signed int      int32_t;
+#define __DEFINED_int32_t
+#endif
+
+#if defined(__NEED_int64_t) && !defined(__DEFINED_int64_t)
+typedef signed _Int64   int64_t;
+#define __DEFINED_int64_t
+#endif
+
+#if defined(__NEED_intmax_t) && !defined(__DEFINED_intmax_t)
+typedef signed _Int64   intmax_t;
+#define __DEFINED_intmax_t
+#endif
+
+#if defined(__NEED_uint8_t) && !defined(__DEFINED_uint8_t)
+typedef unsigned char   uint8_t;
+#define __DEFINED_uint8_t
+#endif
+
+#if defined(__NEED_uint16_t) && !defined(__DEFINED_uint16_t)
+typedef unsigned short  uint16_t;
+#define __DEFINED_uint16_t
+#endif
+
+#if defined(__NEED_uint32_t) && !defined(__DEFINED_uint32_t)
+typedef unsigned int    uint32_t;
+#define __DEFINED_uint32_t
+#endif
+
+#if defined(__NEED_uint64_t) && !defined(__DEFINED_uint64_t)
+typedef unsigned _Int64 uint64_t;
+#define __DEFINED_uint64_t
+#endif
+
+#if defined(__NEED_u_int64_t) && !defined(__DEFINED_u_int64_t)
+typedef unsigned _Int64 u_int64_t;
+#define __DEFINED_u_int64_t
+#endif
+
+#if defined(__NEED_uintmax_t) && !defined(__DEFINED_uintmax_t)
+typedef unsigned _Int64 uintmax_t;
+#define __DEFINED_uintmax_t
+#endif
+
+
+#if defined(__NEED_mode_t) && !defined(__DEFINED_mode_t)
+typedef unsigned mode_t;
+#define __DEFINED_mode_t
+#endif
+
+#if defined(__NEED_nlink_t) && !defined(__DEFINED_nlink_t)
+typedef unsigned _Reg nlink_t;
+#define __DEFINED_nlink_t
+#endif
+
+#if defined(__NEED_off_t) && !defined(__DEFINED_off_t)
+typedef _Int64 off_t;
+#define __DEFINED_off_t
+#endif
+
+#if defined(__NEED_ino_t) && !defined(__DEFINED_ino_t)
+typedef unsigned _Int64 ino_t;
+#define __DEFINED_ino_t
+#endif
+
+#if defined(__NEED_dev_t) && !defined(__DEFINED_dev_t)
+typedef unsigned _Int64 dev_t;
+#define __DEFINED_dev_t
+#endif
+
+#if defined(__NEED_blksize_t) && !defined(__DEFINED_blksize_t)
+typedef long blksize_t;
+#define __DEFINED_blksize_t
+#endif
+
+#if defined(__NEED_blkcnt_t) && !defined(__DEFINED_blkcnt_t)
+typedef _Int64 blkcnt_t;
+#define __DEFINED_blkcnt_t
+#endif
+
+#if defined(__NEED_fsblkcnt_t) && !defined(__DEFINED_fsblkcnt_t)
+typedef unsigned _Int64 fsblkcnt_t;
+#define __DEFINED_fsblkcnt_t
+#endif
+
+#if defined(__NEED_fsfilcnt_t) && !defined(__DEFINED_fsfilcnt_t)
+typedef unsigned _Int64 fsfilcnt_t;
+#define __DEFINED_fsfilcnt_t
+#endif
+
+
+#if defined(__NEED_wint_t) && !defined(__DEFINED_wint_t)
+typedef unsigned wint_t;
+#define __DEFINED_wint_t
+#endif
+
+#if defined(__NEED_wctype_t) && !defined(__DEFINED_wctype_t)
+typedef unsigned long wctype_t;
+#define __DEFINED_wctype_t
+#endif
+
+
+#if defined(__NEED_timer_t) && !defined(__DEFINED_timer_t)
+typedef void * timer_t;
+#define __DEFINED_timer_t
+#endif
+
+#if defined(__NEED_clockid_t) && !defined(__DEFINED_clockid_t)
+typedef int clockid_t;
+#define __DEFINED_clockid_t
+#endif
+
+#if defined(__NEED_clock_t) && !defined(__DEFINED_clock_t)
+typedef long clock_t;
+#define __DEFINED_clock_t
+#endif
+
+#if defined(__NEED_struct_timeval) && !defined(__DEFINED_struct_timeval)
+struct timeval { time_t tv_sec; suseconds_t tv_usec; };
+#define __DEFINED_struct_timeval
+#endif
+
+#if defined(__NEED_struct_timespec) && !defined(__DEFINED_struct_timespec)
+struct timespec { time_t tv_sec; int :8*(sizeof(time_t)-sizeof(long))*(__BYTE_ORDER==4321); long tv_nsec; int :8*(sizeof(time_t)-sizeof(long))*(__BYTE_ORDER!=4321); };	/* unnamed bit-fields: width is the time_t/long size difference in bits, placed before tv_nsec when __BYTE_ORDER==4321 and after it otherwise (the other one is zero-width) */
+#define __DEFINED_struct_timespec
+#endif
+
+
+#if defined(__NEED_pid_t) && !defined(__DEFINED_pid_t)
+typedef int pid_t;
+#define __DEFINED_pid_t
+#endif
+
+#if defined(__NEED_id_t) && !defined(__DEFINED_id_t)
+typedef unsigned id_t;
+#define __DEFINED_id_t
+#endif
+
+#if defined(__NEED_uid_t) && !defined(__DEFINED_uid_t)
+typedef unsigned uid_t;
+#define __DEFINED_uid_t
+#endif
+
+#if defined(__NEED_gid_t) && !defined(__DEFINED_gid_t)
+typedef unsigned gid_t;
+#define __DEFINED_gid_t
+#endif
+
+#if defined(__NEED_key_t) && !defined(__DEFINED_key_t)
+typedef int key_t;
+#define __DEFINED_key_t
+#endif
+
+#if defined(__NEED_useconds_t) && !defined(__DEFINED_useconds_t)
+typedef unsigned useconds_t;
+#define __DEFINED_useconds_t
+#endif
+
+
+#ifdef __cplusplus	/* C++ sees pthread_t as unsigned long */
+#if defined(__NEED_pthread_t) && !defined(__DEFINED_pthread_t)
+typedef unsigned long pthread_t;
+#define __DEFINED_pthread_t
+#endif
+
+#else	/* !__cplusplus: pthread_t is a pointer to struct __pthread */
+#if defined(__NEED_pthread_t) && !defined(__DEFINED_pthread_t)
+typedef struct __pthread * pthread_t;
+#define __DEFINED_pthread_t
+#endif
+
+#endif	/* __cplusplus */
+#if defined(__NEED_pthread_once_t) && !defined(__DEFINED_pthread_once_t)
+typedef int pthread_once_t;
+#define __DEFINED_pthread_once_t
+#endif
+
+#if defined(__NEED_pthread_key_t) && !defined(__DEFINED_pthread_key_t)
+typedef unsigned pthread_key_t;
+#define __DEFINED_pthread_key_t
+#endif
+
+#if defined(__NEED_pthread_spinlock_t) && !defined(__DEFINED_pthread_spinlock_t)
+typedef int pthread_spinlock_t;
+#define __DEFINED_pthread_spinlock_t
+#endif
+
+#if defined(__NEED_pthread_mutexattr_t) && !defined(__DEFINED_pthread_mutexattr_t)
+typedef struct { unsigned __attr; } pthread_mutexattr_t;
+#define __DEFINED_pthread_mutexattr_t
+#endif
+
+#if defined(__NEED_pthread_condattr_t) && !defined(__DEFINED_pthread_condattr_t)
+typedef struct { unsigned __attr; } pthread_condattr_t;
+#define __DEFINED_pthread_condattr_t
+#endif
+
+#if defined(__NEED_pthread_barrierattr_t) && !defined(__DEFINED_pthread_barrierattr_t)
+typedef struct { unsigned __attr; } pthread_barrierattr_t;
+#define __DEFINED_pthread_barrierattr_t
+#endif
+
+#if defined(__NEED_pthread_rwlockattr_t) && !defined(__DEFINED_pthread_rwlockattr_t)
+typedef struct { unsigned __attr[2]; } pthread_rwlockattr_t;
+#define __DEFINED_pthread_rwlockattr_t
+#endif
+
+
+#if defined(__NEED_struct__IO_FILE) && !defined(__DEFINED_struct__IO_FILE)
+struct _IO_FILE { char __x; };
+#define __DEFINED_struct__IO_FILE
+#endif
+
+#if defined(__NEED_FILE) && !defined(__DEFINED_FILE)
+typedef struct _IO_FILE FILE;
+#define __DEFINED_FILE
+#endif
+
+
+#if defined(__NEED_va_list) && !defined(__DEFINED_va_list)
+typedef __builtin_va_list va_list;
+#define __DEFINED_va_list
+#endif
+
+#if defined(__NEED___isoc_va_list) && !defined(__DEFINED___isoc_va_list)
+typedef __builtin_va_list __isoc_va_list;
+#define __DEFINED___isoc_va_list
+#endif
+
+
+#if defined(__NEED_mbstate_t) && !defined(__DEFINED_mbstate_t)
+typedef struct __mbstate_t { unsigned __opaque1, __opaque2; } mbstate_t;
+#define __DEFINED_mbstate_t
+#endif
+
+
+#if defined(__NEED_locale_t) && !defined(__DEFINED_locale_t)
+typedef struct __locale_struct * locale_t;
+#define __DEFINED_locale_t
+#endif
+
+
+#if defined(__NEED_sigset_t) && !defined(__DEFINED_sigset_t)
+typedef struct __sigset_t { unsigned long __bits[128/sizeof(long)]; } sigset_t;
+#define __DEFINED_sigset_t
+#endif
+
+
+#if defined(__NEED_struct_iovec) && !defined(__DEFINED_struct_iovec)
+struct iovec { void *iov_base; size_t iov_len; };
+#define __DEFINED_struct_iovec
+#endif
+
+
+#if defined(__NEED_struct_winsize) && !defined(__DEFINED_struct_winsize)
+struct winsize { unsigned short ws_row, ws_col, ws_xpixel, ws_ypixel; };
+#define __DEFINED_struct_winsize
+#endif
+
+
+#if defined(__NEED_socklen_t) && !defined(__DEFINED_socklen_t)
+typedef unsigned socklen_t;
+#define __DEFINED_socklen_t
+#endif
+
+#if defined(__NEED_sa_family_t) && !defined(__DEFINED_sa_family_t)
+typedef unsigned short sa_family_t;
+#define __DEFINED_sa_family_t
+#endif
+
+
+#if defined(__NEED_pthread_attr_t) && !defined(__DEFINED_pthread_attr_t)
+typedef struct { union { int __i[sizeof(long)==8?14:9]; volatile int __vi[sizeof(long)==8?14:9]; unsigned long __s[sizeof(long)==8?7:9]; } __u; } pthread_attr_t;
+#define __DEFINED_pthread_attr_t
+#endif
+
+#if defined(__NEED_pthread_mutex_t) && !defined(__DEFINED_pthread_mutex_t)
+typedef struct { union { int __i[sizeof(long)==8?10:6]; volatile int __vi[sizeof(long)==8?10:6]; volatile void *volatile __p[sizeof(long)==8?5:6]; } __u; } pthread_mutex_t;
+#define __DEFINED_pthread_mutex_t
+#endif
+
+#if defined(__NEED_mtx_t) && !defined(__DEFINED_mtx_t)
+typedef struct { union { int __i[sizeof(long)==8?10:6]; volatile int __vi[sizeof(long)==8?10:6]; volatile void *volatile __p[sizeof(long)==8?5:6]; } __u; } mtx_t;
+#define __DEFINED_mtx_t
+#endif
+
+#if defined(__NEED_pthread_cond_t) && !defined(__DEFINED_pthread_cond_t)
+typedef struct { union { int __i[12]; volatile int __vi[12]; void *__p[12*sizeof(int)/sizeof(void*)]; } __u; } pthread_cond_t;
+#define __DEFINED_pthread_cond_t
+#endif
+
+#if defined(__NEED_cnd_t) && !defined(__DEFINED_cnd_t)
+typedef struct { union { int __i[12]; volatile int __vi[12]; void *__p[12*sizeof(int)/sizeof(void*)]; } __u; } cnd_t;
+#define __DEFINED_cnd_t
+#endif
+
+#if defined(__NEED_pthread_rwlock_t) && !defined(__DEFINED_pthread_rwlock_t)
+typedef struct { union { int __i[sizeof(long)==8?14:8]; volatile int __vi[sizeof(long)==8?14:8]; void *__p[sizeof(long)==8?7:8]; } __u; } pthread_rwlock_t;
+#define __DEFINED_pthread_rwlock_t
+#endif
+
+#if defined(__NEED_pthread_barrier_t) && !defined(__DEFINED_pthread_barrier_t)
+typedef struct { union { int __i[sizeof(long)==8?8:5]; volatile int __vi[sizeof(long)==8?8:5]; void *__p[sizeof(long)==8?4:5]; } __u; } pthread_barrier_t;
+#define __DEFINED_pthread_barrier_t
+#endif
+
+
+#undef _Addr
+#undef _Int64
+#undef _Reg
--- a/lib/libc/include/m68k-linux-musl/bits/fcntl.h
+++ b/lib/libc/include/m68k-linux-musl/bits/fcntl.h
@ -7,13 +7,13 @@
 #define O_DSYNC      010000
 #define O_SYNC     04010000
 #define O_RSYNC    04010000
-#define O_DIRECTORY 0200000
-#define O_NOFOLLOW  0400000
+#define O_DIRECTORY  040000
+#define O_NOFOLLOW  0100000
 #define O_CLOEXEC  02000000

 #define O_ASYNC      020000
-#define O_DIRECT     040000
-#define O_LARGEFILE 0100000
+#define O_DIRECT    0200000
+#define O_LARGEFILE 0400000
 #define O_NOATIME  01000000
 #define O_PATH    010000000
 #define O_TMPFILE 020200000
--- a/lib/libc/include/m68k-linux-musl/bits/fenv.h
+++ b/lib/libc/include/m68k-linux-musl/bits/fenv.h
@ -0,0 +1,29 @@
+#if __HAVE_68881__ || __mcffpu__	/* either macro nonzero selects the full set of flags and rounding modes */
+
+#define FE_INEXACT    8
+#define FE_DIVBYZERO  16
+#define FE_UNDERFLOW  32
+#define FE_OVERFLOW   64
+#define FE_INVALID    128
+
+#define FE_ALL_EXCEPT 0xf8	/* 8|16|32|64|128: the OR of the five flags above */
+
+#define FE_TONEAREST  0
+#define FE_TOWARDZERO 16
+#define FE_DOWNWARD   32
+#define FE_UPWARD     48	/* 16+32 */
+
+#else	/* neither macro nonzero: no exception flags, and FE_TONEAREST is the only rounding mode defined */
+
+#define FE_ALL_EXCEPT 0
+#define FE_TONEAREST  0
+
+#endif	/* __HAVE_68881__ || __mcffpu__ */
+
+typedef unsigned fexcept_t;	/* defined regardless of the FPU conditional above */
+
+typedef struct {
+	unsigned __control_register, __status_register, __instruction_address;	/* three unsigned words */
+} fenv_t;
+
+#define FE_DFL_ENV      ((const fenv_t *) -1)	/* the integer -1 cast to a pointer, not the address of a real fenv_t object */
--- a/lib/libc/include/m68k-linux-musl/bits/float.h
+++ b/lib/libc/include/m68k-linux-musl/bits/float.h
@ -0,0 +1,39 @@
+#if !__mcffpu__	/* long double limits with a 64-bit mantissa */
+
+#define FLT_EVAL_METHOD 2
+
+#define LDBL_TRUE_MIN 3.6451995318824746025e-4951L
+#define LDBL_MIN     1.68105157155604675313e-4932L
+#define LDBL_MAX     1.1897314953572317650e+4932L
+#define LDBL_EPSILON 1.0842021724855044340e-19L
+
+#define LDBL_MANT_DIG 64
+#define LDBL_MIN_EXP (-16382)
+#define LDBL_MAX_EXP 16384
+
+#define LDBL_DIG 18
+#define LDBL_MIN_10_EXP (-4931)
+#define LDBL_MAX_10_EXP 4932
+
+#define DECIMAL_DIG 21
+
+#else	/* __mcffpu__: long double limits with a 53-bit mantissa */
+
+#define FLT_EVAL_METHOD 0
+
+#define LDBL_TRUE_MIN 4.94065645841246544177e-324L
+#define LDBL_MIN 2.22507385850720138309e-308L
+#define LDBL_MAX 1.79769313486231570815e+308L
+#define LDBL_EPSILON 2.22044604925031308085e-16L
+
+#define LDBL_MANT_DIG 53
+#define LDBL_MIN_EXP (-1021)
+#define LDBL_MAX_EXP 1024
+
+#define LDBL_DIG 15
+#define LDBL_MIN_10_EXP (-307)
+#define LDBL_MAX_10_EXP 308
+
+#define DECIMAL_DIG 17
+
+#endif	/* !__mcffpu__ */
--- a/lib/libc/include/m68k-linux-musl/bits/ipcstat.h
+++ b/lib/libc/include/m68k-linux-musl/bits/ipcstat.h
@ -0,0 +1 @@
+#define IPC_STAT 0x102
--- a/lib/libc/include/m68k-linux-musl/bits/msg.h
+++ b/lib/libc/include/m68k-linux-musl/bits/msg.h
@ -0,0 +1,18 @@
+struct msqid_ds {
+	struct ipc_perm msg_perm;
+	unsigned long __msg_stime_lo;	/* NOTE(review): the _lo/_hi pairs look like split halves of each timestamp -- confirm against the kernel layout */
+	unsigned long __msg_stime_hi;
+	unsigned long __msg_rtime_lo;
+	unsigned long __msg_rtime_hi;
+	unsigned long __msg_ctime_lo;
+	unsigned long __msg_ctime_hi;
+	unsigned long msg_cbytes;
+	msgqnum_t msg_qnum;
+	msglen_t msg_qbytes;
+	pid_t msg_lspid;
+	pid_t msg_lrpid;
+	unsigned long __unused[2];
+	time_t msg_stime;	/* the three time_t-typed timestamps sit after the __unused words */
+	time_t msg_rtime;
+	time_t msg_ctime;
+};
--- a/lib/libc/include/m68k-linux-musl/bits/ptrace.h
+++ b/lib/libc/include/m68k-linux-musl/bits/ptrace.h
@ -0,0 +1,2 @@
+#define PTRACE_GET_THREAD_AREA	25
+#define PTRACE_SINGLEBLOCK	33
--- a/Show More
+++ b/Show More