update C header files to clang 13 rc1

Andrew Kelley 2021-08-15 18:00:10 -07:00
parent 78ff2a148a
commit 21606339af
34 changed files with 156298 additions and 7363 deletions

View File

@ -166,6 +166,8 @@ __DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double log(double __a) { return __nv_log(__a); }
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
@ -270,8 +272,6 @@ __DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
__DEVICE__ float rnormf(int __dim, const float *__t) {
return __nv_rnormf(__dim, __t);
}
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }

View File

@ -349,9 +349,14 @@ extern "C" {
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
// __assertfail() used to have a `noreturn` attribute. Unfortunately that
// contributed to triggering the longstanding bug in ptxas when assert was used
// in sufficiently convoluted code. See
// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
__device__ void __assertfail(const char *__message, const char *__file,
unsigned __line, const char *__function,
size_t __charSize) __attribute__((noreturn));
size_t __charSize);
// In order for the standard assert() macro on Linux to work, we need to
// provide a device-side __assert_fail()
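To illustrate what this change affects (a sketch, not part of the commit; the function name is hypothetical, and <assert.h> is assumed to be included in a CUDA device compile):

// On the device, assert() lowers to a guarded call to __assertfail().
// Now that __assertfail() is no longer marked noreturn, ptxas treats the
// fall-through path after the assert as reachable, sidestepping the
// miscompile referenced in the comment above.
__device__ int checked_div(int a, int b) {
  assert(b != 0); // calls __assertfail() when the condition fails
  return a / b;
}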

View File

@ -14,6 +14,7 @@
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <limits>
#include <type_traits>
@ -21,6 +22,7 @@
#endif
#include <limits.h>
#include <stdint.h>
#endif // !defined(__HIPCC_RTC__)
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
@ -34,6 +36,9 @@ __DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float fma(float __x, float __y, float __z) {
return ::fmaf(__x, __y, __z);
}
#if !defined(__HIPCC_RTC__)
// The value returned by fpclassify is platform-dependent; therefore it is
// not supported by hipRTC.
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
@ -42,11 +47,51 @@ __DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
#endif // !defined(__HIPCC_RTC__)
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
#if defined(__OPENMP_AMDGCN__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ int isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
#if defined(__OPENMP_AMDGCN__)
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
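As a usage sketch (not part of the commit; the function name is hypothetical): inside the variant region, an int-returning and a bool-returning flavor of each classifier coexist, and ordinary overloaded calls keep resolving as expected.

// Overload resolution picks the float and double classifiers declared
// above; the int-returning variants exist only to satisfy old system
// headers that declare isnan(float) returning int.
__DEVICE__ bool any_nan(float __a, double __b) {
  return isnan(__a) || isnan(__b);
}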
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
@ -59,8 +104,6 @@ __DEVICE__ bool isgreaterequal(float __x, float __y) {
__DEVICE__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
@ -79,8 +122,6 @@ __DEVICE__ bool islessgreater(float __x, float __y) {
__DEVICE__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isunordered(float __x, float __y) {
@ -207,11 +248,117 @@ template <bool __B, class __T = void> struct __hip_enable_if {};
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
namespace __hip {
template <class _Tp> struct is_integral {
enum { value = 0 };
};
template <> struct is_integral<bool> {
enum { value = 1 };
};
template <> struct is_integral<char> {
enum { value = 1 };
};
template <> struct is_integral<signed char> {
enum { value = 1 };
};
template <> struct is_integral<unsigned char> {
enum { value = 1 };
};
template <> struct is_integral<wchar_t> {
enum { value = 1 };
};
template <> struct is_integral<short> {
enum { value = 1 };
};
template <> struct is_integral<unsigned short> {
enum { value = 1 };
};
template <> struct is_integral<int> {
enum { value = 1 };
};
template <> struct is_integral<unsigned int> {
enum { value = 1 };
};
template <> struct is_integral<long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long> {
enum { value = 1 };
};
template <> struct is_integral<long long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long long> {
enum { value = 1 };
};
// TODO: specialize is_arithmetic<_Float16>
template <class _Tp> struct is_arithmetic {
enum { value = 0 };
};
template <> struct is_arithmetic<bool> {
enum { value = 1 };
};
template <> struct is_arithmetic<char> {
enum { value = 1 };
};
template <> struct is_arithmetic<signed char> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned char> {
enum { value = 1 };
};
template <> struct is_arithmetic<wchar_t> {
enum { value = 1 };
};
template <> struct is_arithmetic<short> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned short> {
enum { value = 1 };
};
template <> struct is_arithmetic<int> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned int> {
enum { value = 1 };
};
template <> struct is_arithmetic<long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long> {
enum { value = 1 };
};
template <> struct is_arithmetic<long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<float> {
enum { value = 1 };
};
template <> struct is_arithmetic<double> {
enum { value = 1 };
};
struct true_type {
static const __constant__ bool value = true;
};
struct false_type {
static const __constant__ bool value = false;
};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
template <typename __T> typename add_rvalue_reference<__T>::type declval();
// decltype is only available in C++11 and above.
#if __cplusplus >= 201103L
// __hip_promote
namespace __hip {
template <class _Tp> struct __numeric_type {
static void __test(...);
static _Float16 __test(_Float16);
@ -227,8 +374,8 @@ template <class _Tp> struct __numeric_type {
// No support for long double, use double instead.
static double __test(long double);
typedef decltype(__test(std::declval<_Tp>())) type;
static const bool value = !std::is_same<type, void>::value;
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <> struct __numeric_type<void> { static const bool value = true; };
@ -271,18 +418,17 @@ public:
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
} // namespace __hip
#endif //__cplusplus >= 201103L
} // namespace __hip
// __HIP_OVERLOAD1 is used to resolve function calls with an integer argument
// to avoid compilation errors due to ambiguity, e.g. floor(5) is resolved to
// floor(double).
#define __HIP_OVERLOAD1(__retty, __fn) \
template <typename __T> \
__DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer, \
__retty>::type \
__fn(__T __x) { \
__DEVICE__ \
typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
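For example (illustrative, not in the header; the function name is hypothetical), the generated overload is what makes an integral argument unambiguous:

// int satisfies __hip::is_integral, so this call instantiates the
// __HIP_OVERLOAD1 template and promotes the argument to double.
__DEVICE__ double floor_of_five() { return floor(5); }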
@ -293,8 +439,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ typename __hip_enable_if< \
std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \
typename __hip::__promote<__T1, __T2>::type>::type \
__fn(__T1 __x, __T2 __y) { \
typedef typename __hip::__promote<__T1, __T2>::type __result_type; \
@ -303,16 +448,14 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
#else
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ \
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \
__hip::is_arithmetic<__T2>::value, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
#endif
__HIP_OVERLOAD1(double, abs)
__HIP_OVERLOAD1(double, acos)
__HIP_OVERLOAD1(double, acosh)
__HIP_OVERLOAD1(double, asin)
@ -336,7 +479,9 @@ __HIP_OVERLOAD1(double, floor)
__HIP_OVERLOAD2(double, fmax)
__HIP_OVERLOAD2(double, fmin)
__HIP_OVERLOAD2(double, fmod)
#if !defined(__HIPCC_RTC__)
__HIP_OVERLOAD1(int, fpclassify)
#endif // !defined(__HIPCC_RTC__)
__HIP_OVERLOAD2(double, hypot)
__HIP_OVERLOAD1(int, ilogb)
__HIP_OVERLOAD1(bool, isfinite)
@ -382,9 +527,8 @@ __HIP_OVERLOAD2(double, min)
#if __cplusplus >= 201103L
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ typename __hip_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
typename __hip::__promote<__T1, __T2, __T3>::type>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
@ -392,33 +536,32 @@ fma(__T1 __x, __T2 __y, __T3 __z) {
}
#else
template <typename __T1, typename __T2, typename __T3>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return ::fma((double)__x, (double)__y, (double)__z);
}
#endif
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
frexp(__T __x, int *__exp) {
return ::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
ldexp(__T __x, int __exp) {
return ::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
modf(__T __x, double *__exp) {
return ::modf((double)__x, __exp);
}
@ -426,8 +569,8 @@ __DEVICE__
#if __cplusplus >= 201103L
template <typename __T1, typename __T2>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
typename __hip::__promote<__T1, __T2>::type>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
@ -435,25 +578,24 @@ __DEVICE__
}
#else
template <typename __T1, typename __T2>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return ::remquo((double)__x, (double)__y, __quo);
}
#endif
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbln(__T __x, long int __exp) {
return ::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbn(__T __x, int __exp) {
return ::scalbn((double)__x, __exp);
}
@ -468,14 +610,15 @@ __DEVICE__
#endif // defined(__cplusplus)
// Define these overloads inside the namespace our standard library uses.
#if !defined(__HIPCC_RTC__)
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _LIBCPP_BEGIN_NAMESPACE_STD
// Pull the new overloads we defined above into namespace std.
// using ::abs; - This may be considered for C++.
@ -620,11 +763,13 @@ _LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
} // namespace std
#endif
#endif // _LIBCPP_END_NAMESPACE_STD
#endif // !defined(__HIPCC_RTC__)
// Define device-side math functions from <ymath.h> on MSVC.
#if !defined(__HIPCC_RTC__)
#if defined(_MSC_VER)
// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
@ -658,6 +803,7 @@ __DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
}
#endif // defined(__cplusplus)
#endif // defined(_MSC_VER)
#endif // !defined(__HIPCC_RTC__)
#pragma pop_macro("__DEVICE__")

View File

@ -138,14 +138,22 @@ __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
__device__ __attribute__((const)) float
__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
__device__ __attribute__((const)) float
__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
__device__ __attribute__((const)) float
__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
__device__ __attribute__((const)) float
__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
__device__ inline __attribute__((const)) float
__llvm_amdgcn_cos_f32(float __x) {
return __builtin_amdgcn_cosf(__x);
}
__device__ inline __attribute__((const)) float
__llvm_amdgcn_rcp_f32(float __x) {
return __builtin_amdgcn_rcpf(__x);
}
__device__ inline __attribute__((const)) float
__llvm_amdgcn_rsq_f32(float __x) {
return __builtin_amdgcn_rsqf(__x);
}
__device__ inline __attribute__((const)) float
__llvm_amdgcn_sin_f32(float __x) {
return __builtin_amdgcn_sinf(__x);
}
// END INTRINSICS
// END FLOAT
@ -269,10 +277,14 @@ __device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
double);
__device__ __attribute__((const)) double
__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
__device__ __attribute__((const)) double
__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
__device__ inline __attribute__((const)) double
__llvm_amdgcn_rcp_f64(double __x) {
return __builtin_amdgcn_rcp(__x);
}
__device__ inline __attribute__((const)) double
__llvm_amdgcn_rsq_f64(double __x) {
return __builtin_amdgcn_rsq(__x);
}
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);

View File

@ -13,11 +13,13 @@
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <algorithm>
#endif
#include <limits.h>
#include <stdint.h>
#endif // __HIPCC_RTC__
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
@ -36,7 +38,7 @@ template<bool>
struct __compare_result{};
template<>
struct __compare_result<true> {
static const bool valid;
static const __device__ bool valid;
};
__DEVICE__
@ -1260,6 +1262,7 @@ float min(float __x, float __y) { return fminf(__x, __y); }
__DEVICE__
double min(double __x, double __y) { return fmin(__x, __y); }
#if !defined(__HIPCC_RTC__)
__host__ inline static int min(int __arg1, int __arg2) {
return std::min(__arg1, __arg2);
}
@ -1267,6 +1270,7 @@ __host__ inline static int min(int __arg1, int __arg2) {
__host__ inline static int max(int __arg1, int __arg2) {
return std::max(__arg1, __arg2);
}
#endif // __HIPCC_RTC__
#endif
#pragma pop_macro("__DEVICE__")

View File

@ -18,52 +18,107 @@
#if __HIP__
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
#ifdef __cplusplus
extern "C" {
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_pure_virtual(void) {
__builtin_trap();
}
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_deleted_virtual(void) {
__builtin_trap();
}
}
#endif //__cplusplus
#if !defined(__HIPCC_RTC__)
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#else
typedef __SIZE_TYPE__ size_t;
// Define macros which are needed to declare HIP device APIs without standard
// C/C++ headers. This is for readability, so that these APIs can be written
// the same way as in the non-hipRTC use case. These macros need to be popped
// so that they do not pollute users' namespace.
#pragma push_macro("NULL")
#pragma push_macro("uint32_t")
#pragma push_macro("uint64_t")
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#define NULL (void *)0
#define uint32_t __UINT32_TYPE__
#define uint64_t __UINT64_TYPE__
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INTMAX_MAX__
#endif // __HIPCC_RTC__
typedef __SIZE_TYPE__ __hip_size_t;
#ifdef __cplusplus
extern "C" {
#endif //__cplusplus
#if __HIP_ENABLE_DEVICE_MALLOC__
extern "C" __device__ void *__hip_malloc(size_t __size);
extern "C" __device__ void *__hip_free(void *__ptr);
static inline __device__ void *malloc(size_t __size) {
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return __hip_malloc(__size);
}
static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
#else
static inline __device__ void *malloc(size_t __size) {
__builtin_trap();
return nullptr;
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
return __hip_free(__ptr);
}
static inline __device__ void *free(void *__ptr) {
#else
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
__builtin_trap();
return nullptr;
return (void *)0;
}
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
__builtin_trap();
return (void *)0;
}
#endif
#ifdef __cplusplus
} // extern "C"
#endif //__cplusplus
#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
#else
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_hip_cmath.h>
#include <__clang_cuda_complex_builtins.h>
#include <algorithm>
#include <complex>
#include <new>
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif // __HIPCC_RTC__
#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
#if defined(__HIPCC_RTC__)
#pragma pop_macro("NULL")
#pragma pop_macro("uint32_t")
#pragma pop_macro("uint64_t")
#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
#endif // __HIPCC_RTC__
#endif // __HIP__
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
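A usage sketch (not part of the commit; the function name is hypothetical), assuming the program is built with __HIP_ENABLE_DEVICE_MALLOC__ so the weak malloc/free shims above forward to __hip_malloc/__hip_free:

__device__ int *device_scratch(__hip_size_t n) {
  int *p = (int *)malloc(n * sizeof(int)); // forwards to __hip_malloc
  // ... use p; the caller must free(p) on the device ...
  return p;
}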

lib/include/altivec.h vendored (1584 changes): file diff suppressed because it is too large.

View File

@ -15,8 +15,13 @@
#define __AMXINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@ -25,7 +30,7 @@
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configurations will result in #GP fault.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
@ -41,7 +46,7 @@ _tile_loadconfig(const void *__config) {
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
@ -55,7 +60,7 @@ _tile_storeconfig(void *__config) {
/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
@ -66,7 +71,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
@ -86,7 +91,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
@ -104,7 +109,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
@ -119,7 +124,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// Zero the tile specified by "tdest".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
@ -133,7 +138,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
@ -152,7 +157,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
@ -171,7 +176,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
@ -190,7 +195,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
@ -208,7 +213,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
@ -221,10 +226,12 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
/// The AMX tile register size can be configured; the maximum size is
/// 16x64 = 1024 bytes. Since there is no 2D type in LLVM IR, we use a vector
/// type to represent the 2D tile, and its fixed size is the maximum AMX tile
/// register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
@ -232,12 +239,43 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddt164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
@ -245,34 +283,211 @@ _tile_stored_internal(unsigned short m, unsigned short n, void *base,
(__SIZE_TYPE__)(stride), tile);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
/// This struct packs the shape and tile data together for the user. We suggest
/// initializing the struct as early as possible, because the compiler depends
/// on the shape information to do the tile configuration. Constant values are
/// preferred, as they enable compiler optimization.
typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
__tile1024i src2) {
dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
src1.tile, src2.tile);
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
/// The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
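A minimal usage sketch (not part of the commit; the function name and tile shapes are hypothetical), assuming an AMX-enabled compile (-mamx-tile -mamx-int8) and that the tile palette was configured beforehand via _tile_loadconfig():

// Multiply-accumulate one pair of int8 tiles into an int32 accumulator
// tile using the __tile1024i API declared above.
static void dot_tile_step(const void *a, const void *b, void *c,
                          __SIZE_TYPE__ stride) {
  __tile1024i ta = {16, 64}; // rows, colsb; constants help the compiler
  __tile1024i tb = {16, 64}; // configure the tiles
  __tile1024i tc = {16, 64};
  __tile_zero(&tc);             // TILEZERO: start the accumulator at zero
  __tile_loadd(&ta, a, stride); // TILELOADD
  __tile_loadd(&tb, b, stride);
  __tile_dpbssd(&tc, ta, tb);   // TDPBSSD: int8 x int8 -> int32 accumulate
  __tile_stored(c, stride, tc); // TILESTORED
}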

View File

@ -639,6 +639,49 @@ __jcvt(double __a) {
}
#endif
/* Armv8.5-A FP rounding intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint32zf(float __a) {
return __builtin_arm_frint32zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint32z(double __a) {
return __builtin_arm_frint32z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint64zf(float __a) {
return __builtin_arm_frint64zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint64z(double __a) {
return __builtin_arm_frint64z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint32xf(float __a) {
return __builtin_arm_frint32xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint32x(double __a) {
return __builtin_arm_frint32x(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint64xf(float __a) {
return __builtin_arm_frint64xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint64x(double __a) {
return __builtin_arm_frint64x(__a);
}
#endif
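A small usage sketch (not part of the commit; the function name is hypothetical), assuming a target that defines __ARM_FEATURE_FRINT:

/* Round to the nearest integral value representable as a 32-bit integer,
   using the current rounding mode (FRINT32X). */
double round_to_int32_range(double x) { return __frint32x(x); }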
/* Armv8.7-A load/store 64-byte intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
typedef struct {
@ -709,6 +752,18 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#endif /* __ARM_FEATURE_TME */
/* Armv8.5-A Random number generation intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__rndr(uint64_t *__p) {
return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__rndrrs(uint64_t *__p) {
return __builtin_arm_rndrrs(__p);
}
#endif
#if defined(__cplusplus)
}
#endif
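A usage sketch (not part of the commit; the function name is hypothetical), assuming a target that defines __ARM_FEATURE_RNG:

/* __rndrrs() reseeds and reads the hardware random number generator;
   a nonzero return value means generation failed and should be retried. */
static inline uint64_t hw_random(void) {
  uint64_t value;
  while (__rndrrs(&value) != 0)
    ;
  return value;
}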

lib/include/arm_neon.h vendored (1242 changes): file diff suppressed because it is too large.

lib/include/arm_sve.h vendored (17687 changes): file diff suppressed because it is too large.

View File

@ -9300,8 +9300,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
* computations. In vector-reduction arithmetic, the evaluation order is
* independent of the order of the input elements of V.
* For floating point types, we always assume the elements are reassociable even
* if -fast-math is off.
* For floating-point intrinsics:
* 1. When using fadd/fmul intrinsics, the order of operations within the
* vector is unspecified (associative math).
* 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
* produce unspecified results.
* We use a bisection method: at each step, we partition the vector from the
* previous step in half, and the operation is performed on its two halves.
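An illustrative consequence for users (a sketch, not part of the commit; the function name is hypothetical):

// The reduce-add below may combine elements in any order, e.g. pairwise
// halves rather than left to right, so the result can differ in the last
// ulp from a serial sum. Likewise, fmin/fmax reductions give unspecified
// results when the vector contains NaN or -0.0.
static inline float sum16(__m512 v) { return _mm512_reduce_add_ps(v); }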
@ -9524,75 +9527,49 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
return __builtin_ia32_reduce_umin_d512((__v16si)__V);
}
#define _mm512_mask_reduce_operator(op) \
__m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
__m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
__m256d __t3 = _mm256_##op(__t1, __t2); \
__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
__m128d __t6 = _mm_##op(__t4, __t5); \
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
__m128d __t8 = _mm_##op(__t6, __t7); \
return __t8[0]
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
_mm512_mask_reduce_operator(max_pd);
return __builtin_ia32_reduce_fmax_pd512(__V);
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_min_pd(__m512d __V) {
_mm512_mask_reduce_operator(min_pd);
return __builtin_ia32_reduce_fmin_pd512(__V);
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
__V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
_mm512_mask_reduce_operator(max_pd);
return __builtin_ia32_reduce_fmax_pd512(__V);
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
__V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
_mm512_mask_reduce_operator(min_pd);
return __builtin_ia32_reduce_fmin_pd512(__V);
}
#undef _mm512_mask_reduce_operator
#define _mm512_mask_reduce_operator(op) \
__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
__m256 __t3 = _mm256_##op(__t1, __t2); \
__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
__m128 __t6 = _mm_##op(__t4, __t5); \
__m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
__m128 __t8 = _mm_##op(__t6, __t7); \
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
__m128 __t10 = _mm_##op(__t8, __t9); \
return __t10[0]
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
_mm512_mask_reduce_operator(max_ps);
return __builtin_ia32_reduce_fmax_ps512(__V);
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ps(__m512 __V) {
_mm512_mask_reduce_operator(min_ps);
return __builtin_ia32_reduce_fmin_ps512(__V);
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
__V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
_mm512_mask_reduce_operator(max_ps);
return __builtin_ia32_reduce_fmax_ps512(__V);
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
__V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
_mm512_mask_reduce_operator(min_ps);
return __builtin_ia32_reduce_fmin_ps512(__V);
}
#undef _mm512_mask_reduce_operator
/// Moves the least significant 32 bits of a vector of [16 x i32] to a
/// 32-bit signed integer value.
@ -9611,6 +9588,169 @@ _mm512_cvtsi512_si32(__m512i __A) {
return __b[0];
}
/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
/// locations starting at location \a base_addr at packed 32-bit integer indices
/// stored in the lower half of \a vindex, scaled by \a scale, and stores them in dst.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_i32logather_pd(vindex, base_addr, scale) \
_mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale into dst using writemask
/// \a mask (elements are copied from \a src when the corresponding mask bit is
/// not set).
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ELSE
/// dst[i+63:i] := src[i+63:i]
/// FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
_mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
(base_addr), (scale))
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
_mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst using writemask \a mask (elements
/// are copied from \a src when the corresponding mask bit is not set).
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ELSE
/// dst[i+63:i] := src[i+63:i]
/// FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
_mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
(base_addr), (scale))
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// and to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endoperation
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
_mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale. Only those elements
/// whose corresponding mask bit is set in writemask \a mask are written to
/// memory.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// FI
/// ENDFOR
/// \endoperation
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
_mm512_mask_i32scatter_pd((base_addr), (mask), \
_mm512_castsi512_si256(vindex), (v1), (scale))
/// Stores the 8 packed 64-bit integer elements located in \a v1 to memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// ENDFOR
/// \endoperation
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
_mm512_i32scatter_epi64((base_addr), \
_mm512_castsi512_si256(vindex), (v1), (scale))
/// Stores the 8 packed 64-bit integer elements located in \a v1 to memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in \a vindex scaled by \a scale, using writemask \a mask
/// (elements whose corresponding mask bit is not set are not written to memory).
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// FI
/// ENDFOR
/// \endoperation
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
_mm512_mask_i32scatter_epi64((base_addr), (mask), \
_mm512_castsi512_si256(vindex), (v1), (scale))
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS
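A usage sketch of the new convenience macros (not part of the commit; the function name is hypothetical), assuming an AVX-512F compile:

// Gather eight doubles through 32-bit indices held in the low 256 bits of
// a 512-bit index vector, then scatter them back out. The scale of 8
// matches sizeof(double).
static inline void permute_doubles(const double *src, double *dst,
                                   __m512i vindex) {
  __m512d v = _mm512_i32logather_pd(vindex, src, 8);
  _mm512_i32loscatter_pd(dst, vindex, v, 8);
}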

lib/include/builtins.h vendored (new file, 16 lines)

View File

@ -0,0 +1,16 @@
/*===---- builtins.h - Standard header for extra builtins -----------------===*\
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
/// Some legacy compilers have builtin definitions in a file named builtins.h.
/// This header file has been added to allow compatibility with code that was
/// written for those compilers. Code may include this file, so an empty file
/// with this name is provided here to avoid an error.
#ifndef __BUILTINS_H
#define __BUILTINS_H
#endif /* __BUILTINS_H */

View File

@ -72,8 +72,16 @@
#define _GLIBCXX_USE_C99_COMPLEX 0
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
// Work around a compatibility issue with libstdc++ 11.1.0
// https://bugs.llvm.org/show_bug.cgi?id=50383
#pragma push_macro("__failed_assertion")
#if _GLIBCXX_RELEASE == 11
#define __failed_assertion __cuda_failed_assertion
#endif
#include_next <complex>
#pragma pop_macro("__failed_assertion")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")

View File

@ -0,0 +1,298 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_
#define _HEXAGON_CIRC_BREV_INTRINSICS_H_ 1
#include <hexagon_protos.h>
#include <stdint.h>
/* Circular Load */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_D(Word64 dst, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_D(dest,ptr,incr,bufsize,K) \
{ ptr = (int64_t *) HEXAGON_circ_ldd (ptr, &(dest), ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_W(Word32 dst, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_W(dest,ptr,incr,bufsize,K) \
{ ptr = (int *) HEXAGON_circ_ldw (ptr, &(dest), (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_H(Word16 dst, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_H(dest,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_ldh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_UH( UWord16 dst, UWord16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_UH(dest,ptr,incr,bufsize,K) \
{ ptr = (uint16_t *) HEXAGON_circ_lduh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_B(Word8 dst, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_B(dest,ptr,incr,bufsize,K) \
{ ptr = (int8_t *) HEXAGON_circ_ldb (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_UB(dest,ptr,incr,bufsize,K) \
{ ptr = (uint8_t *) HEXAGON_circ_ldub (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
/* Circular Store */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_D(Word64 *src, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_D(src,ptr,incr,bufsize,K) \
{ ptr = (int64_t *) HEXAGON_circ_std (ptr, src, ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_W(Word32 *src, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_W(src,ptr,incr,bufsize,K) \
{ ptr = (int *) HEXAGON_circ_stw (ptr, src, (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_HL(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_HL(src,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_sth (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_HH(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_HH(src,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_sthhi (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_B(Word8 *src, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_B(src,ptr,incr,bufsize,K) \
{ ptr = (int8_t *) HEXAGON_circ_stb (ptr, src, ((((K)-2)<<24)|(bufsize)), incr); }
/* Bit Reverse Load */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_D(Word64 dst, Word64 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_D(dest,ptr,log2bufsize) \
{ ptr = (int64_t *) HEXAGON_brev_ldd (ptr, &(dest), (1<<(16-((log2bufsize) + 3)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_W(Word32 dst, Word32 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_W(dest,ptr,log2bufsize) \
{ ptr = (int *) HEXAGON_brev_ldw (ptr, &(dest), (1<<(16-((log2bufsize) + 2)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_H(Word16 dst, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_H(dest,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_ldh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_UH(UWord16 dst, UWord16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_UH(dest,ptr,log2bufsize) \
{ ptr = (uint16_t *) HEXAGON_brev_lduh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_B(Word8 dst, Word8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_B(dest,ptr,log2bufsize) \
{ ptr = (int8_t *) HEXAGON_brev_ldb (ptr, &(dest), (1<<(16-((log2bufsize))))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_UB(dest,ptr,log2bufsize) \
{ ptr = (uint8_t *) HEXAGON_brev_ldub (ptr, &(dest), (1<<(16-((log2bufsize))))); }
/* Bit Reverse Store */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_D(Word64 *src, Word64 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_D(src,ptr,log2bufsize) \
{ ptr = (int64_t *) HEXAGON_brev_std (ptr, src, (1<<(16-((log2bufsize) + 3)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_W(Word32 *src, Word32 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_W(src,ptr,log2bufsize) \
{ ptr = (int *) HEXAGON_brev_stw (ptr, src, (1<<(16-((log2bufsize) + 2)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_HL(Word16 *src, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_HL(src,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_sth (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_HH(Word16 *src, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_HH(src,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_sthhi (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_B(Word8 *src, Word8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_B(src,ptr,log2bufsize) \
{ ptr = (int8_t *) HEXAGON_brev_stb (ptr, src, (1<<(16-((log2bufsize))))); }
#define HEXAGON_circ_ldd __builtin_circ_ldd
#define HEXAGON_circ_ldw __builtin_circ_ldw
#define HEXAGON_circ_ldh __builtin_circ_ldh
#define HEXAGON_circ_lduh __builtin_circ_lduh
#define HEXAGON_circ_ldb __builtin_circ_ldb
#define HEXAGON_circ_ldub __builtin_circ_ldub
#define HEXAGON_circ_std __builtin_circ_std
#define HEXAGON_circ_stw __builtin_circ_stw
#define HEXAGON_circ_sth __builtin_circ_sth
#define HEXAGON_circ_sthhi __builtin_circ_sthhi
#define HEXAGON_circ_stb __builtin_circ_stb
#define HEXAGON_brev_ldd __builtin_brev_ldd
#define HEXAGON_brev_ldw __builtin_brev_ldw
#define HEXAGON_brev_ldh __builtin_brev_ldh
#define HEXAGON_brev_lduh __builtin_brev_lduh
#define HEXAGON_brev_ldb __builtin_brev_ldb
#define HEXAGON_brev_ldub __builtin_brev_ldub
#define HEXAGON_brev_std __builtin_brev_std
#define HEXAGON_brev_stw __builtin_brev_stw
#define HEXAGON_brev_sth __builtin_brev_sth
#define HEXAGON_brev_sthhi __builtin_brev_sthhi
#define HEXAGON_brev_stb __builtin_brev_stb
#ifdef __HVX__
/* ==========================================================================
Assembly Syntax: if (Qt) vmem(Rt+#0) = Vs
C Intrinsic Prototype: void Q6_vmaskedstoreq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstoreq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstoreq)
/* ==========================================================================
Assembly Syntax: if (!Qt) vmem(Rt+#0) = Vs
C Intrinsic Prototype: void Q6_vmaskedstorenq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorenq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorenq)
/* ==========================================================================
Assembly Syntax: if (Qt) vmem(Rt+#0):nt = Vs
C Intrinsic Prototype: void Q6_vmaskedstorentq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorentq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentq)
/* ==========================================================================
Assembly Syntax: if (!Qt) vmem(Rt+#0):nt = Vs
C Intrinsic Prototype: void Q6_vmaskedstorentnq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorentnq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentnq)
#endif
#endif /* #ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_ */
#ifdef __NOT_DEFINED__
/*** comment block template ***/
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: ReturnType Intrinsic(ParamType Rs, ParamType Rt)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#endif /*** __NOT_DEFINED__ ***/
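/* A minimal usage sketch of the circular-load macros above, assuming a
   Hexagon target. The buffer length (8 halfwords) and the K value are
   illustrative only; bufsize is passed in halfwords (the modifier shifts
   it left by 1), and the exact K encoding is described in the Hexagon
   PRM. */
static int16_t ring[8];
static int sum64(void)
{
    int16_t *p = ring;   /* must point into the circular region */
    int16_t v;
    int sum = 0;
    for (int i = 0; i < 64; ++i) {
        /* v = *p, then p post-increments by one halfword, wrapping at
           the end of the 8-element buffer. */
        Q6_circ_load_update_H(v, p, 1, 8, 4);
        sum += v;
    }
    return sum;
}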

8450
lib/include/hexagon_protos.h vendored Normal file

File diff suppressed because it is too large

2653
lib/include/hexagon_types.h vendored Normal file

File diff suppressed because it is too large

4392
lib/include/hvx_hexagon_protos.h vendored Normal file

File diff suppressed because it is too large

View File

@ -72,11 +72,6 @@
#include <f16cintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
#endif
/* No feature check desired due to internal checks */
#include <bmiintrin.h>
@ -230,6 +225,11 @@
#include <pkuintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__VAES__)
#include <vaesintrin.h>

49
lib/include/intrin.h vendored
View File

@ -451,24 +451,47 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
unsigned char const *__src,
size_t __n) {
__asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
: : "memory");
#if defined(__x86_64__)
__asm__ __volatile__("rep movsb"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
#endif
}
static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
unsigned long const *__src,
size_t __n) {
#if defined(__x86_64__)
__asm__ __volatile__("rep movsl"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
#endif
}
static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
unsigned short const *__src,
size_t __n) {
#if defined(__x86_64__)
__asm__ __volatile__("rep movsw"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
#endif
}
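/* The i386 branches above avoid the "S" (ESI) register constraint, since
   ESI can be reserved in some 32-bit configurations, and instead swap it
   in and out around the string instruction by hand. Usage is unchanged;
   a minimal hedged sketch (the function name is illustrative): */
static void copy_dwords(unsigned long *dst, const unsigned long *src) {
  __movsd(dst, src, 16); /* rep movsl with ECX = 16 */
}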
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
unsigned long __x,
@ -507,16 +530,26 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|* Misc
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
#if defined(__i386__)
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" \
: "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("xchgq %%rbx,%q1\n" \
"cpuid\n" \
"xchgq %%rbx,%q1" \
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
__asm__("cpuid"
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
: "a"(__level), "c"(0));
__cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
}
static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
int __ecx) {
__asm__("cpuid"
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
: "a"(__level), "c"(__ecx));
__cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
}
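/* Routing __cpuid and __cpuidex through __cpuid_count means both now
   preserve RBX on x86-64, where it serves as the base register. A hedged
   sketch reading the 12-byte vendor string from leaf 0 (the helper name
   is ours, not part of this header): */
static void cpu_vendor(char out[13]) {
  int info[4];
  __cpuid(info, 0);                       /* vendor id in EBX,EDX,ECX */
  __builtin_memcpy(out + 0, &info[1], 4); /* EBX */
  __builtin_memcpy(out + 4, &info[3], 4); /* EDX */
  __builtin_memcpy(out + 8, &info[2], 4); /* ECX */
  out[12] = '\0';                         /* e.g. "GenuineIntel" */
}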
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");

View File

@ -230,10 +230,12 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
@ -267,10 +269,12 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
@ -304,10 +308,12 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
@ -354,10 +360,16 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
@ -394,10 +406,16 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
@ -434,10 +452,16 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
@ -474,10 +498,16 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
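/// The added pseudocode lines document that a failed unwrap now zeroes
/// the output as well as setting ZF, rather than leaving it unspecified.
/// A hedged usage sketch; `h` is assumed to be a handle produced earlier
/// by _mm_encodekey128_u32, and the error path is illustrative:
///
///   __m128i ct;
///   if (_mm_aesenc128kl_u8(&ct, pt, h)) {
///     /* ZF = 1: illegal or inauthentic handle; ct has been zeroed. */
///     return -1;
///   }
///   /* ZF = 0: ct = AES128Encrypt(pt, unwrapped key). */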

View File

@ -21,9 +21,37 @@
#define cl_khr_subgroup_shuffle 1
#define cl_khr_subgroup_shuffle_relative 1
#define cl_khr_subgroup_clustered_reduce 1
#define cl_khr_extended_bit_ops 1
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
#endif // defined(__SPIR__)
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// Define feature macros for OpenCL C 2.0
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
#define __opencl_c_pipes 1
#define __opencl_c_generic_address_space 1
#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_acq_rel 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_device_enqueue 1
#define __opencl_c_read_write_images 1
#define __opencl_c_program_scope_global_variables 1
#define __opencl_c_images 1
#endif
// Define header-only feature macros for OpenCL C 3.0.
#if (__OPENCL_C_VERSION__ == 300)
// For the SPIR target all features are supported.
#if defined(__SPIR__)
#define __opencl_c_atomic_scope_all_devices 1
#endif // defined(__SPIR__)
#endif // (__OPENCL_C_VERSION__ == 300)
// built-in scalar data types:
/**
@ -141,7 +169,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#if defined(__OPENCL_CPP_VERSION__)
#define NULL nullptr
#elif defined(__OPENCL_C_VERSION__)
#define NULL ((void*)0)
#endif
@ -297,7 +327,12 @@ typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
#if defined(__opencl_c_atomic_scope_all_devices)
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
memory_scope_all_devices = memory_scope_all_svm_devices,
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
#endif // defined(__opencl_c_atomic_scope_all_devices)
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
@ -322,7 +357,9 @@ typedef enum memory_order
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
#if defined(__opencl_c_atomic_order_seq_cst)
memory_order_seq_cst = __ATOMIC_SEQ_CST
#endif
} memory_order;
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
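// Because seq_cst ordering is now feature-gated, portable kernels should
// test the macros before relying on it. A minimal hedged sketch (kernel
// name and flag layout are illustrative):
#if defined(__opencl_c_atomic_order_seq_cst) && \
    defined(__opencl_c_atomic_scope_device)
__kernel void publish(volatile __global atomic_int *flag) {
  atomic_store_explicit(flag, 1, memory_order_seq_cst,
                        memory_scope_device);
}
#endif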
@ -445,8 +482,113 @@ typedef struct {
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
* Reinterprets a data type as another data type of the same size
*/
#define as_char(x) __builtin_astype((x), char)
#define as_char2(x) __builtin_astype((x), char2)
#define as_char3(x) __builtin_astype((x), char3)
#define as_char4(x) __builtin_astype((x), char4)
#define as_char8(x) __builtin_astype((x), char8)
#define as_char16(x) __builtin_astype((x), char16)
#define as_uchar(x) __builtin_astype((x), uchar)
#define as_uchar2(x) __builtin_astype((x), uchar2)
#define as_uchar3(x) __builtin_astype((x), uchar3)
#define as_uchar4(x) __builtin_astype((x), uchar4)
#define as_uchar8(x) __builtin_astype((x), uchar8)
#define as_uchar16(x) __builtin_astype((x), uchar16)
#define as_short(x) __builtin_astype((x), short)
#define as_short2(x) __builtin_astype((x), short2)
#define as_short3(x) __builtin_astype((x), short3)
#define as_short4(x) __builtin_astype((x), short4)
#define as_short8(x) __builtin_astype((x), short8)
#define as_short16(x) __builtin_astype((x), short16)
#define as_ushort(x) __builtin_astype((x), ushort)
#define as_ushort2(x) __builtin_astype((x), ushort2)
#define as_ushort3(x) __builtin_astype((x), ushort3)
#define as_ushort4(x) __builtin_astype((x), ushort4)
#define as_ushort8(x) __builtin_astype((x), ushort8)
#define as_ushort16(x) __builtin_astype((x), ushort16)
#define as_int(x) __builtin_astype((x), int)
#define as_int2(x) __builtin_astype((x), int2)
#define as_int3(x) __builtin_astype((x), int3)
#define as_int4(x) __builtin_astype((x), int4)
#define as_int8(x) __builtin_astype((x), int8)
#define as_int16(x) __builtin_astype((x), int16)
#define as_uint(x) __builtin_astype((x), uint)
#define as_uint2(x) __builtin_astype((x), uint2)
#define as_uint3(x) __builtin_astype((x), uint3)
#define as_uint4(x) __builtin_astype((x), uint4)
#define as_uint8(x) __builtin_astype((x), uint8)
#define as_uint16(x) __builtin_astype((x), uint16)
#define as_long(x) __builtin_astype((x), long)
#define as_long2(x) __builtin_astype((x), long2)
#define as_long3(x) __builtin_astype((x), long3)
#define as_long4(x) __builtin_astype((x), long4)
#define as_long8(x) __builtin_astype((x), long8)
#define as_long16(x) __builtin_astype((x), long16)
#define as_ulong(x) __builtin_astype((x), ulong)
#define as_ulong2(x) __builtin_astype((x), ulong2)
#define as_ulong3(x) __builtin_astype((x), ulong3)
#define as_ulong4(x) __builtin_astype((x), ulong4)
#define as_ulong8(x) __builtin_astype((x), ulong8)
#define as_ulong16(x) __builtin_astype((x), ulong16)
#define as_float(x) __builtin_astype((x), float)
#define as_float2(x) __builtin_astype((x), float2)
#define as_float3(x) __builtin_astype((x), float3)
#define as_float4(x) __builtin_astype((x), float4)
#define as_float8(x) __builtin_astype((x), float8)
#define as_float16(x) __builtin_astype((x), float16)
#ifdef cl_khr_fp64
#define as_double(x) __builtin_astype((x), double)
#define as_double2(x) __builtin_astype((x), double2)
#define as_double3(x) __builtin_astype((x), double3)
#define as_double4(x) __builtin_astype((x), double4)
#define as_double8(x) __builtin_astype((x), double8)
#define as_double16(x) __builtin_astype((x), double16)
#endif // cl_khr_fp64
#ifdef cl_khr_fp16
#define as_half(x) __builtin_astype((x), half)
#define as_half2(x) __builtin_astype((x), half2)
#define as_half3(x) __builtin_astype((x), half3)
#define as_half4(x) __builtin_astype((x), half4)
#define as_half8(x) __builtin_astype((x), half8)
#define as_half16(x) __builtin_astype((x), half16)
#endif // cl_khr_fp16
#define as_size_t(x) __builtin_astype((x), size_t)
#define as_ptrdiff_t(x) __builtin_astype((x), ptrdiff_t)
#define as_intptr_t(x) __builtin_astype((x), intptr_t)
#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
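// The as_ macros above reinterpret bits at identical size; they are not
// value conversions. A small hedged sketch (kernel name illustrative):
__kernel void sign_bit(__global uint *out) {
  float f = -0.0f;
  out[0] = as_uint(f);      // 0x80000000u: same 32 bits, new type
  float4 v = (float4)(1.0f);
  uint4 u = as_uint4(v);    // element size and count must match
  out[1] = u.x;             // 0x3F800000u
}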
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
#define __kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
#define kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
#endif
#ifdef cl_intel_device_side_avc_motion_estimation
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
@ -580,7 +722,6 @@ typedef struct {
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
#endif // cl_intel_device_side_avc_motion_estimation
// Disable any extensions we may have enabled previously.

864
lib/include/opencl-c.h vendored

File diff suppressed because it is too large

View File

@ -39,4 +39,46 @@ extern "C" {
#pragma omp end declare variant
// Ensure we make `_ZdlPv`, a.k.a. `operator delete(void*)`, available
// without the need to `include <new>` in C++ mode.
#ifdef __cplusplus
// We require malloc/free.
#include <cstdlib>
#pragma push_macro("OPENMP_NOEXCEPT")
#if __cplusplus >= 201103L
#define OPENMP_NOEXCEPT noexcept
#else
#define OPENMP_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0)
size = 1;
return ::malloc(size);
}
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
inline void operator delete(void *ptr)OPENMP_NOEXCEPT { ::free(ptr); }
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
#endif
#pragma pop_macro("OPENMP_NOEXCEPT")
#endif
#endif
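// With these overrides visible, C++ device code can allocate inside a
// target region without including <new>. A hedged sketch (the function
// names are illustrative):
#pragma omp declare target
inline int *make_counter() { return new int(0); } // lowers to ::malloc
inline void drop(int *p) { delete p; }            // lowers to ::free
#pragma omp end declare target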

View File

@ -17,7 +17,6 @@
// We require std::math functions in the complex builtins below.
#include <cmath>
#define __CUDA__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
@ -26,9 +25,6 @@
// Grab the host header too.
#include_next <complex>
#ifdef __cplusplus
// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
// after including <cmath> above. Since the complex header we use is a
// simplified version of the libc++, we don't need it in this case. If we
@ -48,5 +44,3 @@
#pragma omp end declare variant
#endif
#endif

View File

@ -17,7 +17,6 @@
// We require math functions in the complex builtins below.
#include <math.h>
#define __CUDA__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__

View File

@ -9,6 +9,8 @@
#ifndef __CLANG_OPENMP_WRAPPERS_NEW
#define __CLANG_OPENMP_WRAPPERS_NEW
// We need the system <new> for std::nothrow_t. The new/delete operators
// that do not use nothrow_t are provided without the <new> header.
#include_next <new>
#if defined(__NVPTX__) && defined(_OPENMP)
@ -22,48 +24,24 @@
#define OPENMP_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0)
size = 1;
return ::malloc(size);
}
inline void *operator new(__SIZE_TYPE__ size,
const std::nothrow_t &) OPENMP_NOEXCEPT {
return ::operator new(size);
}
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) {
return ::operator new(size);
}
inline void operator delete(void *ptr)OPENMP_NOEXCEPT {
if (ptr)
::free(ptr);
}
inline void operator delete(void *ptr, const std::nothrow_t &)OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr,
const std::nothrow_t &) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
#endif
#pragma pop_macro("OPENMP_NOEXCEPT")
#endif
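// Only the std::nothrow_t forms remain here; the plain forms now come
// from the device-functions wrapper shown earlier. A hedged sketch of a
// device-side allocation with a failure check (name illustrative):
#pragma omp declare target
inline int *try_alloc(__SIZE_TYPE__ n) {
  return new (std::nothrow) int[n]; // nullptr if device malloc fails
}
#pragma omp end declare target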

View File

@ -28,7 +28,7 @@
Most SSE scalar float intrinsic operations can be performed more
efficiently as C language float scalar operations or optimized to
use vector SIMD operations. We recommend this for new applications. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef _XMMINTRIN_H_INCLUDED
@ -62,14 +62,13 @@
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
typedef vector float __m128 __attribute__((__may_alias__));
/* Unaligned version of the same type. */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
__aligned__ (1)));
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
/* Internal data types for implementing the intrinsics. */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef vector float __v4sf;
/* Create an undefined vector. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

123865
lib/include/riscv_vector.h vendored Normal file

File diff suppressed because it is too large

View File

@ -20,6 +20,13 @@
#ifdef __x86_64__
struct __uintr_frame
{
unsigned long long rip;
unsigned long long rflags;
unsigned long long rsp;
};
/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
/// user interrupt cannot be delivered on the instruction boundary following
/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in

View File

@ -28,13 +28,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
{
@ -42,13 +35,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
{
@ -56,13 +42,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
{
@ -70,13 +49,35 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
#ifdef __AVX512FINTRIN_H
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
(__v8di) __B);
}
#endif // __AVX512FINTRIN_H
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
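/* After this move the 256-bit VAES intrinsics remain available on their
   own, while the 512-bit forms additionally require the AVX-512 F header
   to have been included first. A hedged sketch of the 256-bit path
   (compile with -mvaes; the function name is illustrative): */
static __inline__ __m256i aes_round_x2(__m256i state, __m256i rk) {
  /* One AES encryption round on each 128-bit lane independently. */
  return _mm256_aesenc_epi128(state, rk);
}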

View File

@ -1016,64 +1016,84 @@ vec_scatter_element(__vector double __vec,
static inline __ATTRS_o_ai __vector signed char
vec_xl(long __offset, const signed char *__ptr) {
return *(const __vector signed char *)
((const char *)__ptr + __offset);
__vector signed char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed char));
return V;
}
static inline __ATTRS_o_ai __vector unsigned char
vec_xl(long __offset, const unsigned char *__ptr) {
return *(const __vector unsigned char *)
((const char *)__ptr + __offset);
__vector unsigned char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned char));
return V;
}
static inline __ATTRS_o_ai __vector signed short
vec_xl(long __offset, const signed short *__ptr) {
return *(const __vector signed short *)
((const char *)__ptr + __offset);
__vector signed short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed short));
return V;
}
static inline __ATTRS_o_ai __vector unsigned short
vec_xl(long __offset, const unsigned short *__ptr) {
return *(const __vector unsigned short *)
((const char *)__ptr + __offset);
__vector unsigned short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned short));
return V;
}
static inline __ATTRS_o_ai __vector signed int
vec_xl(long __offset, const signed int *__ptr) {
return *(const __vector signed int *)
((const char *)__ptr + __offset);
__vector signed int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed int));
return V;
}
static inline __ATTRS_o_ai __vector unsigned int
vec_xl(long __offset, const unsigned int *__ptr) {
return *(const __vector unsigned int *)
((const char *)__ptr + __offset);
__vector unsigned int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned int));
return V;
}
static inline __ATTRS_o_ai __vector signed long long
vec_xl(long __offset, const signed long long *__ptr) {
return *(const __vector signed long long *)
((const char *)__ptr + __offset);
__vector signed long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed long long));
return V;
}
static inline __ATTRS_o_ai __vector unsigned long long
vec_xl(long __offset, const unsigned long long *__ptr) {
return *(const __vector unsigned long long *)
((const char *)__ptr + __offset);
__vector unsigned long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned long long));
return V;
}
#if __ARCH__ >= 12
static inline __ATTRS_o_ai __vector float
vec_xl(long __offset, const float *__ptr) {
return *(const __vector float *)
((const char *)__ptr + __offset);
__vector float V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector float));
return V;
}
#endif
static inline __ATTRS_o_ai __vector double
vec_xl(long __offset, const double *__ptr) {
return *(const __vector double *)
((const char *)__ptr + __offset);
__vector double V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector double));
return V;
}
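/* The __builtin_memcpy bodies matter because dereferencing a
   (const __vector T *) would let the compiler assume full vector
   alignment, while vec_xl/vec_xst accept arbitrary byte offsets. A
   hedged sketch of an unaligned round trip (name illustrative): */
static inline void double_bytes(unsigned char *buf) {
  __vector unsigned char v = vec_xl(1, buf); /* buf+1 may be unaligned */
  v = v + v;
  vec_xst(v, 1, buf);
}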
/*-- vec_xld2 ---------------------------------------------------------------*/
@ -1081,64 +1101,82 @@ vec_xl(long __offset, const double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed char
vec_xld2(long __offset, const signed char *__ptr) {
return *(const __vector signed char *)
((const char *)__ptr + __offset);
__vector signed char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned char
vec_xld2(long __offset, const unsigned char *__ptr) {
return *(const __vector unsigned char *)
((const char *)__ptr + __offset);
__vector unsigned char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed short
vec_xld2(long __offset, const signed short *__ptr) {
return *(const __vector signed short *)
((const char *)__ptr + __offset);
__vector signed short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned short
vec_xld2(long __offset, const unsigned short *__ptr) {
return *(const __vector unsigned short *)
((const char *)__ptr + __offset);
__vector unsigned short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed int
vec_xld2(long __offset, const signed int *__ptr) {
return *(const __vector signed int *)
((const char *)__ptr + __offset);
__vector signed int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed int));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned int
vec_xld2(long __offset, const unsigned int *__ptr) {
return *(const __vector unsigned int *)
((const char *)__ptr + __offset);
__vector unsigned int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned int));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed long long
vec_xld2(long __offset, const signed long long *__ptr) {
return *(const __vector signed long long *)
((const char *)__ptr + __offset);
__vector signed long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed long long));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned long long
vec_xld2(long __offset, const unsigned long long *__ptr) {
return *(const __vector unsigned long long *)
((const char *)__ptr + __offset);
__vector unsigned long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned long long));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector double
vec_xld2(long __offset, const double *__ptr) {
return *(const __vector double *)
((const char *)__ptr + __offset);
__vector double V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector double));
return V;
}
/*-- vec_xlw4 ---------------------------------------------------------------*/
@ -1146,99 +1184,128 @@ vec_xld2(long __offset, const double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed char
vec_xlw4(long __offset, const signed char *__ptr) {
return *(const __vector signed char *)
((const char *)__ptr + __offset);
__vector signed char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned char
vec_xlw4(long __offset, const unsigned char *__ptr) {
return *(const __vector unsigned char *)
((const char *)__ptr + __offset);
__vector unsigned char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed short
vec_xlw4(long __offset, const signed short *__ptr) {
return *(const __vector signed short *)
((const char *)__ptr + __offset);
__vector signed short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned short
vec_xlw4(long __offset, const unsigned short *__ptr) {
return *(const __vector unsigned short *)
((const char *)__ptr + __offset);
__vector unsigned short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed int
vec_xlw4(long __offset, const signed int *__ptr) {
return *(const __vector signed int *)
((const char *)__ptr + __offset);
__vector signed int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed int));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned int
vec_xlw4(long __offset, const unsigned int *__ptr) {
return *(const __vector unsigned int *)
((const char *)__ptr + __offset);
__vector unsigned int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned int));
return V;
}
/*-- vec_xst ----------------------------------------------------------------*/
static inline __ATTRS_o_ai void
vec_xst(__vector signed char __vec, long __offset, signed char *__ptr) {
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
__vector signed char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed char));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
__vector unsigned char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned char));
}
static inline __ATTRS_o_ai void
vec_xst(__vector signed short __vec, long __offset, signed short *__ptr) {
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
__vector signed short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed short));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
__vector unsigned short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned short));
}
static inline __ATTRS_o_ai void
vec_xst(__vector signed int __vec, long __offset, signed int *__ptr) {
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
__vector signed int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
__vector unsigned int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned int));
}
static inline __ATTRS_o_ai void
vec_xst(__vector signed long long __vec, long __offset,
signed long long *__ptr) {
*(__vector signed long long *)((char *)__ptr + __offset) = __vec;
signed long long *__ptr) {
__vector signed long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed long long));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned long long __vec, long __offset,
unsigned long long *__ptr) {
*(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
unsigned long long *__ptr) {
__vector unsigned long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned long long));
}
#if __ARCH__ >= 12
static inline __ATTRS_o_ai void
vec_xst(__vector float __vec, long __offset, float *__ptr) {
*(__vector float *)((char *)__ptr + __offset) = __vec;
__vector float V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector float));
}
#endif
static inline __ATTRS_o_ai void
vec_xst(__vector double __vec, long __offset, double *__ptr) {
*(__vector double *)((char *)__ptr + __offset) = __vec;
__vector double V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
}
/*-- vec_xstd2 --------------------------------------------------------------*/
@ -1246,57 +1313,73 @@ vec_xst(__vector double __vec, long __offset, double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed char __vec, long __offset, signed char *__ptr) {
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
__vector signed char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
__vector unsigned char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed short __vec, long __offset, signed short *__ptr) {
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
__vector signed short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
__vector unsigned short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed int __vec, long __offset, signed int *__ptr) {
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
__vector signed int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
__vector unsigned int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned int));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed long long __vec, long __offset,
signed long long *__ptr) {
*(__vector signed long long *)((char *)__ptr + __offset) = __vec;
__vector signed long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed long long));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned long long __vec, long __offset,
unsigned long long *__ptr) {
*(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
__vector unsigned long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned long long));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
*(__vector double *)((char *)__ptr + __offset) = __vec;
__vector double V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
}
/*-- vec_xstw4 --------------------------------------------------------------*/
@ -1304,37 +1387,48 @@ vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector signed char __vec, long __offset, signed char *__ptr) {
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
__vector signed char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
__vector unsigned char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector signed short __vec, long __offset, signed short *__ptr) {
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
__vector signed short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
__vector unsigned short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector signed int __vec, long __offset, signed int *__ptr) {
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
__vector signed int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
__vector unsigned int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned int));
}
/*-- vec_load_bndry ---------------------------------------------------------*/
@ -9259,6 +9353,41 @@ vec_fp_test_data_class(__vector double __a, int __b, int *__c)
__VEC_CLASS_FP_ZERO | \
__VEC_CLASS_FP_INFINITY)
/*-- vec_extend_to_fp32_hi --------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_extend_to_fp32_hi(X, W) \
((__vector float)__builtin_s390_vclfnhs((X), (W)))
#endif
/*-- vec_extend_to_fp32_lo --------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_extend_to_fp32_lo(X, W) \
((__vector float)__builtin_s390_vclfnls((X), (W)))
#endif
/*-- vec_round_from_fp32 ----------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_round_from_fp32(X, Y, W) \
((__vector unsigned short)__builtin_s390_vcrnfs((X), (Y), (W)))
#endif
/*-- vec_convert_to_fp16 ----------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_convert_to_fp16(X, W) \
((__vector unsigned short)__builtin_s390_vcfn((X), (W)))
#endif
/*-- vec_convert_from_fp16 --------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_convert_from_fp16(X, W) \
((__vector unsigned short)__builtin_s390_vcnf((X), (W)))
#endif
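/* A hedged sketch of the arch14 helpers above, widening the high halves
   of eight 16-bit NNP values to four floats; the second argument selects
   the 16-bit data class, and the 0 here is purely illustrative: */
#if __ARCH__ >= 14
static inline __vector float widen_hi(__vector unsigned short x) {
  return vec_extend_to_fp32_hi(x, 0);
}
#endif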
/*-- vec_cp_until_zero ------------------------------------------------------*/
static inline __ATTRS_o_ai __vector signed char

View File

@ -19,10 +19,12 @@
(__v4di)(__m256i)(B), \
(char)(I))
#ifdef __AVX512FINTRIN_H
#define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I))
#endif // __AVX512FINTRIN_H
#endif /* __VPCLMULQDQINTRIN_H */
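/* As in vaesintrin.h, only the 512-bit form is gated; the 256-bit
   carry-less multiply stays available with -mvpclmulqdq. A hedged
   sketch (function name illustrative): */
static __inline__ __m256i clmul_lo_lanes(__m256i a, __m256i b) {
  /* I = 0x00: low qword of each 128-bit lane of A times the low qword
     of the corresponding lane of B. */
  return _mm256_clmulepi64_epi128(a, b, 0x00);
}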

File diff suppressed because it is too large