update C language headers to LLVM 18

release/18.x branch, commit 78b99c73ee4b96fe9ce0e294d4632326afb2db42
2025-12-06 06:13:07 +00:00 · 2024-04-25 14:59:29 -07:00 · 2024-04-25 14:59:29 -07:00 · 70c85b1bf1
commit 70c85b1bf1
parent e4029b9943
103 changed files with 21299 additions and 2595 deletions
--- a/lib/include/__clang_cuda_device_functions.h
+++ b/lib/include/__clang_cuda_device_functions.h
@ -502,8 +502,8 @@ __DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); }
 __DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); }
 __DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); }
 __DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); }
-__DEVICE__ int __popc(int __a) { return __nv_popc(__a); }
-__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); }
+__DEVICE__ int __popc(unsigned int __a) { return __nv_popc(__a); }
+__DEVICE__ int __popcll(unsigned long long __a) { return __nv_popcll(__a); }
 __DEVICE__ float __powf(float __a, float __b) {
  return __nv_fast_powf(__a, __b);
 }
--- a/lib/include/__clang_cuda_libdevice_declares.h
+++ b/lib/include/__clang_cuda_libdevice_declares.h
@ -285,8 +285,8 @@ __DEVICE__ double __nv_normcdfinv(double __a);
 __DEVICE__ float __nv_normcdfinvf(float __a);
 __DEVICE__ float __nv_normf(int __a, const float *__b);
 __DEVICE__ double __nv_norm(int __a, const double *__b);
-__DEVICE__ int __nv_popc(int __a);
-__DEVICE__ int __nv_popcll(long long __a);
+__DEVICE__ int __nv_popc(unsigned int __a);
+__DEVICE__ int __nv_popcll(unsigned long long __a);
 __DEVICE__ double __nv_pow(double __a, double __b);
 __DEVICE__ float __nv_powf(float __a, float __b);
 __DEVICE__ double __nv_powi(double __a, int __b);
--- a/lib/include/__clang_cuda_math.h
+++ b/lib/include/__clang_cuda_math.h
@ -36,7 +36,7 @@
 // because the OpenMP overlay requires constexpr functions here but prior to
 // c++14 void return functions could not be constexpr.
 #pragma push_macro("__DEVICE_VOID__")
-#ifdef __OPENMP_NVPTX__ && defined(__cplusplus) && __cplusplus < 201402L
+#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
 #define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
 #else
 #define __DEVICE_VOID__ __DEVICE__
@ -45,9 +45,9 @@
 // libdevice provides fast low precision and slow full-recision implementations
 // for some functions. Which one gets selected depends on
 // __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
-// -ffast-math or -fcuda-approx-transcendentals are in effect.
+// -ffast-math or -fgpu-approx-transcendentals are in effect.
 #pragma push_macro("__FAST_OR_SLOW")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __FAST_OR_SLOW(fast, slow) fast
 #else
 #define __FAST_OR_SLOW(fast, slow) slow
--- a/lib/include/__clang_cuda_runtime_wrapper.h
+++ b/lib/include/__clang_cuda_runtime_wrapper.h
@ -196,12 +196,12 @@ inline __host__ double __signbitd(double x) {

 // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
-// sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
+// sin and exp.  This is controlled in clang by -fgpu-approx-transcendentals.
 //
 // device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
 // slow divides), so we need to scope our define carefully here.
 #pragma push_macro("__USE_FAST_MATH__")
-#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
 #define __USE_FAST_MATH__ 1
 #endif

--- a/lib/include/__clang_hip_math.h
+++ b/lib/include/__clang_hip_math.h
@ -14,9 +14,6 @@
 #endif

 #if !defined(__HIPCC_RTC__)
-#if defined(__cplusplus)
-#include <algorithm>
-#endif
 #include <limits.h>
 #include <stdint.h>
 #ifdef __OPENMP_AMDGCN__
@ -32,6 +29,17 @@
 #define __DEVICE__ static __device__ inline __attribute__((always_inline))
 #endif

+// Device library provides fast low precision and slow full-recision
+// implementations for some functions. Which one gets selected depends on
+// __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if
+// -ffast-math or -fgpu-approx-transcendentals are in effect.
+#pragma push_macro("__FAST_OR_SLOW")
+#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
+#define __FAST_OR_SLOW(fast, slow) fast
+#else
+#define __FAST_OR_SLOW(fast, slow) slow
+#endif
+
 // A few functions return bool type starting only in C++11.
 #pragma push_macro("__RETURN_TYPE")
 #ifdef __OPENMP_AMDGCN__
@ -139,21 +147,180 @@ uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
 }

 // BEGIN FLOAT
+
+// BEGIN INTRINSICS
+
+__DEVICE__
+float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
+
+__DEVICE__
+float __exp10f(float __x) {
+  const float __log2_10 = 0x1.a934f0p+1f;
+  return __builtin_amdgcn_exp2f(__log2_10 * __x);
+}
+
+__DEVICE__
+float __expf(float __x) {
+  const float __log2_e = 0x1.715476p+0;
+  return __builtin_amdgcn_exp2f(__log2_e * __x);
+}
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }
+__DEVICE__
+float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }
+__DEVICE__
+float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }
+__DEVICE__
+float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fadd_rn(float __x, float __y) { return __x + __y; }
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }
+__DEVICE__
+float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }
+__DEVICE__
+float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }
+__DEVICE__
+float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fdiv_rn(float __x, float __y) { return __x / __y; }
+#endif
+
+__DEVICE__
+float __fdividef(float __x, float __y) { return __x / __y; }
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fmaf_rd(float __x, float __y, float __z) {
+  return __ocml_fma_rtn_f32(__x, __y, __z);
+}
+__DEVICE__
+float __fmaf_rn(float __x, float __y, float __z) {
+  return __ocml_fma_rte_f32(__x, __y, __z);
+}
+__DEVICE__
+float __fmaf_ru(float __x, float __y, float __z) {
+  return __ocml_fma_rtp_f32(__x, __y, __z);
+}
+__DEVICE__
+float __fmaf_rz(float __x, float __y, float __z) {
+  return __ocml_fma_rtz_f32(__x, __y, __z);
+}
+#else
+__DEVICE__
+float __fmaf_rn(float __x, float __y, float __z) {
+  return __builtin_fmaf(__x, __y, __z);
+}
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }
+__DEVICE__
+float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }
+__DEVICE__
+float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }
+__DEVICE__
+float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fmul_rn(float __x, float __y) { return __x * __y; }
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }
+__DEVICE__
+float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }
+__DEVICE__
+float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }
+__DEVICE__
+float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }
+#else
+__DEVICE__
+float __frcp_rn(float __x) { return 1.0f / __x; }
+#endif
+
+__DEVICE__
+float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); }
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
+__DEVICE__
+float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }
+__DEVICE__
+float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
+__DEVICE__
+float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
+#else
+__DEVICE__
+float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
+#endif
+
+#if defined OCML_BASIC_ROUNDED_OPERATIONS
+__DEVICE__
+float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }
+__DEVICE__
+float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }
+__DEVICE__
+float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }
+__DEVICE__
+float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }
+#else
+__DEVICE__
+float __fsub_rn(float __x, float __y) { return __x - __y; }
+#endif
+
+__DEVICE__
+float __log10f(float __x) { return __builtin_log10f(__x); }
+
+__DEVICE__
+float __log2f(float __x) { return __builtin_amdgcn_logf(__x); }
+
+__DEVICE__
+float __logf(float __x) { return __builtin_logf(__x); }
+
+__DEVICE__
+float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
+
+__DEVICE__
+float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
+
+__DEVICE__
+void __sincosf(float __x, float *__sinptr, float *__cosptr) {
+  *__sinptr = __ocml_native_sin_f32(__x);
+  *__cosptr = __ocml_native_cos_f32(__x);
+}
+
+__DEVICE__
+float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
+
+__DEVICE__
+float __tanf(float __x) {
+  return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x));
+}
+// END INTRINSICS
+
 #if defined(__cplusplus)
 __DEVICE__
 int abs(int __x) {
-  int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
+  return __builtin_abs(__x);
 }
 __DEVICE__
 long labs(long __x) {
-  long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
+  return __builtin_labs(__x);
 }
 __DEVICE__
 long long llabs(long long __x) {
-  long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
-  return (__x ^ __sgn) - __sgn;
+  return __builtin_llabs(__x);
 }
 #endif

@ -188,7 +355,7 @@ __DEVICE__
 float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); }

 __DEVICE__
-float cosf(float __x) { return __ocml_cos_f32(__x); }
+float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); }

 __DEVICE__
 float coshf(float __x) { return __ocml_cosh_f32(__x); }
@ -321,13 +488,13 @@ __DEVICE__
 float log1pf(float __x) { return __ocml_log1p_f32(__x); }

 __DEVICE__
-float log2f(float __x) { return __builtin_log2f(__x); }
+float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); }

 __DEVICE__
 float logbf(float __x) { return __ocml_logb_f32(__x); }

 __DEVICE__
-float logf(float __x) { return __builtin_logf(__x); }
+float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); }

 __DEVICE__
 long int lrintf(float __x) { return __builtin_rintf(__x); }
@ -401,7 +568,7 @@ float normf(int __dim,
    ++__a;
  }

-  return __ocml_sqrt_f32(__r);
+  return __builtin_sqrtf(__r);
 }

 __DEVICE__
@ -483,9 +650,13 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) {
 #ifdef __OPENMP_AMDGCN__
 #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc)
 #endif
+#ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+  __sincosf(__x, __sinptr, __cosptr);
+#else
  *__sinptr =
      __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp);
  *__cosptr = __tmp;
+#endif
 }

 __DEVICE__
@ -500,7 +671,7 @@ void sincospif(float __x, float *__sinptr, float *__cosptr) {
 }

 __DEVICE__
-float sinf(float __x) { return __ocml_sin_f32(__x); }
+float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); }

 __DEVICE__
 float sinhf(float __x) { return __ocml_sinh_f32(__x); }
@ -509,7 +680,7 @@ __DEVICE__
 float sinpif(float __x) { return __ocml_sinpi_f32(__x); }

 __DEVICE__
-float sqrtf(float __x) { return __ocml_sqrt_f32(__x); }
+float sqrtf(float __x) { return __builtin_sqrtf(__x); }

 __DEVICE__
 float tanf(float __x) { return __ocml_tan_f32(__x); }
@ -551,158 +722,7 @@ float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication
  return __x1;
 }

-// BEGIN INTRINSICS

-__DEVICE__
-float __cosf(float __x) { return __ocml_native_cos_f32(__x); }
-
-__DEVICE__
-float __exp10f(float __x) { return __ocml_native_exp10_f32(__x); }
-
-__DEVICE__
-float __expf(float __x) { return __ocml_native_exp_f32(__x); }
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); }
-__DEVICE__
-float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); }
-__DEVICE__
-float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); }
-__DEVICE__
-float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fadd_rn(float __x, float __y) { return __x + __y; }
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); }
-__DEVICE__
-float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); }
-__DEVICE__
-float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); }
-__DEVICE__
-float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fdiv_rn(float __x, float __y) { return __x / __y; }
-#endif
-
-__DEVICE__
-float __fdividef(float __x, float __y) { return __x / __y; }
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fmaf_rd(float __x, float __y, float __z) {
-  return __ocml_fma_rtn_f32(__x, __y, __z);
-}
-__DEVICE__
-float __fmaf_rn(float __x, float __y, float __z) {
-  return __ocml_fma_rte_f32(__x, __y, __z);
-}
-__DEVICE__
-float __fmaf_ru(float __x, float __y, float __z) {
-  return __ocml_fma_rtp_f32(__x, __y, __z);
-}
-__DEVICE__
-float __fmaf_rz(float __x, float __y, float __z) {
-  return __ocml_fma_rtz_f32(__x, __y, __z);
-}
-#else
-__DEVICE__
-float __fmaf_rn(float __x, float __y, float __z) {
-  return __builtin_fmaf(__x, __y, __z);
-}
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); }
-__DEVICE__
-float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); }
-__DEVICE__
-float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); }
-__DEVICE__
-float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fmul_rn(float __x, float __y) { return __x * __y; }
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); }
-__DEVICE__
-float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); }
-__DEVICE__
-float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); }
-__DEVICE__
-float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); }
-#else
-__DEVICE__
-float __frcp_rn(float __x) { return 1.0f / __x; }
-#endif
-
-__DEVICE__
-float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); }
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); }
-__DEVICE__
-float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); }
-__DEVICE__
-float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); }
-__DEVICE__
-float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); }
-#else
-__DEVICE__
-float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); }
-#endif
-
-#if defined OCML_BASIC_ROUNDED_OPERATIONS
-__DEVICE__
-float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); }
-__DEVICE__
-float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); }
-__DEVICE__
-float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); }
-__DEVICE__
-float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); }
-#else
-__DEVICE__
-float __fsub_rn(float __x, float __y) { return __x - __y; }
-#endif
-
-__DEVICE__
-float __log10f(float __x) { return __ocml_native_log10_f32(__x); }
-
-__DEVICE__
-float __log2f(float __x) { return __ocml_native_log2_f32(__x); }
-
-__DEVICE__
-float __logf(float __x) { return __ocml_native_log_f32(__x); }
-
-__DEVICE__
-float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); }
-
-__DEVICE__
-float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); }
-
-__DEVICE__
-void __sincosf(float __x, float *__sinptr, float *__cosptr) {
-  *__sinptr = __ocml_native_sin_f32(__x);
-  *__cosptr = __ocml_native_cos_f32(__x);
-}
-
-__DEVICE__
-float __sinf(float __x) { return __ocml_native_sin_f32(__x); }
-
-__DEVICE__
-float __tanf(float __x) { return __ocml_tan_f32(__x); }
-// END INTRINSICS
 // END FLOAT

 // BEGIN DOUBLE
@ -941,7 +961,7 @@ double norm(int __dim,
    ++__a;
  }

-  return __ocml_sqrt_f64(__r);
+  return __builtin_sqrt(__r);
 }

 __DEVICE__
@ -1064,7 +1084,7 @@ __DEVICE__
 double sinpi(double __x) { return __ocml_sinpi_f64(__x); }

 __DEVICE__
-double sqrt(double __x) { return __ocml_sqrt_f64(__x); }
+double sqrt(double __x) { return __builtin_sqrt(__x); }

 __DEVICE__
 double tan(double __x) { return __ocml_tan_f64(__x); }
@ -1198,7 +1218,7 @@ __DEVICE__
 double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); }
 #else
 __DEVICE__
-double __dsqrt_rn(double __x) { return __ocml_sqrt_f64(__x); }
+double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); }
 #endif

 #if defined OCML_BASIC_ROUNDED_OPERATIONS
@ -1288,16 +1308,17 @@ double min(double __x, double __y) { return __builtin_fmin(__x, __y); }

 #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 __host__ inline static int min(int __arg1, int __arg2) {
-  return std::min(__arg1, __arg2);
+  return __arg1 < __arg2 ? __arg1 : __arg2;
 }

 __host__ inline static int max(int __arg1, int __arg2) {
-  return std::max(__arg1, __arg2);
+  return __arg1 > __arg2 ? __arg1 : __arg2;
 }
 #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 #endif

 #pragma pop_macro("__DEVICE__")
 #pragma pop_macro("__RETURN_TYPE")
+#pragma pop_macro("__FAST_OR_SLOW")

 #endif // __CLANG_HIP_MATH_H__
--- a/lib/include/__clang_hip_runtime_wrapper.h
+++ b/lib/include/__clang_hip_runtime_wrapper.h
@ -46,6 +46,67 @@ extern "C" {
 }
 #endif //__cplusplus

+#if !defined(__HIPCC_RTC__)
+#if __has_include("hip/hip_version.h")
+#include "hip/hip_version.h"
+#endif // __has_include("hip/hip_version.h")
+#endif // __HIPCC_RTC__
+
+typedef __SIZE_TYPE__ __hip_size_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif //__cplusplus
+
+#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
+__device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
+__device__ void __ockl_dm_dealloc(unsigned long long __addr);
+#if __has_feature(address_sanitizer)
+__device__ unsigned long long __asan_malloc_impl(unsigned long long __size,
+                                                 unsigned long long __pc);
+__device__ void __asan_free_impl(unsigned long long __addr,
+                                 unsigned long long __pc);
+__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  return (void *)__asan_malloc_impl(__size, __pc);
+}
+__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  __asan_free_impl((unsigned long long)__ptr, __pc);
+}
+#else // __has_feature(address_sanitizer)
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  return (void *) __ockl_dm_alloc(__size);
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __ockl_dm_dealloc((unsigned long long)__ptr);
+}
+#endif // __has_feature(address_sanitizer)
+#else  // HIP version check
+#if __HIP_ENABLE_DEVICE_MALLOC__
+__device__ void *__hip_malloc(__hip_size_t __size);
+__device__ void *__hip_free(void *__ptr);
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  return __hip_malloc(__size);
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __hip_free(__ptr);
+}
+#else  // __HIP_ENABLE_DEVICE_MALLOC__
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  __builtin_trap();
+  return (void *)0;
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __builtin_trap();
+}
+#endif // __HIP_ENABLE_DEVICE_MALLOC__
+#endif // HIP version check
+
+#ifdef __cplusplus
+} // extern "C"
+#endif //__cplusplus
+
 #if !defined(__HIPCC_RTC__)
 #include <cmath>
 #include <cstdlib>
@ -71,59 +132,6 @@ typedef __SIZE_TYPE__ size_t;
 #define INT_MAX __INTMAX_MAX__
 #endif // __HIPCC_RTC__

-typedef __SIZE_TYPE__ __hip_size_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif //__cplusplus
-
-#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
-extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
-extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
-#if __has_feature(address_sanitizer)
-extern "C" __device__ unsigned long long __asan_malloc_impl(unsigned long long __size, unsigned long long __pc);
-extern "C" __device__ void __asan_free_impl(unsigned long long __addr, unsigned long long __pc);
-__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
-  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
-  return (void *)__asan_malloc_impl(__size, __pc);
-}
-__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
-  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
-  __asan_free_impl((unsigned long long)__ptr, __pc);
-}
-#else
-__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
-  return (void *) __ockl_dm_alloc(__size);
-}
-__attribute__((weak)) inline __device__ void free(void *__ptr) {
-  __ockl_dm_dealloc((unsigned long long)__ptr);
-}
-#endif // __has_feature(address_sanitizer)
-#else  // HIP version check
-#if __HIP_ENABLE_DEVICE_MALLOC__
-__device__ void *__hip_malloc(__hip_size_t __size);
-__device__ void *__hip_free(void *__ptr);
-__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
-  return __hip_malloc(__size);
-}
-__attribute__((weak)) inline __device__ void free(void *__ptr) {
-  __hip_free(__ptr);
-}
-#else
-__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
-  __builtin_trap();
-  return (void *)0;
-}
-__attribute__((weak)) inline __device__ void free(void *__ptr) {
-  __builtin_trap();
-}
-#endif
-#endif // HIP version check
-
-#ifdef __cplusplus
-} // extern "C"
-#endif //__cplusplus
-
 #include <__clang_hip_libdevice_declares.h>
 #include <__clang_hip_math.h>
 #include <__clang_hip_stdlib.h>
--- a/lib/include/stdarg_gnuc_va_list.h
+++ b/lib/include/stdarg_gnuc_va_list.h
@ -0,0 +1,13 @@
+/*===---- __stdarg___gnuc_va_list.h - Definition of __gnuc_va_list ---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST
+typedef __builtin_va_list __gnuc_va_list;
+#endif
--- a/lib/include/stdarg_va_copy.h
+++ b/lib/include/stdarg_va_copy.h
@ -0,0 +1,12 @@
+/*===---- __stdarg___va_copy.h - Definition of __va_copy -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __va_copy
+#define __va_copy(d, s) __builtin_va_copy(d, s)
+#endif
--- a/lib/include/__stdarg_va_arg.h
+++ b/lib/include/__stdarg_va_arg.h
@ -0,0 +1,22 @@
+/*===---- __stdarg_va_arg.h - Definitions of va_start, va_arg, va_end-------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef va_arg
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+/* C23 does not require the second parameter for va_start. */
+#define va_start(ap, ...) __builtin_va_start(ap, 0)
+#else
+/* Versions before C23 do require the second parameter. */
+#define va_start(ap, param) __builtin_va_start(ap, param)
+#endif
+#define va_end(ap) __builtin_va_end(ap)
+#define va_arg(ap, type) __builtin_va_arg(ap, type)
+
+#endif
--- a/lib/include/__stdarg_va_copy.h
+++ b/lib/include/__stdarg_va_copy.h
@ -0,0 +1,12 @@
+/*===---- __stdarg_va_copy.h - Definition of va_copy------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef va_copy
+#define va_copy(dest, src) __builtin_va_copy(dest, src)
+#endif
--- a/lib/include/__stdarg_va_list.h
+++ b/lib/include/__stdarg_va_list.h
@ -0,0 +1,13 @@
+/*===---- __stdarg_va_list.h - Definition of va_list -----------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _VA_LIST
+#define _VA_LIST
+typedef __builtin_va_list va_list;
+#endif
--- a/lib/include/__stddef_max_align_t.h
+++ b/lib/include/__stddef_max_align_t.h
@ -1,4 +1,4 @@
-/*===---- __stddef_max_align_t.h - Definition of max_align_t for modules ---===
+/*===---- __stddef_max_align_t.h - Definition of max_align_t ---------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
--- a/lib/include/__stddef_null.h
+++ b/lib/include/__stddef_null.h
@ -0,0 +1,29 @@
+/*===---- __stddef_null.h - Definition of NULL -----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(NULL) || !__building_module(_Builtin_stddef)
+
+/* linux/stddef.h will define NULL to 0. glibc (and other) headers then define
+ * __need_NULL and rely on stddef.h to redefine NULL to the correct value again.
+ * Modules don't support redefining macros like that, but support that pattern
+ * in the non-modules case.
+ */
+#undef NULL
+
+#ifdef __cplusplus
+#if !defined(__MINGW32__) && !defined(_MSC_VER)
+#define NULL __null
+#else
+#define NULL 0
+#endif
+#else
+#define NULL ((void*)0)
+#endif
+
+#endif
--- a/lib/include/__stddef_nullptr_t.h
+++ b/lib/include/__stddef_nullptr_t.h
@ -0,0 +1,29 @@
+/*===---- __stddef_nullptr_t.h - Definition of nullptr_t -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_NULLPTR_T) ||                                                    \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _NULLPTR_T
+
+#ifdef __cplusplus
+#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
+namespace std {
+typedef decltype(nullptr) nullptr_t;
+}
+using ::std::nullptr_t;
+#endif
+#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+typedef typeof(nullptr) nullptr_t;
+#endif
+
+#endif
--- a/lib/include/__stddef_offsetof.h
+++ b/lib/include/__stddef_offsetof.h
@ -0,0 +1,17 @@
+/*===---- __stddef_offsetof.h - Definition of offsetof ---------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(offsetof) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define offsetof(t, d) __builtin_offsetof(t, d)
+#endif
--- a/lib/include/__stddef_ptrdiff_t.h
+++ b/lib/include/__stddef_ptrdiff_t.h
@ -0,0 +1,20 @@
+/*===---- __stddef_ptrdiff_t.h - Definition of ptrdiff_t -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_PTRDIFF_T) ||                                                    \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _PTRDIFF_T
+
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+#endif
--- a/lib/include/__stddef_rsize_t.h
+++ b/lib/include/__stddef_rsize_t.h
@ -0,0 +1,20 @@
+/*===---- __stddef_rsize_t.h - Definition of rsize_t -----------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_RSIZE_T) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _RSIZE_T
+
+typedef __SIZE_TYPE__ rsize_t;
+
+#endif
--- a/lib/include/__stddef_size_t.h
+++ b/lib/include/__stddef_size_t.h
@ -0,0 +1,20 @@
+/*===---- __stddef_size_t.h - Definition of size_t -------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_SIZE_T) ||                                                       \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _SIZE_T
+
+typedef __SIZE_TYPE__ size_t;
+
+#endif
--- a/lib/include/__stddef_unreachable.h
+++ b/lib/include/__stddef_unreachable.h
@ -0,0 +1,21 @@
+/*===---- __stddef_unreachable.h - Definition of unreachable ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __cplusplus
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(unreachable) ||                                                   \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define unreachable() __builtin_unreachable()
+#endif
+
+#endif
--- a/lib/include/__stddef_wchar_t.h
+++ b/lib/include/__stddef_wchar_t.h
@ -0,0 +1,28 @@
+/*===---- __stddef_wchar.h - Definition of wchar_t -------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
+
+/*
+ * When -fbuiltin-headers-in-system-modules is set this is a non-modular header
+ * and needs to behave as if it was textual.
+ */
+#if !defined(_WCHAR_T) ||                                                      \
+    (__has_feature(modules) && !__building_module(_Builtin_stddef))
+#define _WCHAR_T
+
+#ifdef _MSC_EXTENSIONS
+#define _WCHAR_T_DEFINED
+#endif
+
+typedef __WCHAR_TYPE__ wchar_t;
+
+#endif
+
+#endif
--- a/lib/include/__stddef_wint_t.h
+++ b/lib/include/__stddef_wint_t.h
@ -0,0 +1,15 @@
+/*===---- __stddef_wint.h - Definition of wint_t ---------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _WINT_T
+#define _WINT_T
+
+typedef __WINT_TYPE__ wint_t;
+
+#endif
--- a/lib/include/adcintrin.h
+++ b/lib/include/adcintrin.h
@ -0,0 +1,160 @@
+/*===---- adcintrin.h - ADC intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __ADCINTRIN_H
+#define __ADCINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Use C++ inline semantics in C++, GNU inline for C mode. */
+#if defined(__cplusplus)
+#define __INLINE __inline
+#else
+#define __INLINE static __inline
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
+///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
+///    at \a __p, and returns the 8-bit carry-out (carry flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store32(__p, __x + __y + temp)
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    A 32-bit unsigned addend.
+/// \param __y
+///    A 32-bit unsigned addend.
+/// \param __p
+///    Pointer to memory for storing the sum.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
+                                                        unsigned int __x,
+                                                        unsigned int __y,
+                                                        unsigned int *__p) {
+  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
+///    flag \a __cf, and subtracts the result from unsigned 32-bit integer
+///    \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
+///    and returns the 8-bit carry-out (carry or overflow flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store32(__p, __x - (__y + temp))
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c SBB instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    The 32-bit unsigned minuend.
+/// \param __y
+///    The 32-bit unsigned subtrahend.
+/// \param __p
+///    Pointer to memory for storing the difference.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
+                                                         unsigned int __x,
+                                                         unsigned int __y,
+                                                         unsigned int *__p) {
+  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
+///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
+///    at \a __p, and returns the 8-bit carry-out (carry flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store64(__p, __x + __y + temp)
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    A 64-bit unsigned addend.
+/// \param __y
+///    A 64-bit unsigned addend.
+/// \param __p
+///    Pointer to memory for storing the sum.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+              unsigned long long __y, unsigned long long *__p) {
+  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+
+/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
+///    flag \a __cf, and subtracts the result from unsigned 64-bit integer
+///    \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
+///    and returns the 8-bit carry-out (carry or overflow flag).
+///
+/// \code{.operation}
+/// temp := (__cf == 0) ? 0 : 1
+/// Store64(__p, __x - (__y + temp))
+/// result := CF
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c ADC instruction.
+///
+/// \param __cf
+///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
+/// \param __x
+///    The 64-bit unsigned minuend.
+/// \param __y
+///    The 64-bit unsigned subtrahend.
+/// \param __p
+///    Pointer to memory for storing the difference.
+/// \returns The 8-bit unsigned carry-out value.
+__INLINE unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+               unsigned long long __y, unsigned long long *__p) {
+  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#undef __INLINE
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADCINTRIN_H */
--- a/lib/include/adxintrin.h
+++ b/lib/include/adxintrin.h
@ -15,7 +15,8 @@
 #define __ADXINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("adx")))

 /* Use C++ inline semantics in C++, GNU inline for C mode. */
 #if defined(__cplusplus)
@ -53,9 +54,9 @@ extern "C" {
 /// \param __p
 ///    Pointer to memory for storing the sum.
 /// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char
-    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-    _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarryx_u32(unsigned char __cf,
+                                                         unsigned int __x,
+                                                         unsigned int __y,
                                                         unsigned int *__p) {
  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
 }
@ -84,144 +85,18 @@ __INLINE unsigned char
 /// \param __p
 ///    Pointer to memory for storing the sum.
 /// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char
-    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+__INLINE unsigned char __DEFAULT_FN_ATTRS
 _addcarryx_u64(unsigned char __cf, unsigned long long __x,
               unsigned long long __y, unsigned long long *__p) {
  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif

-/* Intrinsics that are also available if __ADX__ is undefined. */
-
-/// Adds unsigned 32-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 32-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 32-bit unsigned addend.
-/// \param __y
-///    A 32-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
-                                                        unsigned int __x,
-                                                        unsigned int __y,
-                                                        unsigned int *__p) {
-  return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integers \a __x and \a __y, plus 0 or 1 as indicated
-///    by the carry flag \a __cf. Stores the unsigned 64-bit sum in the memory
-///    at \a __p, and returns the 8-bit carry-out (carry flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x + __y + temp)
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    A 64-bit unsigned addend.
-/// \param __y
-///    A 64-bit unsigned addend.
-/// \param __p
-///    Pointer to memory for storing the sum.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u64(unsigned char __cf, unsigned long long __x,
-              unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-/// Adds unsigned 32-bit integer \a __y to 0 or 1 as indicated by the carry
-///    flag \a __cf, and subtracts the result from unsigned 32-bit integer
-///    \a __x. Stores the unsigned 32-bit difference in the memory at \a __p,
-///    and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store32(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c SBB instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    The 32-bit unsigned minuend.
-/// \param __y
-///    The 32-bit unsigned subtrahend.
-/// \param __p
-///    Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
-                                                         unsigned int __x,
-                                                         unsigned int __y,
-                                                         unsigned int *__p) {
-  return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-/// Adds unsigned 64-bit integer \a __y to 0 or 1 as indicated by the carry
-///    flag \a __cf, and subtracts the result from unsigned 64-bit integer
-///    \a __x. Stores the unsigned 64-bit difference in the memory at \a __p,
-///    and returns the 8-bit carry-out (carry or overflow flag).
-///
-/// \code{.operation}
-/// temp := (__cf == 0) ? 0 : 1
-/// Store64(__p, __x - (__y + temp))
-/// result := CF
-/// \endcode
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the \c ADC instruction.
-///
-/// \param __cf
-///    The 8-bit unsigned carry flag; any non-zero value indicates carry.
-/// \param __x
-///    The 64-bit unsigned minuend.
-/// \param __y
-///    The 64-bit unsigned subtrahend.
-/// \param __p
-///    Pointer to memory for storing the difference.
-/// \returns The 8-bit unsigned carry-out value.
-__INLINE unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long *__p) {
-  return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
-}
-#endif
-
 #if defined(__cplusplus)
 }
 #endif

+#undef __INLINE
 #undef __DEFAULT_FN_ATTRS

 #endif /* __ADXINTRIN_H */
--- a/lib/include/altivec.h
+++ b/lib/include/altivec.h
@ -14647,67 +14647,86 @@ static __inline__ void __ATTRS_o_ai vec_stvrxl(vector float __a, int __b,

 static __inline__ vector signed char __ATTRS_o_ai vec_promote(signed char __a,
                                                              int __b) {
-  vector signed char __res = (vector signed char)(0);
-  __res[__b & 0x7] = __a;
+  const vector signed char __zero = (vector signed char)0;
+  vector signed char __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1,
+                              -1, -1, -1, -1, -1, -1, -1, -1);
+  __res[__b & 0xf] = __a;
  return __res;
 }

 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_promote(unsigned char __a, int __b) {
-  vector unsigned char __res = (vector unsigned char)(0);
-  __res[__b & 0x7] = __a;
+  const vector unsigned char __zero = (vector unsigned char)(0);
+  vector unsigned char __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1,
+                              -1, -1, -1, -1, -1, -1, -1, -1);
+  __res[__b & 0xf] = __a;
  return __res;
 }

 static __inline__ vector short __ATTRS_o_ai vec_promote(short __a, int __b) {
-  vector short __res = (vector short)(0);
+  const vector short __zero = (vector short)(0);
+  vector short __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1);
  __res[__b & 0x7] = __a;
  return __res;
 }

 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_promote(unsigned short __a, int __b) {
-  vector unsigned short __res = (vector unsigned short)(0);
+  const vector unsigned short __zero = (vector unsigned short)(0);
+  vector unsigned short __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1, -1, -1, -1, -1);
  __res[__b & 0x7] = __a;
  return __res;
 }

 static __inline__ vector int __ATTRS_o_ai vec_promote(int __a, int __b) {
-  vector int __res = (vector int)(0);
+  const vector int __zero = (vector int)(0);
+  vector int __res = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
  __res[__b & 0x3] = __a;
  return __res;
 }

 static __inline__ vector unsigned int __ATTRS_o_ai vec_promote(unsigned int __a,
                                                               int __b) {
-  vector unsigned int __res = (vector unsigned int)(0);
+  const vector unsigned int __zero = (vector unsigned int)(0);
+  vector unsigned int __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
  __res[__b & 0x3] = __a;
  return __res;
 }

 static __inline__ vector float __ATTRS_o_ai vec_promote(float __a, int __b) {
-  vector float __res = (vector float)(0);
+  const vector float __zero = (vector float)(0);
+  vector float __res = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
  __res[__b & 0x3] = __a;
  return __res;
 }

 #ifdef __VSX__
 static __inline__ vector double __ATTRS_o_ai vec_promote(double __a, int __b) {
-  vector double __res = (vector double)(0);
+  const vector double __zero = (vector double)(0);
+  vector double __res = __builtin_shufflevector(__zero, __zero, -1, -1);
  __res[__b & 0x1] = __a;
  return __res;
 }

 static __inline__ vector signed long long __ATTRS_o_ai
 vec_promote(signed long long __a, int __b) {
-  vector signed long long __res = (vector signed long long)(0);
+  const vector signed long long __zero = (vector signed long long)(0);
+  vector signed long long __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1);
  __res[__b & 0x1] = __a;
  return __res;
 }

 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_promote(unsigned long long __a, int __b) {
-  vector unsigned long long __res = (vector unsigned long long)(0);
+  const vector unsigned long long __zero = (vector unsigned long long)(0);
+  vector unsigned long long __res =
+      __builtin_shufflevector(__zero, __zero, -1, -1);
  __res[__b & 0x1] = __a;
  return __res;
 }
--- a/lib/include/ammintrin.h
+++ b/lib/include/ammintrin.h
@ -155,9 +155,9 @@ _mm_insert_si64(__m128i __x, __m128i __y)
 /// \param __a
 ///    The 64-bit double-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_sd(double *__p, __m128d __a)
+_mm_stream_sd(void *__p, __m128d __a)
 {
-  __builtin_ia32_movntsd(__p, (__v2df)__a);
+  __builtin_ia32_movntsd((double *)__p, (__v2df)__a);
 }

 /// Stores a 32-bit single-precision floating-point value in a 32-bit
@ -173,9 +173,9 @@ _mm_stream_sd(double *__p, __m128d __a)
 /// \param __a
 ///    The 32-bit single-precision floating-point register value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ss(float *__p, __m128 __a)
+_mm_stream_ss(void *__p, __m128 __a)
 {
-  __builtin_ia32_movntss(__p, (__v4sf)__a);
+  __builtin_ia32_movntss((float *)__p, (__v4sf)__a);
 }

 #undef __DEFAULT_FN_ATTRS
--- a/lib/include/arm_acle.h
+++ b/lib/include/arm_acle.h
@ -4,6 +4,13 @@
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
+ * The Arm C Language Extensions specifications can be found in the following
+ * link: https://github.com/ARM-software/acle/releases
+ *
+ * The ACLE section numbers are subject to change. When consulting the
+ * specifications, it is recommended to search using section titles if
+ * the section numbers look outdated.
+ *
 *===-----------------------------------------------------------------------===
 */

@ -20,8 +27,8 @@
 extern "C" {
 #endif

-/* 8 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
-/* 8.3 Memory barriers */
+/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
+/* 7.3 Memory barriers */
 #if !__has_builtin(__dmb)
 #define __dmb(i) __builtin_arm_dmb(i)
 #endif
@ -32,7 +39,7 @@ extern "C" {
 #define __isb(i) __builtin_arm_isb(i)
 #endif

-/* 8.4 Hints */
+/* 7.4 Hints */

 #if !__has_builtin(__wfi)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
@ -68,7 +75,7 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
 #define __dbg(t) __builtin_arm_dbg(t)
 #endif

-/* 8.5 Swap */
+/* 7.5 Swap */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t v;
@ -78,8 +85,8 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
  return v;
 }

-/* 8.6 Memory prefetch intrinsics */
-/* 8.6.1 Data prefetch */
+/* 7.6 Memory prefetch intrinsics */
+/* 7.6.1 Data prefetch */
 #define __pld(addr) __pldx(0, 0, 0, addr)

 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
@ -90,7 +97,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
 #endif

-/* 8.6.2 Instruction prefetch */
+/* 7.6.2 Instruction prefetch */
 #define __pli(addr) __plix(0, 0, addr)

 #if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
@ -101,15 +108,15 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
 #endif

-/* 8.7 NOP */
+/* 7.7 NOP */
 #if !defined(_MSC_VER) || !defined(__aarch64__)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
 }
 #endif

-/* 9 DATA-PROCESSING INTRINSICS */
-/* 9.2 Miscellaneous data-processing intrinsics */
+/* 8 DATA-PROCESSING INTRINSICS */
+/* 8.2 Miscellaneous data-processing intrinsics */
 /* ROR */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __ror(uint32_t __x, uint32_t __y) {
@ -248,9 +255,7 @@ __rbitl(unsigned long __t) {
 #endif
 }

-/*
- * 9.3 16-bit multiplications
- */
+/* 8.3 16-bit multiplications */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
 __smulbb(int32_t __a, int32_t __b) {
@ -279,18 +284,18 @@ __smulwt(int32_t __a, int32_t __b) {
 #endif

 /*
- * 9.4 Saturating intrinsics
+ * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
-/* 9.4.1 Width-specified saturation intrinsics */
+/* 8.4.1 Width-specified saturation intrinsics */
 #if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
 #define __ssat(x, y) __builtin_arm_ssat(x, y)
 #define __usat(x, y) __builtin_arm_usat(x, y)
 #endif

-/* 9.4.2 Saturating addition and subtraction intrinsics */
+/* 8.4.2 Saturating addition and subtraction intrinsics */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __qadd(int32_t __t, int32_t __v) {
@ -308,7 +313,7 @@ __qdbl(int32_t __t) {
 }
 #endif

-/* 9.4.3 Accumultating multiplications */
+/* 8.4.3 Accumultating multiplications */
 #if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __smlabb(int32_t __a, int32_t __b, int32_t __c) {
@ -337,13 +342,13 @@ __smlawt(int32_t __a, int32_t __b, int32_t __c) {
 #endif


-/* 9.5.4 Parallel 16-bit saturation */
+/* 8.5.4 Parallel 16-bit saturation */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 #define __ssat16(x, y) __builtin_arm_ssat16(x, y)
 #define __usat16(x, y) __builtin_arm_usat16(x, y)
 #endif

-/* 9.5.5 Packing and unpacking */
+/* 8.5.5 Packing and unpacking */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 typedef int32_t int8x4_t;
 typedef int32_t int16x2_t;
@ -368,7 +373,7 @@ __uxtb16(int8x4_t __a) {
 }
 #endif

-/* 9.5.6 Parallel selection */
+/* 8.5.6 Parallel selection */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
 __sel(uint8x4_t __a, uint8x4_t __b) {
@ -376,7 +381,7 @@ __sel(uint8x4_t __a, uint8x4_t __b) {
 }
 #endif

-/* 9.5.7 Parallel 8-bit addition and subtraction */
+/* 8.5.7 Parallel 8-bit addition and subtraction */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
 __qadd8(int8x4_t __a, int8x4_t __b) {
@ -428,7 +433,7 @@ __usub8(uint8x4_t __a, uint8x4_t __b) {
 }
 #endif

-/* 9.5.8 Sum of 8-bit absolute differences */
+/* 8.5.8 Sum of 8-bit absolute differences */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __usad8(uint8x4_t __a, uint8x4_t __b) {
@ -440,7 +445,7 @@ __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
 }
 #endif

-/* 9.5.9 Parallel 16-bit addition and subtraction */
+/* 8.5.9 Parallel 16-bit addition and subtraction */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
 __qadd16(int16x2_t __a, int16x2_t __b) {
@ -540,7 +545,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
 }
 #endif

-/* 9.5.10 Parallel 16-bit multiplications */
+/* 8.5.10 Parallel 16-bit multiplications */
 #if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
@ -592,7 +597,22 @@ __smusdx(int16x2_t __a, int16x2_t __b) {
 }
 #endif

-/* 9.7 CRC32 intrinsics */
+/* 8.6 Floating-point data-processing intrinsics */
+#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING)    &&                         \
+  (__ARM_FEATURE_DIRECTED_ROUNDING))             &&                         \
+  (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
+static __inline__ double __attribute__((__always_inline__, __nodebug__))
+__rintn(double __a) {
+  return __builtin_roundeven(__a);
+}
+
+static __inline__ float __attribute__((__always_inline__, __nodebug__))
+__rintnf(float __a) {
+  return __builtin_roundevenf(__a);
+}
+#endif
+
+/* 8.8 CRC32 intrinsics */
 #if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) ||                   \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
@ -636,6 +656,7 @@ __crc32cd(uint32_t __a, uint64_t __b) {
 }
 #endif

+/* 8.6 Floating-point data-processing intrinsics */
 /* Armv8.3-A Javascript conversion intrinsic */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
@ -687,7 +708,7 @@ __rint64x(double __a) {
 }
 #endif

-/* Armv8.7-A load/store 64-byte intrinsics */
+/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 typedef struct {
    uint64_t val[8];
@ -713,7 +734,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 }
 #endif

-/* 10.1 Special register intrinsics */
+/* 11.1 Special register intrinsics */
 #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
 #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
 #define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
@ -727,7 +748,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
 #define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))

-/* Memory Tagging Extensions (MTE) Intrinsics */
+/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 #define __arm_mte_create_random_tag(__ptr, __mask)  __builtin_arm_irg(__ptr, __mask)
 #define __arm_mte_increment_tag(__ptr, __tag_offset)  __builtin_arm_addg(__ptr, __tag_offset)
@ -736,12 +757,71 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

-/* Memory Operations Intrinsics */
+/* 18 Memory Operations Intrinsics */
 #define __arm_mops_memset_tag(__tagged_address, __value, __size)    \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
 #endif

-/* Transactional Memory Extension (TME) Intrinsics */
+/* 11.3 Coprocessor Intrinsics */
+#if defined(__ARM_FEATURE_COPROC)
+
+#if (__ARM_FEATURE_COPROC & 0x1)
+
+#if (__ARM_ARCH < 8)
+#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)                           \
+  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
+#endif /* __ARM_ARCH < 8 */
+
+#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
+#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)
+
+#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2)                         \
+  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
+#define __arm_mrc(coproc, opc1, CRn, CRm, opc2)                                \
+  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)
+
+#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
+#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
+#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
+#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH != 8) */
+
+#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
+#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)                           \
+  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
+#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
+#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
+#endif /* ___ARM_ARCH_8M_MAIN__ */
+
+#endif /* __ARM_FEATURE_COPROC & 0x1 */
+
+#if (__ARM_FEATURE_COPROC & 0x2)
+#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)                          \
+  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
+#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
+#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
+#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
+#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
+#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)                        \
+  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
+#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2)                               \
+  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
+#endif
+
+#if (__ARM_FEATURE_COPROC & 0x4)
+#define __arm_mcrr(coproc, opc1, value, CRm)                                   \
+  __builtin_arm_mcrr(coproc, opc1, value, CRm)
+#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
+#endif
+
+#if (__ARM_FEATURE_COPROC & 0x8)
+#define __arm_mcrr2(coproc, opc1, value, CRm)                                  \
+  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
+#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
+#endif
+
+#endif // __ARM_FEATURE_COPROC
+
+/* 17 Transactional Memory Extension (TME) Intrinsics */
 #if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME

 #define _TMFAILURE_REASON  0x00007fffu
@ -763,7 +843,7 @@ __arm_st64bv0(void *__addr, data512_t __value) {

 #endif /* __ARM_FEATURE_TME */

-/* Armv8.5-A Random number generation intrinsics */
+/* 8.7 Armv8.5-A Random number generation intrinsics */
 #if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
 static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
 __rndr(uint64_t *__p) {
--- a/lib/include/arm_neon.h
+++ b/lib/include/arm_neon.h
@ -35,12 +35,7 @@
 #include <stdint.h>

 #include <arm_bf16.h>
-typedef float float32_t;
-typedef __fp16 float16_t;
-#ifdef __aarch64__
-typedef double float64_t;
-#endif
-
+#include <arm_vector_types.h>
 #ifdef __aarch64__
 typedef uint8_t poly8_t;
 typedef uint16_t poly16_t;
@ -51,30 +46,6 @@ typedef int8_t poly8_t;
 typedef int16_t poly16_t;
 typedef int64_t poly64_t;
 #endif
-typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
-typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
-typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
-typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
-typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
-typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
-typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
-typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
-typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
-typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
-typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
-typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
-typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
-typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
-typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
-typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
-typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
-typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
-typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
-typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
-#ifdef __aarch64__
-typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
-typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
-#endif
 typedef __attribute__((neon_polyvector_type(8))) poly8_t poly8x8_t;
 typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
 typedef __attribute__((neon_polyvector_type(4))) poly16_t poly16x4_t;
@ -82,96 +53,6 @@ typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t;
 typedef __attribute__((neon_polyvector_type(1))) poly64_t poly64x1_t;
 typedef __attribute__((neon_polyvector_type(2))) poly64_t poly64x2_t;

-typedef struct int8x8x2_t {
-  int8x8_t val[2];
-} int8x8x2_t;
-
-typedef struct int8x16x2_t {
-  int8x16_t val[2];
-} int8x16x2_t;
-
-typedef struct int16x4x2_t {
-  int16x4_t val[2];
-} int16x4x2_t;
-
-typedef struct int16x8x2_t {
-  int16x8_t val[2];
-} int16x8x2_t;
-
-typedef struct int32x2x2_t {
-  int32x2_t val[2];
-} int32x2x2_t;
-
-typedef struct int32x4x2_t {
-  int32x4_t val[2];
-} int32x4x2_t;
-
-typedef struct int64x1x2_t {
-  int64x1_t val[2];
-} int64x1x2_t;
-
-typedef struct int64x2x2_t {
-  int64x2_t val[2];
-} int64x2x2_t;
-
-typedef struct uint8x8x2_t {
-  uint8x8_t val[2];
-} uint8x8x2_t;
-
-typedef struct uint8x16x2_t {
-  uint8x16_t val[2];
-} uint8x16x2_t;
-
-typedef struct uint16x4x2_t {
-  uint16x4_t val[2];
-} uint16x4x2_t;
-
-typedef struct uint16x8x2_t {
-  uint16x8_t val[2];
-} uint16x8x2_t;
-
-typedef struct uint32x2x2_t {
-  uint32x2_t val[2];
-} uint32x2x2_t;
-
-typedef struct uint32x4x2_t {
-  uint32x4_t val[2];
-} uint32x4x2_t;
-
-typedef struct uint64x1x2_t {
-  uint64x1_t val[2];
-} uint64x1x2_t;
-
-typedef struct uint64x2x2_t {
-  uint64x2_t val[2];
-} uint64x2x2_t;
-
-typedef struct float16x4x2_t {
-  float16x4_t val[2];
-} float16x4x2_t;
-
-typedef struct float16x8x2_t {
-  float16x8_t val[2];
-} float16x8x2_t;
-
-typedef struct float32x2x2_t {
-  float32x2_t val[2];
-} float32x2x2_t;
-
-typedef struct float32x4x2_t {
-  float32x4_t val[2];
-} float32x4x2_t;
-
-#ifdef __aarch64__
-typedef struct float64x1x2_t {
-  float64x1_t val[2];
-} float64x1x2_t;
-
-typedef struct float64x2x2_t {
-  float64x2_t val[2];
-} float64x2x2_t;
-
-#endif
 typedef struct poly8x8x2_t {
  poly8x8_t val[2];
 } poly8x8x2_t;
@ -196,96 +77,6 @@ typedef struct poly64x2x2_t {
  poly64x2_t val[2];
 } poly64x2x2_t;

-typedef struct int8x8x3_t {
-  int8x8_t val[3];
-} int8x8x3_t;
-
-typedef struct int8x16x3_t {
-  int8x16_t val[3];
-} int8x16x3_t;
-
-typedef struct int16x4x3_t {
-  int16x4_t val[3];
-} int16x4x3_t;
-
-typedef struct int16x8x3_t {
-  int16x8_t val[3];
-} int16x8x3_t;
-
-typedef struct int32x2x3_t {
-  int32x2_t val[3];
-} int32x2x3_t;
-
-typedef struct int32x4x3_t {
-  int32x4_t val[3];
-} int32x4x3_t;
-
-typedef struct int64x1x3_t {
-  int64x1_t val[3];
-} int64x1x3_t;
-
-typedef struct int64x2x3_t {
-  int64x2_t val[3];
-} int64x2x3_t;
-
-typedef struct uint8x8x3_t {
-  uint8x8_t val[3];
-} uint8x8x3_t;
-
-typedef struct uint8x16x3_t {
-  uint8x16_t val[3];
-} uint8x16x3_t;
-
-typedef struct uint16x4x3_t {
-  uint16x4_t val[3];
-} uint16x4x3_t;
-
-typedef struct uint16x8x3_t {
-  uint16x8_t val[3];
-} uint16x8x3_t;
-
-typedef struct uint32x2x3_t {
-  uint32x2_t val[3];
-} uint32x2x3_t;
-
-typedef struct uint32x4x3_t {
-  uint32x4_t val[3];
-} uint32x4x3_t;
-
-typedef struct uint64x1x3_t {
-  uint64x1_t val[3];
-} uint64x1x3_t;
-
-typedef struct uint64x2x3_t {
-  uint64x2_t val[3];
-} uint64x2x3_t;
-
-typedef struct float16x4x3_t {
-  float16x4_t val[3];
-} float16x4x3_t;
-
-typedef struct float16x8x3_t {
-  float16x8_t val[3];
-} float16x8x3_t;
-
-typedef struct float32x2x3_t {
-  float32x2_t val[3];
-} float32x2x3_t;
-
-typedef struct float32x4x3_t {
-  float32x4_t val[3];
-} float32x4x3_t;
-
-#ifdef __aarch64__
-typedef struct float64x1x3_t {
-  float64x1_t val[3];
-} float64x1x3_t;
-
-typedef struct float64x2x3_t {
-  float64x2_t val[3];
-} float64x2x3_t;
-
-#endif
 typedef struct poly8x8x3_t {
  poly8x8_t val[3];
 } poly8x8x3_t;
@ -310,96 +101,6 @@ typedef struct poly64x2x3_t {
  poly64x2_t val[3];
 } poly64x2x3_t;

-typedef struct int8x8x4_t {
-  int8x8_t val[4];
-} int8x8x4_t;
-
-typedef struct int8x16x4_t {
-  int8x16_t val[4];
-} int8x16x4_t;
-
-typedef struct int16x4x4_t {
-  int16x4_t val[4];
-} int16x4x4_t;
-
-typedef struct int16x8x4_t {
-  int16x8_t val[4];
-} int16x8x4_t;
-
-typedef struct int32x2x4_t {
-  int32x2_t val[4];
-} int32x2x4_t;
-
-typedef struct int32x4x4_t {
-  int32x4_t val[4];
-} int32x4x4_t;
-
-typedef struct int64x1x4_t {
-  int64x1_t val[4];
-} int64x1x4_t;
-
-typedef struct int64x2x4_t {
-  int64x2_t val[4];
-} int64x2x4_t;
-
-typedef struct uint8x8x4_t {
-  uint8x8_t val[4];
-} uint8x8x4_t;
-
-typedef struct uint8x16x4_t {
-  uint8x16_t val[4];
-} uint8x16x4_t;
-
-typedef struct uint16x4x4_t {
-  uint16x4_t val[4];
-} uint16x4x4_t;
-
-typedef struct uint16x8x4_t {
-  uint16x8_t val[4];
-} uint16x8x4_t;
-
-typedef struct uint32x2x4_t {
-  uint32x2_t val[4];
-} uint32x2x4_t;
-
-typedef struct uint32x4x4_t {
-  uint32x4_t val[4];
-} uint32x4x4_t;
-
-typedef struct uint64x1x4_t {
-  uint64x1_t val[4];
-} uint64x1x4_t;
-
-typedef struct uint64x2x4_t {
-  uint64x2_t val[4];
-} uint64x2x4_t;
-
-typedef struct float16x4x4_t {
-  float16x4_t val[4];
-} float16x4x4_t;
-
-typedef struct float16x8x4_t {
-  float16x8_t val[4];
-} float16x8x4_t;
-
-typedef struct float32x2x4_t {
-  float32x2_t val[4];
-} float32x2x4_t;
-
-typedef struct float32x4x4_t {
-  float32x4_t val[4];
-} float32x4x4_t;
-
-#ifdef __aarch64__
-typedef struct float64x1x4_t {
-  float64x1_t val[4];
-} float64x1x4_t;
-
-typedef struct float64x2x4_t {
-  float64x2_t val[4];
-} float64x2x4_t;
-
-#endif
 typedef struct poly8x8x4_t {
  poly8x8_t val[4];
 } poly8x8x4_t;
@ -424,33 +125,6 @@ typedef struct poly64x2x4_t {
  poly64x2_t val[4];
 } poly64x2x4_t;

-typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
-typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
-
-typedef struct bfloat16x4x2_t {
-  bfloat16x4_t val[2];
-} bfloat16x4x2_t;
-
-typedef struct bfloat16x8x2_t {
-  bfloat16x8_t val[2];
-} bfloat16x8x2_t;
-
-typedef struct bfloat16x4x3_t {
-  bfloat16x4_t val[3];
-} bfloat16x4x3_t;
-
-typedef struct bfloat16x8x3_t {
-  bfloat16x8_t val[3];
-} bfloat16x8x3_t;
-
-typedef struct bfloat16x4x4_t {
-  bfloat16x4_t val[4];
-} bfloat16x4x4_t;
-
-typedef struct bfloat16x8x4_t {
-  bfloat16x8_t val[4];
-} bfloat16x8x4_t;
-
 #define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))

 #ifdef __LITTLE_ENDIAN__
@ -66600,6 +66274,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd32x_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd32xq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd32xq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd32x_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd32x_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #ifdef __LITTLE_ENDIAN__
 __ai __attribute__((target("v8.5a"))) float32x4_t vrnd32zq_f32(float32x4_t __p0) {
  float32x4_t __ret;
@ -66632,6 +66327,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd32z_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd32zq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd32zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd32zq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd32z_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd32z_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #ifdef __LITTLE_ENDIAN__
 __ai __attribute__((target("v8.5a"))) float32x4_t vrnd64xq_f32(float32x4_t __p0) {
  float32x4_t __ret;
@ -66664,6 +66380,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd64x_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd64xq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64xq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd64xq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd64x_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd64x_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #ifdef __LITTLE_ENDIAN__
 __ai __attribute__((target("v8.5a"))) float32x4_t vrnd64zq_f32(float32x4_t __p0) {
  float32x4_t __ret;
@ -66696,6 +66433,27 @@ __ai __attribute__((target("v8.5a"))) float32x2_t vrnd64z_f32(float32x2_t __p0)
 }
 #endif

+#ifdef __LITTLE_ENDIAN__
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  __ret = (float64x2_t) __builtin_neon_vrnd64zq_f64((int8x16_t)__p0, 42);
+  return __ret;
+}
+#else
+__ai __attribute__((target("v8.5a"))) float64x2_t vrnd64zq_f64(float64x2_t __p0) {
+  float64x2_t __ret;
+  float64x2_t __rev0;  __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
+  __ret = (float64x2_t) __builtin_neon_vrnd64zq_f64((int8x16_t)__rev0, 42);
+  __ret = __builtin_shufflevector(__ret, __ret, 1, 0);
+  return __ret;
+}
+#endif
+
+__ai __attribute__((target("v8.5a"))) float64x1_t vrnd64z_f64(float64x1_t __p0) {
+  float64x1_t __ret;
+  __ret = (float64x1_t) __builtin_neon_vrnd64z_f64((int8x8_t)__p0, 10);
+  return __ret;
+}
 #endif
 #if defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)
 #ifdef __LITTLE_ENDIAN__
--- a/lib/include/arm_sme.h
+++ b/lib/include/arm_sme.h
--- a/lib/include/arm_sme_draft_spec_subject_to_change.h
+++ b/lib/include/arm_sme_draft_spec_subject_to_change.h
@ -1,642 +0,0 @@
-/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics ------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __ARM_SME_H
-#define __ARM_SME_H
-
-#if !defined(__LITTLE_ENDIAN__)
-#error "Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h"
-#endif
-#include <arm_sve.h> 
-
-/* Function attributes */
-#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
-
-#define __aio static __inline__ __attribute__((__always_inline__, __nodebug__, __overloadable__))
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_u32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_s32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsb), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsb(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsd), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsd(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsh), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsh(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svcntsw), arm_streaming_compatible, arm_preserves_za))
-uint64_t svcntsw(void);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za128), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za16), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za32), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za64), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_vnum_za8), arm_streaming, arm_shared_za))
-void svld1_hor_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za128), arm_streaming, arm_shared_za))
-void svld1_hor_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za16), arm_streaming, arm_shared_za))
-void svld1_hor_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za32), arm_streaming, arm_shared_za))
-void svld1_hor_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za64), arm_streaming, arm_shared_za))
-void svld1_hor_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_hor_za8), arm_streaming, arm_shared_za))
-void svld1_hor_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za128), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za16), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za32), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za64), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_vnum_za8), arm_streaming, arm_shared_za))
-void svld1_ver_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za128), arm_streaming, arm_shared_za))
-void svld1_ver_za128(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za16), arm_streaming, arm_shared_za))
-void svld1_ver_za16(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za32), arm_streaming, arm_shared_za))
-void svld1_ver_za32(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za64), arm_streaming, arm_shared_za))
-void svld1_ver_za64(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svld1_ver_za8), arm_streaming, arm_shared_za))
-void svld1_ver_za8(uint64_t, uint32_t, uint64_t, svbool_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m), arm_streaming, arm_shared_za))
-void svmopa_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m), arm_streaming, arm_shared_za))
-void svmops_za32_f16_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmops_za32_bf16_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m), arm_streaming, arm_shared_za))
-void svmops_za32_f32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m), arm_streaming, arm_shared_za))
-void svmops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za128_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za128_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za128_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za128_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za128_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za128_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za128_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za128_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za128_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za128_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za128_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za128_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za16_u16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za16_bf16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za16_f16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za16_s16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za32_u32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za32_f32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za32_s32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za64_u64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za64_f64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za64_s64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za8_u8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za8_s8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_vnum_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_hor_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_hor_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_vnum_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_vnum_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za128), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za128(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za16), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za16(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za32), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za32(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za64), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za64(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svst1_ver_za8), arm_streaming, arm_shared_za, arm_preserves_za))
-void svst1_ver_za8(uint64_t, uint32_t, uint64_t, svbool_t, void *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumopa_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumops_za32_s8_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmopa_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmops_za32_u8_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_u16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_bf16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_f16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_s16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_u32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_f32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_s32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_u64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_f64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_s64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_u8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_s8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_mask_za), arm_streaming_compatible, arm_shared_za))
-void svzero_mask_za(uint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svzero_za), arm_streaming_compatible, arm_shared_za))
-void svzero_za();
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddha_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_u32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za32_s32_m), arm_streaming, arm_shared_za))
-void svaddva_za32_m(uint64_t, svbool_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_f32_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f16_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat16_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_bf16_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svbfloat16_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_f32_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svfloat32_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_s8_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za128_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za128_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za128_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za128_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_hor_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_hor_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_hor_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_hor_za16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_hor_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_hor_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_hor_za32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_hor_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_hor_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_hor_za64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_hor_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_hor_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_hor_za8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za128_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za128_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za128_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za128_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za128_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za128_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za128_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za128_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za128_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za128_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za128_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za128_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za128_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_u16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint16_t svread_ver_za16_m(svuint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_bf16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svbfloat16_t svread_ver_za16_m(svbfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_f16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat16_t svread_ver_za16_m(svfloat16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za16_s16_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint16_t svread_ver_za16_m(svint16_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_u32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint32_t svread_ver_za32_m(svuint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_f32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat32_t svread_ver_za32_m(svfloat32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za32_s32_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint32_t svread_ver_za32_m(svint32_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_u64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint64_t svread_ver_za64_m(svuint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_f64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svfloat64_t svread_ver_za64_m(svfloat64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za64_s64_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint64_t svread_ver_za64_m(svint64_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_u8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svuint8_t svread_ver_za8_m(svuint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svread_ver_za8_s8_m), arm_streaming, arm_shared_za, arm_preserves_za))
-svint8_t svread_ver_za8_m(svint8_t, svbool_t, uint64_t, uint32_t, uint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumopa_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za32_s8_m), arm_streaming, arm_shared_za))
-void svsumops_za32_m(uint64_t, svbool_t, svbool_t, svint8_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmopa_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za32_u8_m), arm_streaming, arm_shared_za))
-void svusmops_za32_m(uint64_t, svbool_t, svbool_t, svuint8_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_hor_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_hor_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za128_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za128_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_u16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_bf16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svbfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_f16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za16_s16_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za16_m(uint64_t, uint32_t, uint64_t, svbool_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_u32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_f32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za32_s32_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za32_m(uint64_t, uint32_t, uint64_t, svbool_t, svint32_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_u64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_f64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za64_s64_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za64_m(uint64_t, uint32_t, uint64_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_u8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svuint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svwrite_ver_za8_s8_m), arm_streaming, arm_shared_za))
-void svwrite_ver_za8_m(uint64_t, uint32_t, uint64_t, svbool_t, svint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m), arm_streaming, arm_shared_za))
-void svmopa_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m), arm_streaming, arm_shared_za))
-void svmops_za64_f64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_f64_m), arm_streaming, arm_shared_za))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_f64_m), arm_streaming, arm_shared_za))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svfloat64_t, svfloat64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_u64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_s64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m), arm_streaming, arm_shared_za))
-void svmops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumopa_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumops_za64_s16_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmopa_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmops_za64_u16_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddha_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddha_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_u64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svuint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svaddva_za64_s64_m), arm_streaming, arm_shared_za))
-void svaddva_za64_m(uint64_t, svbool_t, svbool_t, svint64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_s16_m), arm_streaming, arm_shared_za))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumopa_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumopa_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svsumops_za64_s16_m), arm_streaming, arm_shared_za))
-void svsumops_za64_m(uint64_t, svbool_t, svbool_t, svint16_t, svuint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmopa_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmopa_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sme_svusmops_za64_u16_m), arm_streaming, arm_shared_za))
-void svusmops_za64_m(uint64_t, svbool_t, svbool_t, svuint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svldr_vnum_za), arm_streaming_compatible, arm_shared_za))
-void svldr_vnum_za(uint32_t, uint64_t, void const *);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sme_svstr_vnum_za), arm_streaming_compatible, arm_shared_za, arm_preserves_za))
-void svstr_vnum_za(uint32_t, uint64_t, void *);
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#undef __ai
-
-#endif /* __ARM_SME_H */
--- a/lib/include/arm_sve.h
+++ b/lib/include/arm_sve.h
--- a/lib/include/arm_vector_types.h
+++ b/lib/include/arm_vector_types.h
@ -0,0 +1,345 @@
+/*===---- arm_vector_types - ARM vector type ------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__ARM_NEON_H) && !defined(__ARM_SVE_H)
+#error "This file should not be used standalone. Please include arm_neon.h or arm_sve.h instead"
+
+#endif
+#ifndef __ARM_NEON_TYPES_H
+#define __ARM_NEON_TYPES_H
+typedef float float32_t;
+typedef __fp16 float16_t;
+#ifdef __aarch64__
+typedef double float64_t;
+#endif
+
+typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;
+typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
+typedef __attribute__((neon_vector_type(4))) int16_t int16x4_t;
+typedef __attribute__((neon_vector_type(8))) int16_t int16x8_t;
+typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
+typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
+typedef __attribute__((neon_vector_type(1))) int64_t int64x1_t;
+typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
+typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;
+typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
+typedef __attribute__((neon_vector_type(4))) uint16_t uint16x4_t;
+typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;
+typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
+typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
+typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
+typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
+typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t;
+typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t;
+typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t;
+typedef __attribute__((neon_vector_type(4))) float32_t float32x4_t;
+#ifdef __aarch64__
+typedef __attribute__((neon_vector_type(1))) float64_t float64x1_t;
+typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t;
+#endif
+
+typedef struct int8x8x2_t {
+  int8x8_t val[2];
+} int8x8x2_t;
+
+typedef struct int8x16x2_t {
+  int8x16_t val[2];
+} int8x16x2_t;
+
+typedef struct int16x4x2_t {
+  int16x4_t val[2];
+} int16x4x2_t;
+
+typedef struct int16x8x2_t {
+  int16x8_t val[2];
+} int16x8x2_t;
+
+typedef struct int32x2x2_t {
+  int32x2_t val[2];
+} int32x2x2_t;
+
+typedef struct int32x4x2_t {
+  int32x4_t val[2];
+} int32x4x2_t;
+
+typedef struct int64x1x2_t {
+  int64x1_t val[2];
+} int64x1x2_t;
+
+typedef struct int64x2x2_t {
+  int64x2_t val[2];
+} int64x2x2_t;
+
+typedef struct uint8x8x2_t {
+  uint8x8_t val[2];
+} uint8x8x2_t;
+
+typedef struct uint8x16x2_t {
+  uint8x16_t val[2];
+} uint8x16x2_t;
+
+typedef struct uint16x4x2_t {
+  uint16x4_t val[2];
+} uint16x4x2_t;
+
+typedef struct uint16x8x2_t {
+  uint16x8_t val[2];
+} uint16x8x2_t;
+
+typedef struct uint32x2x2_t {
+  uint32x2_t val[2];
+} uint32x2x2_t;
+
+typedef struct uint32x4x2_t {
+  uint32x4_t val[2];
+} uint32x4x2_t;
+
+typedef struct uint64x1x2_t {
+  uint64x1_t val[2];
+} uint64x1x2_t;
+
+typedef struct uint64x2x2_t {
+  uint64x2_t val[2];
+} uint64x2x2_t;
+
+typedef struct float16x4x2_t {
+  float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t {
+  float16x8_t val[2];
+} float16x8x2_t;
+
+typedef struct float32x2x2_t {
+  float32x2_t val[2];
+} float32x2x2_t;
+
+typedef struct float32x4x2_t {
+  float32x4_t val[2];
+} float32x4x2_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x2_t {
+  float64x1_t val[2];
+} float64x1x2_t;
+
+typedef struct float64x2x2_t {
+  float64x2_t val[2];
+} float64x2x2_t;
+
+#endif
+typedef struct int8x8x3_t {
+  int8x8_t val[3];
+} int8x8x3_t;
+
+typedef struct int8x16x3_t {
+  int8x16_t val[3];
+} int8x16x3_t;
+
+typedef struct int16x4x3_t {
+  int16x4_t val[3];
+} int16x4x3_t;
+
+typedef struct int16x8x3_t {
+  int16x8_t val[3];
+} int16x8x3_t;
+
+typedef struct int32x2x3_t {
+  int32x2_t val[3];
+} int32x2x3_t;
+
+typedef struct int32x4x3_t {
+  int32x4_t val[3];
+} int32x4x3_t;
+
+typedef struct int64x1x3_t {
+  int64x1_t val[3];
+} int64x1x3_t;
+
+typedef struct int64x2x3_t {
+  int64x2_t val[3];
+} int64x2x3_t;
+
+typedef struct uint8x8x3_t {
+  uint8x8_t val[3];
+} uint8x8x3_t;
+
+typedef struct uint8x16x3_t {
+  uint8x16_t val[3];
+} uint8x16x3_t;
+
+typedef struct uint16x4x3_t {
+  uint16x4_t val[3];
+} uint16x4x3_t;
+
+typedef struct uint16x8x3_t {
+  uint16x8_t val[3];
+} uint16x8x3_t;
+
+typedef struct uint32x2x3_t {
+  uint32x2_t val[3];
+} uint32x2x3_t;
+
+typedef struct uint32x4x3_t {
+  uint32x4_t val[3];
+} uint32x4x3_t;
+
+typedef struct uint64x1x3_t {
+  uint64x1_t val[3];
+} uint64x1x3_t;
+
+typedef struct uint64x2x3_t {
+  uint64x2_t val[3];
+} uint64x2x3_t;
+
+typedef struct float16x4x3_t {
+  float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t {
+  float16x8_t val[3];
+} float16x8x3_t;
+
+typedef struct float32x2x3_t {
+  float32x2_t val[3];
+} float32x2x3_t;
+
+typedef struct float32x4x3_t {
+  float32x4_t val[3];
+} float32x4x3_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x3_t {
+  float64x1_t val[3];
+} float64x1x3_t;
+
+typedef struct float64x2x3_t {
+  float64x2_t val[3];
+} float64x2x3_t;
+
+#endif
+typedef struct int8x8x4_t {
+  int8x8_t val[4];
+} int8x8x4_t;
+
+typedef struct int8x16x4_t {
+  int8x16_t val[4];
+} int8x16x4_t;
+
+typedef struct int16x4x4_t {
+  int16x4_t val[4];
+} int16x4x4_t;
+
+typedef struct int16x8x4_t {
+  int16x8_t val[4];
+} int16x8x4_t;
+
+typedef struct int32x2x4_t {
+  int32x2_t val[4];
+} int32x2x4_t;
+
+typedef struct int32x4x4_t {
+  int32x4_t val[4];
+} int32x4x4_t;
+
+typedef struct int64x1x4_t {
+  int64x1_t val[4];
+} int64x1x4_t;
+
+typedef struct int64x2x4_t {
+  int64x2_t val[4];
+} int64x2x4_t;
+
+typedef struct uint8x8x4_t {
+  uint8x8_t val[4];
+} uint8x8x4_t;
+
+typedef struct uint8x16x4_t {
+  uint8x16_t val[4];
+} uint8x16x4_t;
+
+typedef struct uint16x4x4_t {
+  uint16x4_t val[4];
+} uint16x4x4_t;
+
+typedef struct uint16x8x4_t {
+  uint16x8_t val[4];
+} uint16x8x4_t;
+
+typedef struct uint32x2x4_t {
+  uint32x2_t val[4];
+} uint32x2x4_t;
+
+typedef struct uint32x4x4_t {
+  uint32x4_t val[4];
+} uint32x4x4_t;
+
+typedef struct uint64x1x4_t {
+  uint64x1_t val[4];
+} uint64x1x4_t;
+
+typedef struct uint64x2x4_t {
+  uint64x2_t val[4];
+} uint64x2x4_t;
+
+typedef struct float16x4x4_t {
+  float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t {
+  float16x8_t val[4];
+} float16x8x4_t;
+
+typedef struct float32x2x4_t {
+  float32x2_t val[4];
+} float32x2x4_t;
+
+typedef struct float32x4x4_t {
+  float32x4_t val[4];
+} float32x4x4_t;
+
+#ifdef __aarch64__
+typedef struct float64x1x4_t {
+  float64x1_t val[4];
+} float64x1x4_t;
+
+typedef struct float64x2x4_t {
+  float64x2_t val[4];
+} float64x2x4_t;
+
+#endif
+typedef __attribute__((neon_vector_type(4))) bfloat16_t bfloat16x4_t;
+typedef __attribute__((neon_vector_type(8))) bfloat16_t bfloat16x8_t;
+
+typedef struct bfloat16x4x2_t {
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;
+
+typedef struct bfloat16x8x2_t {
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
+typedef struct bfloat16x4x3_t {
+  bfloat16x4_t val[3];
+} bfloat16x4x3_t;
+
+typedef struct bfloat16x8x3_t {
+  bfloat16x8_t val[3];
+} bfloat16x8x3_t;
+
+typedef struct bfloat16x4x4_t {
+  bfloat16x4_t val[4];
+} bfloat16x4x4_t;
+
+typedef struct bfloat16x8x4_t {
+  bfloat16x8_t val[4];
+} bfloat16x8x4_t;
+
+#endif // __ARM_NEON_TYPES_H
--- a/lib/include/avx2intrin.h
+++ b/lib/include/avx2intrin.h
@ -15,8 +15,12 @@
 #define __AVX2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx2,no-evex512"), __min_vector_width__(128)))

 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
 /// Computes sixteen sum of absolute difference (SAD) operations on sets of
@ -1307,6 +1311,23 @@ _mm256_min_epu32(__m256i __a, __m256i __b)
  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
 }

+/// Creates a 32-bit integer mask from the most significant bit of each byte
+///    in the 256-bit integer vector in \a __a and returns the result.
+///
+/// \code{.operation}
+/// FOR i := 0 TO 31
+///   j := i*8
+///   result[i] := __a[j+7]
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
+///
+/// \param __a
+///    A 256-bit integer vector containing the source bytes.
+/// \returns The 32-bit integer mask.
 static __inline__ int __DEFAULT_FN_ATTRS256
 _mm256_movemask_epi8(__m256i __a)
 {
@ -2962,7 +2983,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
 ///    A pointer to the 32-byte aligned memory containing the vector to load.
 /// \returns A 256-bit integer vector loaded from memory.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(__m256i const *__V)
+_mm256_stream_load_si256(const void *__V)
 {
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
--- a/lib/include/avx512bf16intrin.h
+++ b/lib/include/avx512bf16intrin.h
@ -20,10 +20,11 @@ typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
 typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));

 #define __DEFAULT_FN_ATTRS512 \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16,evex512"), \
                 __min_vector_width__(512)))
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bf16,no-evex512")))

 /// Convert One BF16 Data to One Single Float Data.
 ///
--- a/lib/include/avx512bitalgintrin.h
+++ b/lib/include/avx512bitalgintrin.h
@ -15,7 +15,10 @@
 #define __AVX512BITALGINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bitalg,evex512"),                           \
+                 __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_popcnt_epi16(__m512i __A)
--- a/lib/include/avx512bwintrin.h
+++ b/lib/include/avx512bwintrin.h
@ -18,8 +18,12 @@ typedef unsigned int __mmask32;
 typedef unsigned long long __mmask64;

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+#define __DEFAULT_FN_ATTRS512                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,evex512"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,no-evex512")))

 static __inline __mmask32 __DEFAULT_FN_ATTRS
 _knot_mask32(__mmask32 __M)
@ -27,9 +31,7 @@ _knot_mask32(__mmask32 __M)
  return __builtin_ia32_knotsi(__M);
 }

-static __inline __mmask64 __DEFAULT_FN_ATTRS
-_knot_mask64(__mmask64 __M)
-{
+static __inline __mmask64 __DEFAULT_FN_ATTRS _knot_mask64(__mmask64 __M) {
  return __builtin_ia32_knotdi(__M);
 }

@ -39,9 +41,8 @@ _kand_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kand_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kand_mask64(__mmask64 __A,
+                                                            __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B);
 }

@ -51,9 +52,8 @@ _kandn_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kandn_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kandn_mask64(__mmask64 __A,
+                                                             __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B);
 }

@ -63,9 +63,8 @@ _kor_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kor_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kor_mask64(__mmask64 __A,
+                                                           __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B);
 }

@ -75,9 +74,8 @@ _kxnor_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kxnor_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxnor_mask64(__mmask64 __A,
+                                                             __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B);
 }

@ -87,9 +85,8 @@ _kxor_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kxor_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxor_mask64(__mmask64 __A,
+                                                            __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
 }

@ -112,14 +109,12 @@ _kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
 }

@ -148,14 +143,12 @@ _ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
 }

 static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
+_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) {
  return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
 }

@ -171,9 +164,8 @@ _kadd_mask32(__mmask32 __A, __mmask32 __B)
  return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kadd_mask64(__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kadd_mask64(__mmask64 __A,
+                                                            __mmask64 __B) {
  return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B);
 }

@ -214,8 +206,7 @@ _load_mask32(__mmask32 *__A) {
  return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A);
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_load_mask64(__mmask64 *__A) {
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _load_mask64(__mmask64 *__A) {
  return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A);
 }

@ -224,8 +215,8 @@ _store_mask32(__mmask32 *__A, __mmask32 __B) {
  *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B);
 }

-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask64(__mmask64 *__A, __mmask64 __B) {
+static __inline__ void __DEFAULT_FN_ATTRS _store_mask64(__mmask64 *__A,
+                                                        __mmask64 __B) {
  *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B);
 }

@ -1714,9 +1705,8 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
                                              (__v64qi) _mm512_setzero_si512());
 }

-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
-{
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd(__mmask64 __A,
+                                                               __mmask64 __B) {
  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
                (__mmask64) __B);
 }
--- a/lib/include/avx512cdintrin.h
+++ b/lib/include/avx512cdintrin.h
@ -15,7 +15,9 @@
 #define __AVX512CDINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512cd,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_conflict_epi64 (__m512i __A)
--- a/lib/include/avx512dqintrin.h
+++ b/lib/include/avx512dqintrin.h
@ -15,8 +15,10 @@
 #define __AVX512DQINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq,evex512"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512dq,no-evex512")))

 static __inline __mmask8 __DEFAULT_FN_ATTRS
 _knot_mask8(__mmask8 __M)
--- a/lib/include/avx512fintrin.h
+++ b/lib/include/avx512fintrin.h
@ -167,9 +167,13 @@ typedef enum
 } _MM_MANTISSA_SIGN_ENUM;

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f,evex512"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,no-evex512"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,no-evex512")))

 /* Create vectors with repeated elements */

--- a/lib/include/avx512fp16intrin.h
+++ b/lib/include/avx512fp16intrin.h
@ -22,13 +22,15 @@ typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS512                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
-                 __min_vector_width__(512)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
 #define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"),     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(128)))

 static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
--- a/lib/include/avx512ifmaintrin.h
+++ b/lib/include/avx512ifmaintrin.h
@ -15,7 +15,9 @@
 #define __IFMAINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512ifma,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
--- a/lib/include/avx512ifmavlintrin.h
+++ b/lib/include/avx512ifmavlintrin.h
@ -15,8 +15,14 @@
 #define __IFMAVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512ifma,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512ifma,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(256)))

 #define _mm_madd52hi_epu64(X, Y, Z)                                            \
  ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y),            \
--- a/lib/include/avx512pfintrin.h
+++ b/lib/include/avx512pfintrin.h
@ -14,9 +14,6 @@
 #ifndef __AVX512PFINTRIN_H
 #define __AVX512PFINTRIN_H

-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
-
 #define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                             (void const *)(addr), (int)(scale), \
@ -92,6 +89,4 @@
  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
                              (void *)(addr), (int)(scale), (int)(hint))

-#undef __DEFAULT_FN_ATTRS
-
 #endif
--- a/lib/include/avx512vbmi2intrin.h
+++ b/lib/include/avx512vbmi2intrin.h
@ -15,7 +15,7 @@
 #define __AVX512VBMI2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2,evex512"), __min_vector_width__(512)))


 static __inline__ __m512i __DEFAULT_FN_ATTRS
--- a/lib/include/avx512vbmiintrin.h
+++ b/lib/include/avx512vbmiintrin.h
@ -15,8 +15,9 @@
 #define __VBMIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
-
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vbmi,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
--- a/lib/include/avx512vbmivlintrin.h
+++ b/lib/include/avx512vbmivlintrin.h
@ -15,9 +15,14 @@
 #define __VBMIVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
-
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vbmi,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vbmi,avx512vl,no-evex512"),                 \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
--- a/lib/include/avx512vlbf16intrin.h
+++ b/lib/include/avx512vlbf16intrin.h
@ -17,10 +17,12 @@

 #define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
+                 __target__("avx512vl,avx512bf16,no-evex512"),                 \
+                 __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
+                 __target__("avx512vl,avx512bf16,no-evex512"),                 \
+                 __min_vector_width__(256)))

 /// Convert Two Packed Single Data to One Packed BF16 Data.
 ///
--- a/lib/include/avx512vlbitalgintrin.h
+++ b/lib/include/avx512vlbitalgintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLBITALGINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bitalg,no-evex512"),               \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bitalg,no-evex512"),               \
+                 __min_vector_width__(256)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_popcnt_epi16(__m256i __A)
--- a/lib/include/avx512vlbwintrin.h
+++ b/lib/include/avx512vlbwintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLBWINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bw,no-evex512"),                   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512bw,no-evex512"),                   \
+                 __min_vector_width__(256)))

 /* Integer compare */

--- a/lib/include/avx512vlcdintrin.h
+++ b/lib/include/avx512vlcdintrin.h
@ -14,9 +14,14 @@
 #define __AVX512VLCDINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
-
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512cd,no-evex512"),                   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512cd,no-evex512"),                   \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmb_epi64 (__mmask8 __A)
--- a/lib/include/avx512vldqintrin.h
+++ b/lib/include/avx512vldqintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLDQINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512dq,no-evex512"),                   \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512dq,no-evex512"),                   \
+                 __min_vector_width__(256)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mullo_epi64 (__m256i __A, __m256i __B) {
--- a/lib/include/avx512vlfp16intrin.h
+++ b/lib/include/avx512vlfp16intrin.h
@ -19,11 +19,11 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16, avx512vl"),                           \
+                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(256)))
 #define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
-                 __target__("avx512fp16, avx512vl"),                           \
+                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
                 __min_vector_width__(128)))

 static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
--- a/lib/include/avx512vlintrin.h
+++ b/lib/include/avx512vlintrin.h
@ -14,8 +14,14 @@
 #ifndef __AVX512VLINTRIN_H
 #define __AVX512VLINTRIN_H

-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,no-evex512"),                            \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,no-evex512"),                            \
+                 __min_vector_width__(256)))

 typedef short __v2hi __attribute__((__vector_size__(4)));
 typedef char __v4qi __attribute__((__vector_size__(4)));
--- a/lib/include/avx512vlvbmi2intrin.h
+++ b/lib/include/avx512vlvbmi2intrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLVBMI2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vbmi2,no-evex512"),                \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vbmi2,no-evex512"),                \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
--- a/lib/include/avx512vlvnniintrin.h
+++ b/lib/include/avx512vlvnniintrin.h
@ -15,8 +15,14 @@
 #define __AVX512VLVNNIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vnni,no-evex512"),                 \
+                 __min_vector_width__(256)))

 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
--- a/lib/include/avx512vlvp2intersectintrin.h
+++ b/lib/include/avx512vlvp2intersectintrin.h
@ -29,11 +29,13 @@
 #define _AVX512VLVP2INTERSECT_H

 #define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__,  __target__("avx512vl,avx512vp2intersect"), \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vp2intersect,no-evex512"),         \
                 __min_vector_width__(128)))

 #define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__,  __target__("avx512vl,avx512vp2intersect"), \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vl,avx512vp2intersect,no-evex512"),         \
                 __min_vector_width__(256)))
 /// Store, in an even/odd pair of mask registers, the indicators of the
 /// locations of value matches between dwords in operands __a and __b.
--- a/lib/include/avx512vnniintrin.h
+++ b/lib/include/avx512vnniintrin.h
@ -15,8 +15,9 @@
 #define __AVX512VNNIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
-
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vnni,evex512"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
--- a/lib/include/avx512vp2intersectintrin.h
+++ b/lib/include/avx512vp2intersectintrin.h
@ -29,7 +29,8 @@
 #define _AVX512VP2INTERSECT_H

 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__,  __target__("avx512vp2intersect"), \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vp2intersect,evex512"),                     \
                 __min_vector_width__(512)))

 /// Store, in an even/odd pair of mask registers, the indicators of the
--- a/lib/include/avx512vpopcntdqintrin.h
+++ b/lib/include/avx512vpopcntdqintrin.h
@ -17,7 +17,9 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vpopcntdq,evex512"),                        \
+                 __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
--- a/lib/include/avx512vpopcntdqvlintrin.h
+++ b/lib/include/avx512vpopcntdqvlintrin.h
@ -17,9 +17,13 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS128                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vpopcntdq,avx512vl,no-evex512"),            \
+                 __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256                                                  \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512vpopcntdq,avx512vl,no-evex512"),            \
+                 __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_popcnt_epi64(__m128i __A) {
--- a/lib/include/avxintrin.h
+++ b/lib/include/avxintrin.h
@ -50,8 +50,12 @@ typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
 #endif

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
+                 __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
+                 __min_vector_width__(128)))

 /* Arithmetic */
 /// Adds two 256-bit vectors of [4 x double].
@ -3563,7 +3567,7 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 /// \param __b
 ///    A 256-bit integer vector containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(__m256i *__a, __m256i __b)
+_mm256_stream_si256(void *__a, __m256i __b)
 {
  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
@ -3583,7 +3587,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(double *__a, __m256d __b)
+_mm256_stream_pd(void *__a, __m256d __b)
 {
  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
@ -3604,7 +3608,7 @@ _mm256_stream_pd(double *__a, __m256d __b)
 /// \param __a
 ///    A 256-bit vector of [8 x float] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(float *__p, __m256 __a)
+_mm256_stream_ps(void *__p, __m256 __a)
 {
  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
--- a/lib/include/bmiintrin.h
+++ b/lib/include/bmiintrin.h
@ -19,18 +19,17 @@
   to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

-#define _tzcnt_u16(a)     (__tzcnt_u16((a)))
-
 /// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 16-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 16-bit integer containing the number of trailing zero
 ///    bits in the operand.
+/// \see _tzcnt_u16
 static __inline__ unsigned short __RELAXED_FN_ATTRS
 __tzcnt_u16(unsigned short __X)
 {
@ -41,13 +40,30 @@ __tzcnt_u16(unsigned short __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// \code
+/// unsigned short _tzcnt_u16(unsigned short __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+///    bits in the operand.
+/// \see __tzcnt_u16
+#define _tzcnt_u16 __tzcnt_u16
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 32-bit integer containing the number of trailing zero
 ///    bits in the operand.
-/// \see _mm_tzcnt_32
+/// \see { _mm_tzcnt_32 _tzcnt_u32 }
 static __inline__ unsigned int __RELAXED_FN_ATTRS
 __tzcnt_u32(unsigned int __X)
 {
@ -58,20 +74,35 @@ __tzcnt_u32(unsigned int __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An 32-bit integer containing the number of trailing zero bits in
+/// \returns A 32-bit integer containing the number of trailing zero bits in
 ///    the operand.
-/// \see __tzcnt_u32
+/// \see { __tzcnt_u32 _tzcnt_u32 }
 static __inline__ int __RELAXED_FN_ATTRS
 _mm_tzcnt_32(unsigned int __X)
 {
  return (int)__builtin_ia32_tzcnt_u32(__X);
 }

-#define _tzcnt_u32(a)     (__tzcnt_u32((a)))
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _tzcnt_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+/// \see { _mm_tzcnt_32 __tzcnt_u32 }
+#define _tzcnt_u32 __tzcnt_u32

 #ifdef __x86_64__

@ -79,13 +110,13 @@ _mm_tzcnt_32(unsigned int __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
 /// \returns An unsigned 64-bit integer containing the number of trailing zero
 ///    bits in the operand.
-/// \see _mm_tzcnt_64
+/// \see { _mm_tzcnt_64 _tzcnt_u64 }
 static __inline__ unsigned long long __RELAXED_FN_ATTRS
 __tzcnt_u64(unsigned long long __X)
 {
@ -96,20 +127,35 @@ __tzcnt_u64(unsigned long long __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+/// This intrinsic corresponds to the \c TZCNT instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose trailing zeros are to be counted.
 /// \returns An 64-bit integer containing the number of trailing zero bits in
 ///    the operand.
-/// \see __tzcnt_u64
+/// \see { __tzcnt_u64 _tzcnt_u64 }
 static __inline__ long long __RELAXED_FN_ATTRS
 _mm_tzcnt_64(unsigned long long __X)
 {
  return (long long)__builtin_ia32_tzcnt_u64(__X);
 }

-#define _tzcnt_u64(a)     (__tzcnt_u64((a)))
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _tzcnt_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TZCNT instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+/// \see { _mm_tzcnt_64 __tzcnt_u64
+#define _tzcnt_u64 __tzcnt_u64

 #endif /* __x86_64__ */

@ -121,21 +167,12 @@ _mm_tzcnt_64(unsigned long long __X)
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))

-#define _andn_u32(a, b)   (__andn_u32((a), (b)))
-
-/* _bextr_u32 != __bextr_u32 */
-#define _blsi_u32(a)      (__blsi_u32((a)))
-
-#define _blsmsk_u32(a)    (__blsmsk_u32((a)))
-
-#define _blsr_u32(a)      (__blsr_u32((a)))
-
 /// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+/// This intrinsic corresponds to the \c ANDN instruction.
 ///
 /// \param __X
 ///    An unsigned integer containing one of the operands.
@ -143,19 +180,40 @@ _mm_tzcnt_64(unsigned long long __X)
 ///    An unsigned integer containing one of the operands.
 /// \returns An unsigned integer containing the bitwise AND of the second
 ///    operand with the one's complement of the first operand.
+/// \see _andn_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __andn_u32(unsigned int __X, unsigned int __Y)
 {
  return ~__X & __Y;
 }

+/// Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _andn_u32(unsigned int __X, unsigned int __Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned integer containing one of the operands.
+/// \param __Y
+///    An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+/// \see __andn_u32
+#define _andn_u32 __andn_u32
+
 /* AMD-specified, double-leading-underscore version of BEXTR */
 /// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@ -178,7 +236,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@ -203,7 +261,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be extracted.
@ -224,33 +282,89 @@ _bextr2_u32(unsigned int __X, unsigned int __Y) {
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+/// This intrinsic corresponds to the \c BLSI instruction.
 ///
 /// \param __X
 ///    An unsigned integer whose bits are to be cleared.
 /// \returns An unsigned integer containing the result of clearing the bits from
 ///    the source operand.
+/// \see _blsi_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __blsi_u32(unsigned int __X)
 {
  return __X & -__X;
 }

+/// Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned int _blsi_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+///    the source operand.
+/// \see __blsi_u32
+#define _blsi_u32 __blsi_u32
+
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+/// \see _blsmsk_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
 /// Creates a mask whose bits are set to 1, using bit 0 up to and
 ///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+/// \code
+/// unsigned int _blsmsk_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
 ///
 /// \param __X
 ///    An unsigned integer used to create the mask.
 /// \returns An unsigned integer containing the newly created mask.
+/// \see __blsmsk_u32
+#define _blsmsk_u32 __blsmsk_u32
+
+/// Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+///    operand.
+/// \see _blsr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsmsk_u32(unsigned int __X)
+__blsr_u32(unsigned int __X)
 {
-  return __X ^ (__X - 1);
+  return __X & (__X - 1);
 }

 /// Clears the least significant bit that is set to 1 in the source
@ -258,35 +372,27 @@ __blsmsk_u32(unsigned int __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+/// \code
+/// unsigned int _bls4_u32(unsigned int __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
 ///
 /// \param __X
 ///    An unsigned integer containing the operand to be cleared.
 /// \returns An unsigned integer containing the result of clearing the source
 ///    operand.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsr_u32(unsigned int __X)
-{
-  return __X & (__X - 1);
-}
+/// \see __blsr_u32
+#define _blsr_u32 __blsr_u32

 #ifdef __x86_64__

-#define _andn_u64(a, b)   (__andn_u64((a), (b)))
-
-/* _bextr_u64 != __bextr_u64 */
-#define _blsi_u64(a)      (__blsi_u64((a)))
-
-#define _blsmsk_u64(a)    (__blsmsk_u64((a)))
-
-#define _blsr_u64(a)      (__blsr_u64((a)))
-
 /// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+/// This intrinsic corresponds to the \c ANDN instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer containing one of the operands.
@ -294,19 +400,41 @@ __blsr_u32(unsigned int __X)
 ///    An unsigned 64-bit integer containing one of the operands.
 /// \returns An unsigned 64-bit integer containing the bitwise AND of the second
 ///    operand with the one's complement of the first operand.
+/// \see _andn_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __andn_u64 (unsigned long long __X, unsigned long long __Y)
 {
  return ~__X & __Y;
 }

+/// Performs a bitwise AND of the second operand with the one's
+///    complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _andn_u64(unsigned long long __X,
+///                              unsigned long long __Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ANDN instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+///    An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+///    operand with the one's complement of the first operand.
+/// \see __andn_u64
+#define _andn_u64 __andn_u64
+
 /* AMD-specified, double-leading-underscore version of BEXTR */
 /// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@ -329,7 +457,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@ -354,7 +482,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+/// This intrinsic corresponds to the \c BEXTR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be extracted.
@ -375,33 +503,89 @@ _bextr2_u64(unsigned long long __X, unsigned long long __Y) {
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+/// This intrinsic corresponds to the \c BLSI instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer whose bits are to be cleared.
 /// \returns An unsigned 64-bit integer containing the result of clearing the
 ///    bits from the source operand.
+/// \see _blsi_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsi_u64(unsigned long long __X)
 {
  return __X & -__X;
 }

+/// Clears all bits in the source except for the least significant bit
+///    containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned long long _blsi_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSI instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    bits from the source operand.
+/// \see __blsi_u64
+#define _blsi_u64 __blsi_u64
+
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+///    including the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
+/// \see _blsmsk_u64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
 /// Creates a mask whose bits are set to 1, using bit 0 up to and
 ///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+/// \code
+/// unsigned long long _blsmsk_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSMSK instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer used to create the mask.
 /// \returns An unsigned 64-bit integer containing the newly created mask.
+/// \see __blsmsk_u64
+#define _blsmsk_u64 __blsmsk_u64
+
+/// Clears the least significant bit that is set to 1 in the source
+///    operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+///    source operand.
+/// \see _blsr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsmsk_u64(unsigned long long __X)
+__blsr_u64(unsigned long long __X)
 {
-  return __X ^ (__X - 1);
+  return __X & (__X - 1);
 }

 /// Clears the least significant bit that is set to 1 in the source
@ -409,17 +593,18 @@ __blsmsk_u64(unsigned long long __X)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+/// \code
+/// unsigned long long _blsr_u64(unsigned long long __X);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c BLSR instruction.
 ///
 /// \param __X
 ///    An unsigned 64-bit integer containing the operand to be cleared.
 /// \returns An unsigned 64-bit integer containing the result of clearing the
 ///    source operand.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsr_u64(unsigned long long __X)
-{
-  return __X & (__X - 1);
-}
+/// \see __blsr_u64
+#define _blsr_u64 __blsr_u64

 #endif /* __x86_64__ */

--- a/lib/include/cuda_wrappers/bits/basic_string.h
+++ b/lib/include/cuda_wrappers/bits/basic_string.h
@ -0,0 +1,9 @@
+// CUDA headers define __noinline__ which interferes with libstdc++'s use of
+// `__attribute((__noinline__))`. In order to avoid compilation error,
+// temporarily unset __noinline__ when we include affected libstdc++ header.
+
+#pragma push_macro("__noinline__")
+#undef __noinline__
+#include_next "bits/basic_string.h"
+
+#pragma pop_macro("__noinline__")
--- a/lib/include/cuda_wrappers/bits/basic_string.tcc
+++ b/lib/include/cuda_wrappers/bits/basic_string.tcc
@ -0,0 +1,9 @@
+// CUDA headers define __noinline__ which interferes with libstdc++'s use of
+// `__attribute((__noinline__))`. In order to avoid compilation error,
+// temporarily unset __noinline__ when we include affected libstdc++ header.
+
+#pragma push_macro("__noinline__")
+#undef __noinline__
+#include_next "bits/basic_string.tcc"
+
+#pragma pop_macro("__noinline__")
--- a/lib/include/emmintrin.h
+++ b/lib/include/emmintrin.h
@ -50,11 +50,11 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
-                 __min_vector_width__(128)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS_MMX                                                 \
-  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
-                 __min_vector_width__(64)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))

 /// Adds lower double-precision values in both operands and returns the
 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
@ -3945,7 +3945,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
                                                        __m128d __a) {
  __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
 }
@ -3963,7 +3963,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
 ///    A pointer to the 128-bit aligned memory location used to store the value.
 /// \param __a
 ///    A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
                                                           __m128i __a) {
  __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
 }
@ -3983,8 +3983,8 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
 ///    A 32-bit integer containing the value to be stored.
 static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si32(int *__p, int __a) {
-  __builtin_ia32_movnti(__p, __a);
+    _mm_stream_si32(void *__p, int __a) {
+  __builtin_ia32_movnti((int *)__p, __a);
 }

 #ifdef __x86_64__
@ -4003,8 +4003,8 @@ static __inline__ void
 ///    A 64-bit integer containing the value to be stored.
 static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-    _mm_stream_si64(long long *__p, long long __a) {
-  __builtin_ia32_movnti64(__p, __a);
+    _mm_stream_si64(void *__p, long long __a) {
+  __builtin_ia32_movnti64((long long *)__p, __a);
 }
 #endif

--- a/lib/include/gfniintrin.h
+++ b/lib/include/gfniintrin.h
@ -15,19 +15,36 @@
 #define __GFNIINTRIN_H

 /* Default attributes for simple form (no masking). */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("gfni,no-evex512"), __min_vector_width__(128)))

 /* Default attributes for YMM unmasked form. */
-#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS_Y                                                   \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx,gfni,no-evex512"),                            \
+                 __min_vector_width__(256)))

 /* Default attributes for ZMM unmasked forms. */
-#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS_Z                                                   \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,evex512,gfni"),                           \
+                 __min_vector_width__(512)))
 /* Default attributes for ZMM masked forms. */
-#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS_Z_MASK                                              \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,evex512,gfni"),                          \
+                 __min_vector_width__(512)))

 /* Default attributes for VLX masked forms. */
-#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS_VL128                                               \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,avx512vl,gfni,no-evex512"),              \
+                 __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_VL256                                               \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512bw,avx512vl,gfni,no-evex512"),              \
+                 __min_vector_width__(256)))

 #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
--- a/lib/include/ia32intrin.h
+++ b/lib/include/ia32intrin.h
--- a/lib/include/immintrin.h
+++ b/lib/include/immintrin.h
@ -291,11 +291,13 @@

 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__RDPID__)
-/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
+/// Reads the value of the IA32_TSC_AUX MSR (0xc0000103).
 ///
 /// \headerfile <immintrin.h>
 ///
 /// This intrinsic corresponds to the <c> RDPID </c> instruction.
+///
+/// \returns The 32-bit contents of the MSR.
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
 _rdpid_u32(void) {
  return __builtin_ia32_rdpid();
@ -488,6 +490,15 @@ _writegsbase_u64(unsigned long long __V)
 * field inside of it.
 */

+/// Load a 16-bit value from memory and swap its bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the 16-bit value to load.
+/// \returns The byte-swapped value.
 static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _loadbe_i16(void const * __P) {
  struct __loadu_i16 {
@ -496,6 +507,16 @@ _loadbe_i16(void const * __P) {
  return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
 }

+/// Swap the bytes of a 16-bit value and store it to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the memory for storing the swapped value.
+/// \param __D
+///    The 16-bit value to be byte-swapped.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _storebe_i16(void * __P, short __D) {
  struct __storeu_i16 {
@ -504,6 +525,15 @@ _storebe_i16(void * __P, short __D) {
  ((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
 }

+/// Load a 32-bit value from memory and swap its bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the 32-bit value to load.
+/// \returns The byte-swapped value.
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _loadbe_i32(void const * __P) {
  struct __loadu_i32 {
@ -512,6 +542,16 @@ _loadbe_i32(void const * __P) {
  return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
 }

+/// Swap the bytes of a 32-bit value and store it to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the memory for storing the swapped value.
+/// \param __D
+///    The 32-bit value to be byte-swapped.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _storebe_i32(void * __P, int __D) {
  struct __storeu_i32 {
@ -521,6 +561,15 @@ _storebe_i32(void * __P, int __D) {
 }

 #ifdef __x86_64__
+/// Load a 64-bit value from memory and swap its bytes.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the 64-bit value to load.
+/// \returns The byte-swapped value.
 static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _loadbe_i64(void const * __P) {
  struct __loadu_i64 {
@ -529,6 +578,16 @@ _loadbe_i64(void const * __P) {
  return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
 }

+/// Swap the bytes of a 64-bit value and store it to memory.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the MOVBE instruction.
+///
+/// \param __P
+///    A pointer to the memory for storing the swapped value.
+/// \param __D
+///    The 64-bit value to be byte-swapped.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
 _storebe_i64(void * __P, long long __D) {
  struct __storeu_i64 {
@ -578,9 +637,13 @@ _storebe_i64(void * __P, long long __D) {
 #include <cetintrin.h>
 #endif

-/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
- * whereas others are also available at all times. */
+/* Intrinsics inside adcintrin.h are available at all times. */
+#include <adcintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__ADX__)
 #include <adxintrin.h>
+#endif

 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
    defined(__RDSEED__)
--- a/lib/include/intrin.h
+++ b/lib/include/intrin.h
@ -572,6 +572,22 @@ unsigned char __readx18byte(unsigned long offset);
 unsigned short __readx18word(unsigned long offset);
 unsigned long __readx18dword(unsigned long offset);
 unsigned __int64 __readx18qword(unsigned long offset);
+
+double _CopyDoubleFromInt64(__int64);
+float _CopyFloatFromInt32(__int32);
+__int32 _CopyInt32FromFloat(float);
+__int64 _CopyInt64FromDouble(double);
+
+unsigned int _CountLeadingOnes(unsigned long);
+unsigned int _CountLeadingOnes64(unsigned __int64);
+unsigned int _CountLeadingSigns(long);
+unsigned int _CountLeadingSigns64(__int64);
+unsigned int _CountLeadingZeros(unsigned long);
+unsigned int _CountLeadingZeros64(unsigned _int64);
+unsigned int _CountOneBits(unsigned long);
+unsigned int _CountOneBits64(unsigned __int64);
+
+void __cdecl __prefetch(void *);
 #endif

 /*----------------------------------------------------------------------------*\
--- a/lib/include/larchintrin.h
+++ b/lib/include/larchintrin.h
@ -156,7 +156,7 @@ extern __inline unsigned char
  return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
 }

-extern __inline unsigned char
+extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __iocsrrd_h(unsigned int _1) {
  return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
@ -228,6 +228,18 @@ extern __inline void
  ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
 #endif

+#define __frecipe_s(/*float*/ _1)                                              \
+  (float)__builtin_loongarch_frecipe_s((float)_1)
+
+#define __frecipe_d(/*double*/ _1)                                             \
+  (double)__builtin_loongarch_frecipe_d((double)_1)
+
+#define __frsqrte_s(/*float*/ _1)                                              \
+  (float)__builtin_loongarch_frsqrte_s((float)_1)
+
+#define __frsqrte_d(/*double*/ _1)                                             \
+  (double)__builtin_loongarch_frsqrte_d((double)_1)
+
 #ifdef __cplusplus
 }
 #endif
--- a/lib/include/lasxintrin.h
+++ b/lib/include/lasxintrin.h
--- a/lib/include/limits.h
+++ b/lib/include/limits.h
@ -66,10 +66,8 @@

 #define CHAR_BIT  __CHAR_BIT__

-/* C2x 5.2.4.2.1 */
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+/* C23 5.2.4.2.1 */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 #define BOOL_WIDTH   __BOOL_WIDTH__
 #define CHAR_WIDTH   CHAR_BIT
 #define SCHAR_WIDTH  CHAR_BIT
--- a/lib/include/llvm_libc_wrappers/assert.h
+++ b/lib/include/llvm_libc_wrappers/assert.h
@ -0,0 +1,34 @@
+//===-- Wrapper for C standard assert.h declarations on the GPU ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLANG_LLVM_LIBC_WRAPPERS_ASSERT_H__
+#define __CLANG_LLVM_LIBC_WRAPPERS_ASSERT_H__
+
+#if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__)
+#error "This file is for GPU offloading compilation only"
+#endif
+
+#include_next <assert.h>
+
+#if __has_include(<llvm-libc-decls/assert.h>)
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define __LIBC_ATTRS __attribute__((device))
+#endif
+
+#pragma omp begin declare target
+
+#include <llvm-libc-decls/assert.h>
+
+#pragma omp end declare target
+
+#undef __LIBC_ATTRS
+
+#endif
+
+#endif // __CLANG_LLVM_LIBC_WRAPPERS_ASSERT_H__
--- a/lib/include/llvm_libc_wrappers/ctype.h
+++ b/lib/include/llvm_libc_wrappers/ctype.h
@ -13,8 +13,19 @@
 #error "This file is for GPU offloading compilation only"
 #endif

+// The GNU headers like to define 'toupper' and 'tolower' redundantly. This is
+// necessary to prevent it from doing that and remapping our implementation.
+#if (defined(__NVPTX__) || defined(__AMDGPU__)) && defined(__GLIBC__)
+#pragma push_macro("__USE_EXTERN_INLINES")
+#undef __USE_EXTERN_INLINES
+#endif
+
 #include_next <ctype.h>

+#if (defined(__NVPTX__) || defined(__AMDGPU__)) && defined(__GLIBC__)
+#pragma pop_macro("__USE_EXTERN_INLINES")
+#endif
+
 #if __has_include(<llvm-libc-decls/ctype.h>)

 #if defined(__HIP__) || defined(__CUDA__)
@ -26,6 +37,7 @@

 #pragma push_macro("isalnum")
 #pragma push_macro("isalpha")
+#pragma push_macro("isascii")
 #pragma push_macro("isblank")
 #pragma push_macro("iscntrl")
 #pragma push_macro("isdigit")
@ -36,11 +48,13 @@
 #pragma push_macro("isspace")
 #pragma push_macro("isupper")
 #pragma push_macro("isxdigit")
+#pragma push_macro("toascii")
 #pragma push_macro("tolower")
 #pragma push_macro("toupper")

 #undef isalnum
 #undef isalpha
+#undef isascii
 #undef iscntrl
 #undef isdigit
 #undef islower
@ -51,6 +65,7 @@
 #undef isupper
 #undef isblank
 #undef isxdigit
+#undef toascii
 #undef tolower
 #undef toupper

@ -64,6 +79,7 @@
 #if !defined(__NVPTX__) && !defined(__AMDGPU__)
 #pragma pop_macro("isalnum")
 #pragma pop_macro("isalpha")
+#pragma pop_macro("isascii")
 #pragma pop_macro("isblank")
 #pragma pop_macro("iscntrl")
 #pragma pop_macro("isdigit")
@ -74,6 +90,7 @@
 #pragma pop_macro("isspace")
 #pragma pop_macro("isupper")
 #pragma pop_macro("isxdigit")
+#pragma pop_macro("toascii")
 #pragma pop_macro("tolower")
 #pragma pop_macro("toupper")
 #endif
--- a/lib/include/llvm_libc_wrappers/stdio.h
+++ b/lib/include/llvm_libc_wrappers/stdio.h
@ -6,21 +6,58 @@
 //
 //===----------------------------------------------------------------------===//

-#ifndef __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
-#define __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
-
 #if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__)
 #error "This file is for GPU offloading compilation only"
 #endif

 #include_next <stdio.h>

+// In some old versions of glibc, other standard headers sometimes define
+// special macros (e.g., __need_FILE) before including stdio.h to cause stdio.h
+// to produce special definitions.  Future includes of stdio.h when those
+// special macros are undefined are expected to produce the normal definitions
+// from stdio.h.
+//
+// We do not apply our include guard (__CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__)
+// unconditionally to the above include_next.  Otherwise, after an occurrence of
+// the first glibc stdio.h use case described above, the include_next would be
+// skipped for remaining includes of stdio.h, leaving required symbols
+// undefined.
+//
+// We make the following assumptions to handle all use cases:
+//
+// 1. If the above include_next produces special glibc definitions, then (a) it
+//    does not produce the normal definitions that we must intercept below, (b)
+//    the current file was included from a glibc header that already defined
+//    __GLIBC__ (usually by including glibc's <features.h>), and (c) the above
+//    include_next does not define _STDIO_H.  In that case, we skip the rest of
+//    the current file and don't guard against future includes.
+// 2. If the above include_next produces the normal stdio.h definitions, then
+//    either (a) __GLIBC__ is not defined because C headers are from some other
+//    libc implementation or (b) the above include_next defines _STDIO_H to
+//    prevent the above include_next from having any effect in the future.
+#if !defined(__GLIBC__) || defined(_STDIO_H)
+
+#ifndef __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
+#define __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
+
 #if __has_include(<llvm-libc-decls/stdio.h>)

 #if defined(__HIP__) || defined(__CUDA__)
 #define __LIBC_ATTRS __attribute__((device))
 #endif

+// Some headers provide these as macros. Temporarily undefine them so they do
+// not conflict with any definitions for the GPU.
+
+#pragma push_macro("stdout")
+#pragma push_macro("stdin")
+#pragma push_macro("stderr")
+
+#undef stdout
+#undef stderr
+#undef stdin
+
 #pragma omp begin declare target

 #include <llvm-libc-decls/stdio.h>
@ -29,6 +66,15 @@

 #undef __LIBC_ATTRS

+// Restore the original macros when compiling on the host.
+#if !defined(__NVPTX__) && !defined(__AMDGPU__)
+#pragma pop_macro("stdout")
+#pragma pop_macro("stderr")
+#pragma pop_macro("stdin")
+#endif
+
 #endif

 #endif // __CLANG_LLVM_LIBC_WRAPPERS_STDIO_H__
+
+#endif
--- a/lib/include/llvm_libc_wrappers/stdlib.h
+++ b/lib/include/llvm_libc_wrappers/stdlib.h
@ -23,8 +23,11 @@

 #pragma omp begin declare target

-// The LLVM C library uses this type so we forward declare it.
+// The LLVM C library uses these named types so we forward declare them.
 typedef void (*__atexithandler_t)(void);
+typedef int (*__bsearchcompare_t)(const void *, const void *);
+typedef int (*__qsortcompare_t)(const void *, const void *);
+typedef int (*__qsortrcompare_t)(const void *, const void *, void *);

 // Enforce ABI compatibility with the structs used by the LLVM C library.
 _Static_assert(__builtin_offsetof(div_t, quot) == 0, "ABI mismatch!");
--- a/lib/include/llvm_libc_wrappers/string.h
+++ b/lib/include/llvm_libc_wrappers/string.h
@ -13,9 +13,6 @@
 #error "This file is for GPU offloading compilation only"
 #endif

-// FIXME: The GNU headers provide C++ standard compliant headers when in C++
-// mode and the LLVM libc does not. We cannot enable memchr, strchr, strchrnul,
-// strpbrk, strrchr, strstr, or strcasestr until this is addressed.
 #include_next <string.h>

 #if __has_include(<llvm-libc-decls/string.h>)
@ -26,8 +23,70 @@

 #pragma omp begin declare target

+// The GNU headers provide C++ standard compliant headers when in C++ mode and
+// the LLVM libc does not. We need to manually provide the definitions using the
+// same prototypes.
+#if defined(__cplusplus) && defined(__GLIBC__) &&                              \
+    defined(__CORRECT_ISO_CPP_STRING_H_PROTO)
+
+#ifndef __LIBC_ATTRS
+#define __LIBC_ATTRS
+#endif
+
+extern "C" {
+void *memccpy(void *__restrict, const void *__restrict, int,
+              size_t) __LIBC_ATTRS;
+int memcmp(const void *, const void *, size_t) __LIBC_ATTRS;
+void *memcpy(void *__restrict, const void *__restrict, size_t) __LIBC_ATTRS;
+void *memmem(const void *, size_t, const void *, size_t) __LIBC_ATTRS;
+void *memmove(void *, const void *, size_t) __LIBC_ATTRS;
+void *mempcpy(void *__restrict, const void *__restrict, size_t) __LIBC_ATTRS;
+void *memset(void *, int, size_t) __LIBC_ATTRS;
+char *stpcpy(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+char *stpncpy(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+char *strcat(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+int strcmp(const char *, const char *) __LIBC_ATTRS;
+int strcoll(const char *, const char *) __LIBC_ATTRS;
+char *strcpy(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+size_t strcspn(const char *, const char *) __LIBC_ATTRS;
+char *strdup(const char *) __LIBC_ATTRS;
+size_t strlen(const char *) __LIBC_ATTRS;
+char *strncat(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+int strncmp(const char *, const char *, size_t) __LIBC_ATTRS;
+char *strncpy(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+char *strndup(const char *, size_t) __LIBC_ATTRS;
+size_t strnlen(const char *, size_t) __LIBC_ATTRS;
+size_t strspn(const char *, const char *) __LIBC_ATTRS;
+char *strtok(char *__restrict, const char *__restrict) __LIBC_ATTRS;
+char *strtok_r(char *__restrict, const char *__restrict,
+               char **__restrict) __LIBC_ATTRS;
+size_t strxfrm(char *__restrict, const char *__restrict, size_t) __LIBC_ATTRS;
+}
+
+extern "C++" {
+char *strstr(char *, const char *) noexcept __LIBC_ATTRS;
+const char *strstr(const char *, const char *) noexcept __LIBC_ATTRS;
+char *strpbrk(char *, const char *) noexcept __LIBC_ATTRS;
+const char *strpbrk(const char *, const char *) noexcept __LIBC_ATTRS;
+char *strrchr(char *, int) noexcept __LIBC_ATTRS;
+const char *strrchr(const char *, int) noexcept __LIBC_ATTRS;
+char *strchr(char *, int) noexcept __LIBC_ATTRS;
+const char *strchr(const char *, int) noexcept __LIBC_ATTRS;
+char *strchrnul(char *, int) noexcept __LIBC_ATTRS;
+const char *strchrnul(const char *, int) noexcept __LIBC_ATTRS;
+char *strcasestr(char *, const char *) noexcept __LIBC_ATTRS;
+const char *strcasestr(const char *, const char *) noexcept __LIBC_ATTRS;
+void *memrchr(void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+const void *memrchr(const void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+void *memchr(void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+const void *memchr(const void *__s, int __c, size_t __n) noexcept __LIBC_ATTRS;
+}
+
+#else
 #include <llvm-libc-decls/string.h>

+#endif
+
 #pragma omp end declare target

 #undef __LIBC_ATTRS
--- a/lib/include/llvm_libc_wrappers/time.h
+++ b/lib/include/llvm_libc_wrappers/time.h
@ -0,0 +1,34 @@
+//===-- Wrapper for C standard time.h declarations on the GPU -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__
+#define __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__
+
+#if !defined(_OPENMP) && !defined(__HIP__) && !defined(__CUDA__)
+#error "This file is for GPU offloading compilation only"
+#endif
+
+#include_next <time.h>
+
+#if __has_include(<llvm-libc-decls/time.h>)
+
+#if defined(__HIP__) || defined(__CUDA__)
+#define __LIBC_ATTRS __attribute__((device))
+#endif
+
+#pragma omp begin declare target
+
+_Static_assert(sizeof(clock_t) == sizeof(long), "ABI mismatch!");
+
+#include <llvm-libc-decls/time.h>
+
+#pragma omp end declare target
+
+#endif
+
+#endif // __CLANG_LLVM_LIBC_WRAPPERS_TIME_H__
--- a/lib/include/lsxintrin.h
+++ b/lib/include/lsxintrin.h
--- a/lib/include/mmintrin.h
+++ b/lib/include/mmintrin.h
@ -22,7 +22,9 @@ typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("mmx,no-evex512"), \
+                 __min_vector_width__(64)))

 /// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
@ -31,9 +33,9 @@ typedef char __v8qi __attribute__((__vector_size__(8)));
 ///
 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
 ///
-static __inline__ void  __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
-_mm_empty(void)
-{
+static __inline__ void __attribute__((__always_inline__, __nodebug__,
+                                      __target__("mmx,no-evex512")))
+_mm_empty(void) {
  __builtin_ia32_emms();
 }

--- a/lib/include/module.modulemap
+++ b/lib/include/module.modulemap
@ -153,10 +153,163 @@ module _Builtin_intrinsics [system] [extern_c] {
  }
 }

-module _Builtin_stddef_max_align_t [system] [extern_c] {
-  header "__stddef_max_align_t.h"
+// Start -fbuiltin-headers-in-system-modules affected modules
+
+// The following modules all ignore their headers when
+// -fbuiltin-headers-in-system-modules is passed, and many of
+// those headers join system modules when present.
+
+// e.g. if -fbuiltin-headers-in-system-modules is passed, then
+// float.h will not be in the _Builtin_float module (that module
+// will be empty). If there is a system module that declares
+// `header "float.h"`, then the builtin float.h will join
+// that module. The system float.h (if present) will be treated
+// as a textual header in the sytem module.
+module _Builtin_float [system] {
+  header "float.h"
+  export *
 }

+module _Builtin_inttypes [system] {
+  header "inttypes.h"
+  export *
+}
+
+module _Builtin_iso646 [system] {
+  header "iso646.h"
+  export *
+}
+
+module _Builtin_limits [system] {
+  header "limits.h"
+  export *
+}
+
+module _Builtin_stdalign [system] {
+  header "stdalign.h"
+  export *
+}
+
+module _Builtin_stdarg [system] {
+  textual header "stdarg.h"
+
+  explicit module __gnuc_va_list {
+    header "__stdarg___gnuc_va_list.h"
+    export *
+  }
+
+  explicit module __va_copy {
+    header "__stdarg___va_copy.h"
+    export *
+  }
+
+  explicit module va_arg {
+    header "__stdarg_va_arg.h"
+    export *
+  }
+
+  explicit module va_copy {
+    header "__stdarg_va_copy.h"
+    export *
+  }
+
+  explicit module va_list {
+    header "__stdarg_va_list.h"
+    export *
+  }
+}
+
+module _Builtin_stdatomic [system] {
+  header "stdatomic.h"
+  export *
+}
+
+module _Builtin_stdbool [system] {
+  header "stdbool.h"
+  export *
+}
+
+module _Builtin_stddef [system] {
+  textual header "stddef.h"
+
+  // __stddef_max_align_t.h is always in this module, even if
+  // -fbuiltin-headers-in-system-modules is passed.
+  explicit module max_align_t {
+    header "__stddef_max_align_t.h"
+    export *
+  }
+
+  explicit module null {
+    header "__stddef_null.h"
+    export *
+  }
+
+  explicit module nullptr_t {
+    header "__stddef_nullptr_t.h"
+    export *
+  }
+
+  explicit module offsetof {
+    header "__stddef_offsetof.h"
+    export *
+  }
+
+  explicit module ptrdiff_t {
+    header "__stddef_ptrdiff_t.h"
+    export *
+  }
+
+  explicit module rsize_t {
+    header "__stddef_rsize_t.h"
+    export *
+  }
+
+  explicit module size_t {
+    header "__stddef_size_t.h"
+    export *
+  }
+
+  explicit module unreachable {
+    header "__stddef_unreachable.h"
+    export *
+  }
+
+  explicit module wchar_t {
+    header "__stddef_wchar_t.h"
+    export *
+  }
+}
+
+// wint_t is provided by <wchar.h> and not <stddef.h>. It's here
+// for compatibility, but must be explicitly requested. Therefore
+// __stddef_wint_t.h is not part of _Builtin_stddef. It is always in
+// this module even if -fbuiltin-headers-in-system-modules is passed.
+module _Builtin_stddef_wint_t [system] {
+  header "__stddef_wint_t.h"
+  export *
+}
+
+module _Builtin_stdint [system] {
+  header "stdint.h"
+  export *
+}
+
+module _Builtin_stdnoreturn [system] {
+  header "stdnoreturn.h"
+  export *
+}
+
+module _Builtin_tgmath [system] {
+  header "tgmath.h"
+  export *
+}
+
+module _Builtin_unwind [system] {
+  header "unwind.h"
+  export *
+}
+// End -fbuiltin-headers-in-system-modules affected modules
+
 module opencl_c {
  requires opencl
  header "opencl-c.h"
--- a/lib/include/opencl-c-base.h
+++ b/lib/include/opencl-c-base.h
@ -45,6 +45,7 @@
 #define __opencl_c_ext_fp32_local_atomic_add 1
 #define __opencl_c_ext_fp32_global_atomic_min_max 1
 #define __opencl_c_ext_fp32_local_atomic_min_max 1
+#define __opencl_c_ext_image_raw10_raw12 1

 #endif // defined(__SPIR__) || defined(__SPIRV__)
 #endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
@ -477,6 +478,10 @@ typedef enum memory_order
 #if __OPENCL_C_VERSION__ >= CL_VERSION_3_0
 #define CLK_UNORM_INT_101010_2 0x10E0
 #endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
+#ifdef __opencl_c_ext_image_raw10_raw12
+#define CLK_UNSIGNED_INT_RAW10_EXT 0x10E3
+#define CLK_UNSIGNED_INT_RAW12_EXT 0x10E4
+#endif // __opencl_c_ext_image_raw10_raw12

 // Channel order, numbering must be aligned with cl_channel_order in cl.h
 //
--- a/lib/include/openmp_wrappers/cmath
+++ b/lib/include/openmp_wrappers/cmath
@ -1,4 +1,4 @@
-/*===-- __clang_openmp_device_functions.h - OpenMP math declares ------ c++ -===
+/*===-- __clang_openmp_device_functions.h - OpenMP math declares -*- c++ -*-===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
--- a/lib/include/pmmintrin.h
+++ b/lib/include/pmmintrin.h
@ -18,7 +18,8 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse3,no-evex512"), __min_vector_width__(128)))

 /// Loads data from an unaligned memory location to elements in a 128-bit
 ///    vector.
--- a/lib/include/ppc_wrappers/nmmintrin.h
+++ b/lib/include/ppc_wrappers/nmmintrin.h
@ -0,0 +1,26 @@
+/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#endif
+
+#ifndef NMMINTRIN_H_
+#define NMMINTRIN_H_
+
+/* We just include SSE4.1 header file.  */
+#include <smmintrin.h>
+
+#endif /* NMMINTRIN_H_ */
--- a/lib/include/ppc_wrappers/smmintrin.h
+++ b/lib/include/ppc_wrappers/smmintrin.h
@ -14,7 +14,7 @@

 #ifndef NO_WARN_X86_INTRINSICS
 /* This header is distributed to simplify porting x86_64 code that
-   makes explicit use of Intel intrinsics to powerp64/powerpc64le.
+   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
@ -68,10 +68,10 @@ extern __inline __m128d
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
 #endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
@ -83,10 +83,15 @@ extern __inline __m128d

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
-    __builtin_set_fpscr_rn(0b00);
+    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
@ -102,7 +107,7 @@ extern __inline __m128d
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
-    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
+    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
@ -128,9 +133,14 @@ extern __inline __m128d
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
 }
@ -159,10 +169,10 @@ extern __inline __m128
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
 #else
-    __fpscr_save.__fr = __builtin_mffs();
+    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
 #endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
@ -174,10 +184,15 @@ extern __inline __m128

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
-    __builtin_set_fpscr_rn(0b00);
+    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
@ -193,7 +208,7 @@ extern __inline __m128
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
-    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
+    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
@ -219,9 +234,14 @@ extern __inline __m128
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
-    __fpscr_save.__fr = __builtin_mffsl();
+#ifdef _ARCH_PWR9
+    __fpscr_save.__fr = __builtin_ppc_mffsl();
+#else
+    __fpscr_save.__fr = __builtin_ppc_mffs();
+    __fpscr_save.__fpscr &= 0x70007f0ffL;
+#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
-    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
+    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
 }
--- a/lib/include/riscv_bitmanip.h
+++ b/lib/include/riscv_bitmanip.h
@ -0,0 +1,195 @@
+/*===---- riscv_bitmanip.h - RISC-V Zb* intrinsics --------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_BITMANIP_H
+#define __RISCV_BITMANIP_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__riscv_zbb)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_orc_b_32(uint32_t __x) {
+  return __builtin_riscv_orc_b_32(__x);
+}
+
+static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
+__riscv_clz_32(uint32_t __x) {
+  return __builtin_riscv_clz_32(__x);
+}
+
+static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
+__riscv_ctz_32(uint32_t __x) {
+  return __builtin_riscv_ctz_32(__x);
+}
+
+static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
+__riscv_cpop_32(uint32_t __x) {
+  return __builtin_popcount(__x);
+}
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_orc_b_64(uint64_t __x) {
+  return __builtin_riscv_orc_b_64(__x);
+}
+
+static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
+__riscv_clz_64(uint64_t __x) {
+  return __builtin_riscv_clz_64(__x);
+}
+
+static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
+__riscv_ctz_64(uint64_t __x) {
+  return __builtin_riscv_ctz_64(__x);
+}
+
+static __inline__ unsigned __attribute__((__always_inline__, __nodebug__))
+__riscv_cpop_64(uint64_t __x) {
+  return __builtin_popcountll(__x);
+}
+#endif
+#endif // defined(__riscv_zbb)
+
+#if defined(__riscv_zbb) || defined(__riscv_zbkb)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_rev8_32(uint32_t __x) {
+  return __builtin_bswap32(__x);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_rol_32(uint32_t __x, uint32_t __y) {
+  return __builtin_rotateleft32(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_ror_32(uint32_t __x, uint32_t __y) {
+  return __builtin_rotateright32(__x, __y);
+}
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_rev8_64(uint64_t __x) {
+  return __builtin_bswap64(__x);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_rol_64(uint64_t __x, uint32_t __y) {
+  return __builtin_rotateleft64(__x, __y);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_ror_64(uint64_t __x, uint32_t __y) {
+  return __builtin_rotateright64(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zbb) || defined(__riscv_zbkb)
+
+#if defined(__riscv_zbkb)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_brev8_32(uint32_t __x) {
+  return __builtin_riscv_brev8_32(__x);
+}
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_brev8_64(uint64_t __x) {
+  return __builtin_riscv_brev8_64(__x);
+}
+#endif
+
+#if __riscv_xlen == 32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_unzip_32(uint32_t __x) {
+  return __builtin_riscv_unzip_32(__x);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_zip_32(uint32_t __x) {
+  return __builtin_riscv_zip_32(__x);
+}
+#endif
+#endif // defined(__riscv_zbkb)
+
+#if defined(__riscv_zbc)
+#if __riscv_xlen == 32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmulr_32(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_clmulr_32(__x, __y);
+}
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmulr_64(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_clmulr_64(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zbc)
+
+#if defined(__riscv_zbkc) || defined(__riscv_zbc)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmul_32(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_clmul_32(__x, __y);
+}
+
+#if __riscv_xlen == 32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmulh_32(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_clmulh_32(__x, __y);
+}
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmul_64(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_clmul_64(__x, __y);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_clmulh_64(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_clmulh_64(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zbkc) || defined(__riscv_zbc)
+
+#if defined(__riscv_zbkx)
+#if __riscv_xlen == 32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_xperm4_32(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_xperm4_32(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_xperm8_32(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_xperm8_32(__x, __y);
+}
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_xperm4_64(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_xperm4_64(__x, __y);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_xperm8_64(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_xperm8_64(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zbkx)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/lib/include/riscv_crypto.h
+++ b/lib/include/riscv_crypto.h
@ -0,0 +1,170 @@
+/*===---- riscv_crypto.h - RISC-V Zk* intrinsics ---------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_CRYPTO_H
+#define __RISCV_CRYPTO_H
+
+#include <stdint.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__riscv_zknd)
+#if __riscv_xlen == 32
+#define __riscv_aes32dsi(x, y, bs) __builtin_riscv_aes32dsi(x, y, bs)
+#define __riscv_aes32dsmi(x, y, bs) __builtin_riscv_aes32dsmi(x, y, bs)
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_aes64ds(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_aes64ds(__x, __y);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_aes64dsm(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_aes64dsm(__x, __y);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_aes64im(uint64_t __x) {
+  return __builtin_riscv_aes64im(__x);
+}
+#endif
+#endif // defined(__riscv_zknd)
+
+#if defined(__riscv_zkne)
+#if __riscv_xlen == 32
+#define __riscv_aes32esi(x, y, bs) __builtin_riscv_aes32esi(x, y, bs)
+#define __riscv_aes32esmi(x, y, bs) __builtin_riscv_aes32esmi(x, y, bs)
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_aes64es(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_aes64es(__x, __y);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_aes64esm(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_aes64esm(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zkne)
+
+#if defined(__riscv_zknd) || defined(__riscv_zkne)
+#if __riscv_xlen == 64
+#define __riscv_aes64ks1i(x, rnum) __builtin_riscv_aes64ks1i(x, rnum)
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_aes64ks2(uint64_t __x, uint64_t __y) {
+  return __builtin_riscv_aes64ks2(__x, __y);
+}
+#endif
+#endif // defined(__riscv_zknd) || defined(__riscv_zkne)
+
+#if defined(__riscv_zknh)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha256sig0(uint32_t __x) {
+  return __builtin_riscv_sha256sig0(__x);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha256sig1(uint32_t __x) {
+  return __builtin_riscv_sha256sig1(__x);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha256sum0(uint32_t __x) {
+  return __builtin_riscv_sha256sum0(__x);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha256sum1(uint32_t __x) {
+  return __builtin_riscv_sha256sum1(__x);
+}
+
+#if __riscv_xlen == 32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sig0h(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_sha512sig0h(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sig0l(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_sha512sig0l(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sig1h(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_sha512sig1h(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sig1l(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_sha512sig1l(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sum0r(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_sha512sum0r(__x, __y);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sum1r(uint32_t __x, uint32_t __y) {
+  return __builtin_riscv_sha512sum1r(__x, __y);
+}
+#endif
+
+#if __riscv_xlen == 64
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sig0(uint64_t __x) {
+  return __builtin_riscv_sha512sig0(__x);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sig1(uint64_t __x) {
+  return __builtin_riscv_sha512sig1(__x);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sum0(uint64_t __x) {
+  return __builtin_riscv_sha512sum0(__x);
+}
+
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sha512sum1(uint64_t __x) {
+  return __builtin_riscv_sha512sum1(__x);
+}
+#endif
+#endif // defined(__riscv_zknh)
+
+#if defined(__riscv_zksh)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sm3p0(uint32_t __x) {
+  return __builtin_riscv_sm3p0(__x);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__riscv_sm3p1(uint32_t __x) {
+  return __builtin_riscv_sm3p1(__x);
+}
+#endif // defined(__riscv_zksh)
+
+#if defined(__riscv_zksed)
+#define __riscv_sm4ed(x, y, bs) __builtin_riscv_sm4ed(x, y, bs);
+#define __riscv_sm4ks(x, y, bs) __builtin_riscv_sm4ks(x, y, bs);
+#endif // defined(__riscv_zksed)
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
--- a/lib/include/riscv_ntlh.h
+++ b/lib/include/riscv_ntlh.h
@ -21,8 +21,6 @@ enum {
  __RISCV_NTLH_ALL
 };

-#define __riscv_ntl_load(PTR, DOMAIN) __builtin_riscv_ntl_load((PTR), (DOMAIN))
-#define __riscv_ntl_store(PTR, VAL, DOMAIN)                                    \
-  __builtin_riscv_ntl_store((PTR), (VAL), (DOMAIN))
-
+#define __riscv_ntl_load __builtin_riscv_ntl_load
+#define __riscv_ntl_store __builtin_riscv_ntl_store
 #endif
--- a/lib/include/riscv_vector.h
+++ b/lib/include/riscv_vector.h
@ -392,6 +392,37 @@ typedef __rvv_float64m2x4_t vfloat64m2x4_t;
 typedef __rvv_float64m4_t vfloat64m4_t;
 typedef __rvv_float64m4x2_t vfloat64m4x2_t;
 typedef __rvv_float64m8_t vfloat64m8_t;
+typedef __rvv_bfloat16mf4_t vbfloat16mf4_t;
+typedef __rvv_bfloat16mf4x2_t vbfloat16mf4x2_t;
+typedef __rvv_bfloat16mf4x3_t vbfloat16mf4x3_t;
+typedef __rvv_bfloat16mf4x4_t vbfloat16mf4x4_t;
+typedef __rvv_bfloat16mf4x5_t vbfloat16mf4x5_t;
+typedef __rvv_bfloat16mf4x6_t vbfloat16mf4x6_t;
+typedef __rvv_bfloat16mf4x7_t vbfloat16mf4x7_t;
+typedef __rvv_bfloat16mf4x8_t vbfloat16mf4x8_t;
+typedef __rvv_bfloat16mf2_t vbfloat16mf2_t;
+typedef __rvv_bfloat16mf2x2_t vbfloat16mf2x2_t;
+typedef __rvv_bfloat16mf2x3_t vbfloat16mf2x3_t;
+typedef __rvv_bfloat16mf2x4_t vbfloat16mf2x4_t;
+typedef __rvv_bfloat16mf2x5_t vbfloat16mf2x5_t;
+typedef __rvv_bfloat16mf2x6_t vbfloat16mf2x6_t;
+typedef __rvv_bfloat16mf2x7_t vbfloat16mf2x7_t;
+typedef __rvv_bfloat16mf2x8_t vbfloat16mf2x8_t;
+typedef __rvv_bfloat16m1_t vbfloat16m1_t;
+typedef __rvv_bfloat16m1x2_t vbfloat16m1x2_t;
+typedef __rvv_bfloat16m1x3_t vbfloat16m1x3_t;
+typedef __rvv_bfloat16m1x4_t vbfloat16m1x4_t;
+typedef __rvv_bfloat16m1x5_t vbfloat16m1x5_t;
+typedef __rvv_bfloat16m1x6_t vbfloat16m1x6_t;
+typedef __rvv_bfloat16m1x7_t vbfloat16m1x7_t;
+typedef __rvv_bfloat16m1x8_t vbfloat16m1x8_t;
+typedef __rvv_bfloat16m2_t vbfloat16m2_t;
+typedef __rvv_bfloat16m2x2_t vbfloat16m2x2_t;
+typedef __rvv_bfloat16m2x3_t vbfloat16m2x3_t;
+typedef __rvv_bfloat16m2x4_t vbfloat16m2x4_t;
+typedef __rvv_bfloat16m4_t vbfloat16m4_t;
+typedef __rvv_bfloat16m4x2_t vbfloat16m4x2_t;
+typedef __rvv_bfloat16m8_t vbfloat16m8_t;
 #define __riscv_v_intrinsic_overloading 1

 #ifdef __cplusplus
--- a/lib/include/smmintrin.h
+++ b/lib/include/smmintrin.h
@ -18,8 +18,8 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
-                 __min_vector_width__(128)))
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))

 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT 0x00
@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
 /// \returns A 128-bit integer vector containing the data stored at the
 ///    specified memory location.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128(__m128i const *__V) {
+_mm_stream_load_si128(const void *__V) {
  return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
 }

--- a/lib/include/stdalign.h
+++ b/lib/include/stdalign.h
@ -10,10 +10,8 @@
 #ifndef __STDALIGN_H
 #define __STDALIGN_H

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
 #if defined(__cplusplus) ||                                                    \
-    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202000L)
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L)
 #ifndef __cplusplus
 #define alignas _Alignas
 #define alignof _Alignof
--- a/lib/include/stdarg.h
+++ b/lib/include/stdarg.h
@ -7,45 +7,73 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef __STDARG_H
+/*
+ * This header is designed to be included multiple times. If any of the __need_
+ * macros are defined, then only that subset of interfaces are provided. This
+ * can be useful for POSIX headers that need to not expose all of stdarg.h, but
+ * need to use some of its interfaces. Otherwise this header provides all of
+ * the expected interfaces.
+ *
+ * When clang modules are enabled, this header is a textual header. It ignores
+ * its header guard so that multiple submodules can export its interfaces.
+ * Take module SM with submodules A and B, whose headers both include stdarg.h
+ * When SM.A builds, __STDARG_H will be defined. When SM.B builds, the
+ * definition from SM.A will leak when building without local submodule
+ * visibility. stdarg.h wouldn't include any of its implementation headers, and
+ * SM.B wouldn't import any of the stdarg modules, and SM.B's `export *`
+ * wouldn't export any stdarg interfaces as expected. However, since stdarg.h
+ * ignores its header guard when building with modules, it all works as
+ * expected.
+ *
+ * When clang modules are not enabled, the header guards can function in the
+ * normal simple fashion.
+ */
+#if !defined(__STDARG_H) || __has_feature(modules) ||                          \
+    defined(__need___va_list) || defined(__need_va_list) ||                    \
+    defined(__need_va_arg) || defined(__need___va_copy) ||                     \
+    defined(__need_va_copy)

-#ifndef __GNUC_VA_LIST
-#define __GNUC_VA_LIST
-typedef __builtin_va_list __gnuc_va_list;
-#endif
-
-#ifdef __need___va_list
-#undef __need___va_list
-#else
+#if !defined(__need___va_list) && !defined(__need_va_list) &&                  \
+    !defined(__need_va_arg) && !defined(__need___va_copy) &&                   \
+    !defined(__need_va_copy)
 #define __STDARG_H
-#ifndef _VA_LIST
-typedef __builtin_va_list va_list;
-#define _VA_LIST
-#endif
-
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
-/* C2x does not require the second parameter for va_start. */
-#define va_start(ap, ...) __builtin_va_start(ap, 0)
-#else
-/* Versions before C2x do require the second parameter. */
-#define va_start(ap, param) __builtin_va_start(ap, param)
-#endif
-#define va_end(ap)          __builtin_va_end(ap)
-#define va_arg(ap, type)    __builtin_va_arg(ap, type)
-
+#define __need___va_list
+#define __need_va_list
+#define __need_va_arg
+#define __need___va_copy
 /* GCC always defines __va_copy, but does not define va_copy unless in c99 mode
 * or -ansi is not specified, since it was not part of C90.
 */
-#define __va_copy(d,s) __builtin_va_copy(d,s)
-
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) ||              \
    (defined(__cplusplus) && __cplusplus >= 201103L) ||                        \
    !defined(__STRICT_ANSI__)
-#define va_copy(dest, src)  __builtin_va_copy(dest, src)
+#define __need_va_copy
+#endif
 #endif

-#endif /* __STDARG_H */
+#ifdef __need___va_list
+#include <__stdarg___gnuc_va_list.h>
+#undef __need___va_list
+#endif /* defined(__need___va_list) */

-#endif /* not __STDARG_H */
+#ifdef __need_va_list
+#include <__stdarg_va_list.h>
+#undef __need_va_list
+#endif /* defined(__need_va_list) */
+
+#ifdef __need_va_arg
+#include <__stdarg_va_arg.h>
+#undef __need_va_arg
+#endif /* defined(__need_va_arg) */
+
+#ifdef __need___va_copy
+#include <__stdarg___va_copy.h>
+#undef __need___va_copy
+#endif /* defined(__need___va_copy) */
+
+#ifdef __need_va_copy
+#include <__stdarg_va_copy.h>
+#undef __need_va_copy
+#endif /* defined(__need_va_copy) */
+
+#endif
--- a/lib/include/stdatomic.h
+++ b/lib/include/stdatomic.h
@ -45,16 +45,14 @@ extern "C" {
 #define ATOMIC_POINTER_LOCK_FREE    __CLANG_ATOMIC_POINTER_LOCK_FREE

 /* 7.17.2 Initialization */
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202000L) ||               \
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202311L) ||               \
    defined(__cplusplus)
-/* ATOMIC_VAR_INIT was removed in C2x, but still remains in C++23. */
+/* ATOMIC_VAR_INIT was removed in C23, but still remains in C++23. */
 #define ATOMIC_VAR_INIT(value) (value)
 #endif

 #if ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201710L &&              \
-      __STDC_VERSION__ < 202000L) ||                                           \
+      __STDC_VERSION__ < 202311L) ||                                           \
     (defined(__cplusplus) && __cplusplus >= 202002L)) &&                      \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
 /* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
--- a/lib/include/stdckdint.h
+++ b/lib/include/stdckdint.h
@ -0,0 +1,42 @@
+/*===---- stdckdint.h - Standard header for checking integer----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __STDCKDINT_H
+#define __STDCKDINT_H
+
+/* If we're hosted, fall back to the system's stdckdint.h. FreeBSD, for
+ * example, already has a Clang-compatible stdckdint.h header.
+ *
+ * The `stdckdint.h` header requires C 23 or newer.
+ */
+#if __STDC_HOSTED__ && __has_include_next(<stdckdint.h>)
+#include_next <stdckdint.h>
+#else
+
+/* C23 7.20.1 Defines several macros for performing checked integer arithmetic*/
+
+#define __STDC_VERSION_STDCKDINT_H__ 202311L
+
+// Both A and B shall be any integer type other than "plain" char, bool, a bit-
+// precise integer type, or an enumerated type, and they need not be the same.
+
+// R shall be a modifiable lvalue of any integer type other than "plain" char,
+// bool, a bit-precise integer type, or an enumerated type. It shouldn't be
+// short type, either. Otherwise, it may be unable to hold two the result of
+// operating two 'int's.
+
+// A diagnostic message will be produced if A or B are not suitable integer
+// types, or if R is not a modifiable lvalue of a suitable integer type or R
+// is short type.
+#define ckd_add(R, A, B) __builtin_add_overflow((A), (B), (R))
+#define ckd_sub(R, A, B) __builtin_sub_overflow((A), (B), (R))
+#define ckd_mul(R, A, B) __builtin_mul_overflow((A), (B), (R))
+
+#endif /* __STDC_HOSTED__ */
+#endif /* __STDCKDINT_H */
--- a/lib/include/stddef.h
+++ b/lib/include/stddef.h
@ -7,126 +7,116 @@
 *===-----------------------------------------------------------------------===
 */

-#if !defined(__STDDEF_H) || defined(__need_ptrdiff_t) ||                       \
-    defined(__need_size_t) || defined(__need_wchar_t) ||                       \
-    defined(__need_NULL) || defined(__need_wint_t)
+/*
+ * This header is designed to be included multiple times. If any of the __need_
+ * macros are defined, then only that subset of interfaces are provided. This
+ * can be useful for POSIX headers that need to not expose all of stddef.h, but
+ * need to use some of its interfaces. Otherwise this header provides all of
+ * the expected interfaces.
+ *
+ * When clang modules are enabled, this header is a textual header. It ignores
+ * its header guard so that multiple submodules can export its interfaces.
+ * Take module SM with submodules A and B, whose headers both include stddef.h
+ * When SM.A builds, __STDDEF_H will be defined. When SM.B builds, the
+ * definition from SM.A will leak when building without local submodule
+ * visibility. stddef.h wouldn't include any of its implementation headers, and
+ * SM.B wouldn't import any of the stddef modules, and SM.B's `export *`
+ * wouldn't export any stddef interfaces as expected. However, since stddef.h
+ * ignores its header guard when building with modules, it all works as
+ * expected.
+ *
+ * When clang modules are not enabled, the header guards can function in the
+ * normal simple fashion.
+ */
+#if !defined(__STDDEF_H) || __has_feature(modules) ||                          \
+    (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1) ||        \
+    defined(__need_ptrdiff_t) || defined(__need_size_t) ||                     \
+    defined(__need_rsize_t) || defined(__need_wchar_t) ||                      \
+    defined(__need_NULL) || defined(__need_nullptr_t) ||                       \
+    defined(__need_unreachable) || defined(__need_max_align_t) ||              \
+    defined(__need_offsetof) || defined(__need_wint_t)

 #if !defined(__need_ptrdiff_t) && !defined(__need_size_t) &&                   \
-    !defined(__need_wchar_t) && !defined(__need_NULL) &&                       \
-    !defined(__need_wint_t)
-/* Always define miscellaneous pieces when modules are available. */
-#if !__has_feature(modules)
+    !defined(__need_rsize_t) && !defined(__need_wchar_t) &&                    \
+    !defined(__need_NULL) && !defined(__need_nullptr_t) &&                     \
+    !defined(__need_unreachable) && !defined(__need_max_align_t) &&            \
+    !defined(__need_offsetof) && !defined(__need_wint_t)
 #define __STDDEF_H
-#endif
 #define __need_ptrdiff_t
 #define __need_size_t
+/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
+ * enabled. */
+#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
+#define __need_rsize_t
+#endif
 #define __need_wchar_t
 #define __need_NULL
-#define __need_STDDEF_H_misc
-/* __need_wint_t is intentionally not defined here. */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) ||              \
+    defined(__cplusplus)
+#define __need_nullptr_t
+#endif
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
+#define __need_unreachable
+#endif
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||              \
+    (defined(__cplusplus) && __cplusplus >= 201103L)
+#define __need_max_align_t
+#endif
+#define __need_offsetof
+/* wint_t is provided by <wchar.h> and not <stddef.h>. It's here
+ * for compatibility, but must be explicitly requested. Therefore
+ * __need_wint_t is intentionally not defined here. */
 #endif

 #if defined(__need_ptrdiff_t)
-#if !defined(_PTRDIFF_T) || __has_feature(modules)
-/* Always define ptrdiff_t when modules are available. */
-#if !__has_feature(modules)
-#define _PTRDIFF_T
-#endif
-typedef __PTRDIFF_TYPE__ ptrdiff_t;
-#endif
+#include <__stddef_ptrdiff_t.h>
 #undef __need_ptrdiff_t
 #endif /* defined(__need_ptrdiff_t) */

 #if defined(__need_size_t)
-#if !defined(_SIZE_T) || __has_feature(modules)
-/* Always define size_t when modules are available. */
-#if !__has_feature(modules)
-#define _SIZE_T
-#endif
-typedef __SIZE_TYPE__ size_t;
-#endif
+#include <__stddef_size_t.h>
 #undef __need_size_t
 #endif /*defined(__need_size_t) */

-#if defined(__need_STDDEF_H_misc)
-/* ISO9899:2011 7.20 (C11 Annex K): Define rsize_t if __STDC_WANT_LIB_EXT1__ is
- * enabled. */
-#if (defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1 && \
-     !defined(_RSIZE_T)) || __has_feature(modules)
-/* Always define rsize_t when modules are available. */
-#if !__has_feature(modules)
-#define _RSIZE_T
-#endif
-typedef __SIZE_TYPE__ rsize_t;
-#endif
-#endif /* defined(__need_STDDEF_H_misc) */
+#if defined(__need_rsize_t)
+#include <__stddef_rsize_t.h>
+#undef __need_rsize_t
+#endif /* defined(__need_rsize_t) */

 #if defined(__need_wchar_t)
-#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
-/* Always define wchar_t when modules are available. */
-#if !defined(_WCHAR_T) || __has_feature(modules)
-#if !__has_feature(modules)
-#define _WCHAR_T
-#if defined(_MSC_EXTENSIONS)
-#define _WCHAR_T_DEFINED
-#endif
-#endif
-typedef __WCHAR_TYPE__ wchar_t;
-#endif
-#endif
+#include <__stddef_wchar_t.h>
 #undef __need_wchar_t
 #endif /* defined(__need_wchar_t) */

 #if defined(__need_NULL)
-#undef NULL
-#ifdef __cplusplus
-#  if !defined(__MINGW32__) && !defined(_MSC_VER)
-#    define NULL __null
-#  else
-#    define NULL 0
-#  endif
-#else
-#  define NULL ((void*)0)
-#endif
-#ifdef __cplusplus
-#if defined(_MSC_EXTENSIONS) && defined(_NATIVE_NULLPTR_SUPPORTED)
-namespace std { typedef decltype(nullptr) nullptr_t; }
-using ::std::nullptr_t;
-#endif
-#endif
+#include <__stddef_null.h>
 #undef __need_NULL
 #endif /* defined(__need_NULL) */

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
-typedef typeof(nullptr) nullptr_t;
-#endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */
+#if defined(__need_nullptr_t)
+#include <__stddef_nullptr_t.h>
+#undef __need_nullptr_t
+#endif /* defined(__need_nullptr_t) */

-#if defined(__need_STDDEF_H_misc) && defined(__STDC_VERSION__) &&              \
-    __STDC_VERSION__ >= 202000L
-#define unreachable() __builtin_unreachable()
-#endif /* defined(__need_STDDEF_H_misc) && >= C23 */
+#if defined(__need_unreachable)
+#include <__stddef_unreachable.h>
+#undef __need_unreachable
+#endif /* defined(__need_unreachable) */

-#if defined(__need_STDDEF_H_misc)
-#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||              \
-    (defined(__cplusplus) && __cplusplus >= 201103L)
-#include "__stddef_max_align_t.h"
-#endif
-#define offsetof(t, d) __builtin_offsetof(t, d)
-#undef __need_STDDEF_H_misc
-#endif  /* defined(__need_STDDEF_H_misc) */
+#if defined(__need_max_align_t)
+#include <__stddef_max_align_t.h>
+#undef __need_max_align_t
+#endif /* defined(__need_max_align_t) */
+
+#if defined(__need_offsetof)
+#include <__stddef_offsetof.h>
+#undef __need_offsetof
+#endif /* defined(__need_offsetof) */

 /* Some C libraries expect to see a wint_t here. Others (notably MinGW) will use
 __WINT_TYPE__ directly; accommodate both by requiring __need_wint_t */
 #if defined(__need_wint_t)
-/* Always define wint_t when modules are available. */
-#if !defined(_WINT_T) || __has_feature(modules)
-#if !__has_feature(modules)
-#define _WINT_T
-#endif
-typedef __WINT_TYPE__ wint_t;
-#endif
+#include <__stddef_wint_t.h>
 #undef __need_wint_t
 #endif /* __need_wint_t */

--- a/lib/include/stdint.h
+++ b/lib/include/stdint.h
@ -499,9 +499,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # define INT64_MAX           INT64_C( 9223372036854775807)
 # define INT64_MIN         (-INT64_C( 9223372036854775807)-1)
 # define UINT64_MAX         UINT64_C(18446744073709551615)
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT64_WIDTH         64
 # define INT64_WIDTH          UINT64_WIDTH

@ -545,9 +544,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # define INT_FAST64_MAX    __INT_LEAST64_MAX
 # define UINT_FAST64_MAX  __UINT_LEAST64_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) &&  __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) &&  __STDC_VERSION__ >= 202311L
 # define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
 # define INT_LEAST64_WIDTH  UINT_LEAST64_WIDTH
 # define UINT_FAST64_WIDTH  __UINT_LEAST64_WIDTH
@ -586,9 +583,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT56_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT56_WIDTH         56
 # define INT56_WIDTH          UINT56_WIDTH
 # define UINT_LEAST56_WIDTH   UINT56_WIDTH
@ -635,9 +630,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT48_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 #define UINT48_WIDTH         48
 #define INT48_WIDTH          UINT48_WIDTH
 #define UINT_LEAST48_WIDTH   UINT48_WIDTH
@ -684,9 +677,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT40_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT40_WIDTH         40
 # define INT40_WIDTH          UINT40_WIDTH
 # define UINT_LEAST40_WIDTH   UINT40_WIDTH
@ -727,9 +718,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT32_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT32_WIDTH         32
 # define INT32_WIDTH          UINT32_WIDTH
 # undef __UINT_LEAST32_WIDTH
@ -749,9 +738,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # define INT_FAST32_MAX    __INT_LEAST32_MAX
 # define UINT_FAST32_MAX  __UINT_LEAST32_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
 # define INT_LEAST32_WIDTH  UINT_LEAST32_WIDTH
 # define UINT_FAST32_WIDTH  __UINT_LEAST32_WIDTH
@ -784,9 +771,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT24_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT24_WIDTH         24
 # define INT24_WIDTH          UINT24_WIDTH
 # define UINT_LEAST24_WIDTH   UINT24_WIDTH
@ -819,9 +804,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT16_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT16_WIDTH         16
 # define INT16_WIDTH          UINT16_WIDTH
 # undef __UINT_LEAST16_WIDTH
@ -839,9 +822,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # define INT_FAST16_MAX    __INT_LEAST16_MAX
 # define UINT_FAST16_MAX  __UINT_LEAST16_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
 # define INT_LEAST16_WIDTH  UINT_LEAST16_WIDTH
 # define UINT_FAST16_WIDTH  __UINT_LEAST16_WIDTH
@ -862,9 +843,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # undef __UINT_LEAST8_MAX
 # define __UINT_LEAST8_MAX  UINT8_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT8_WIDTH         8
 # define INT8_WIDTH          UINT8_WIDTH
 # undef __UINT_LEAST8_WIDTH
@ -880,9 +859,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 # define INT_FAST8_MAX    __INT_LEAST8_MAX
 # define UINT_FAST8_MAX  __UINT_LEAST8_MAX

-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 # define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
 # define INT_LEAST8_WIDTH  UINT_LEAST8_WIDTH
 # define UINT_FAST8_WIDTH  __UINT_LEAST8_WIDTH
@ -907,10 +884,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #define PTRDIFF_MAX   __PTRDIFF_MAX__
 #define    SIZE_MAX      __SIZE_MAX__

-/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+/* C23 7.22.2.4 Width of integer types capable of holding object pointers. */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 /* NB: The C standard requires that these be the same value, but the compiler
   exposes separate internal width macros. */
 #define INTPTR_WIDTH  __INTPTR_WIDTH__
@ -928,10 +903,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #define  INTMAX_MAX   __INTMAX_MAX__
 #define UINTMAX_MAX  __UINTMAX_MAX__

-/* C2x 7.20.2.5 Width of greatest-width integer types. */
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+/* C23 7.22.2.5 Width of greatest-width integer types. */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 /* NB: The C standard requires that these be the same value, but the compiler
   exposes separate internal width macros. */
 #define INTMAX_WIDTH __INTMAX_WIDTH__
@ -964,10 +937,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #define  INTMAX_C(v) __int_c(v,  __INTMAX_C_SUFFIX__)
 #define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)

-/* C2x 7.20.3.x Width of other integer types. */
-/* FIXME: This is using the placeholder dates Clang produces for these macros
-   in C2x mode; switch to the correct values once they've been published. */
-#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+/* C23 7.22.3.x Width of other integer types. */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
 #define PTRDIFF_WIDTH    __PTRDIFF_WIDTH__
 #define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
 #define SIZE_WIDTH       __SIZE_WIDTH__
--- a/lib/include/stdnoreturn.h
+++ b/lib/include/stdnoreturn.h
@ -15,8 +15,8 @@

 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) &&               \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
-/* The noreturn macro is deprecated in C2x. We do not mark it as such because
-   including the header file in C2x is also deprecated and we do not want to
+/* The noreturn macro is deprecated in C23. We do not mark it as such because
+   including the header file in C23 is also deprecated and we do not want to
   issue a confusing diagnostic for code which includes <stdnoreturn.h>
   followed by code that writes [[noreturn]]. The issue with such code is not
   with the attribute, or the use of 'noreturn', but the inclusion of the
--- a/lib/include/tmmintrin.h
+++ b/lib/include/tmmintrin.h
@ -17,8 +17,13 @@
 #include <pmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("ssse3,no-evex512"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_MMX                                                 \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("mmx,ssse3,no-evex512"),                           \
+                 __min_vector_width__(64)))

 /// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
--- a/lib/include/usermsrintrin.h
+++ b/lib/include/usermsrintrin.h
@ -0,0 +1,51 @@
+/*===--------------- usermsrintrin.h - USERMSR intrinsics -----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86GPRINTRIN_H
+#error "Never use <usermsrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif // __X86GPRINTRIN_H
+
+#ifndef __USERMSRINTRIN_H
+#define __USERMSRINTRIN_H
+#ifdef __x86_64__
+
+/// Reads the contents of a 64-bit MSR specified in \a __A into \a dst.
+///
+/// This intrinsic corresponds to the <c> URDMSR </c> instruction.
+/// \param __A
+///    An unsigned long long.
+///
+/// \code{.operation}
+///    DEST := MSR[__A]
+/// \endcode
+static __inline__ unsigned long long
+    __attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
+    _urdmsr(unsigned long long __A) {
+  return __builtin_ia32_urdmsr(__A);
+}
+
+/// Writes the contents of \a __B into the 64-bit MSR specified in \a __A.
+///
+/// This intrinsic corresponds to the <c> UWRMSR </c> instruction.
+///
+/// \param __A
+///    An unsigned long long.
+/// \param __B
+///    An unsigned long long.
+///
+/// \code{.operation}
+///    MSR[__A] := __B
+/// \endcode
+static __inline__ void
+    __attribute__((__always_inline__, __nodebug__, __target__("usermsr")))
+    _uwrmsr(unsigned long long __A, unsigned long long __B) {
+  return __builtin_ia32_uwrmsr(__A, __B);
+}
+
+#endif // __x86_64__
+#endif // __USERMSRINTRIN_H
--- a/lib/include/vaesintrin.h
+++ b/lib/include/vaesintrin.h
@ -18,8 +18,10 @@
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))

 /* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512)))
-
+#define __DEFAULT_FN_ATTRS_F                                                   \
+  __attribute__((__always_inline__, __nodebug__,                               \
+                 __target__("avx512f,evex512,vaes"),                           \
+                 __min_vector_width__(512)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_aesenc_epi128(__m256i __A, __m256i __B)
--- a/Show More
+++ b/Show More