mirror of https://github.com/ziglang/zig.git (synced 2025-12-06 06:13:07 +00:00)
lib/headers: update to clang 10.x C headers
upstream revision: 3cce3790072249cbe51b96cea26bc78019c11fd0
parent 97b2ac598b
commit 74872263cc
lib/include/__clang_cuda_intrinsics.h (10 changes, vendored)
@@ -211,7 +211,15 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
   return __nvvm_vote_ballot_sync(mask, pred);
 }
 
-inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+inline __device__ unsigned int __activemask() {
+#if CUDA_VERSION < 9020
+  return __nvvm_vote_ballot(1);
+#else
+  unsigned int mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+#endif
+}
 
 inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
   return __nvvm_fns(mask, base, offset);
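As an illustrative sketch only (not part of this commit), device code could pair the __activemask() intrinsic above with the standard __popc builtin to count the warp lanes that are currently converged; the helper name is hypothetical:

__device__ unsigned int active_lane_count(void) {
  /* number of set bits in the mask of currently active threads */
  return __popc(__activemask());
}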
lib/include/altivec.h (143 changes, vendored)
@@ -2761,8 +2761,8 @@ static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a,
   return (vector double)__builtin_vsx_lxvl(__a, (__b << 56));
 }
 
-static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a,
-                                                          size_t __b) {
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_xl_len_r(unsigned char *__a, size_t __b) {
   vector unsigned char __res =
       (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
 #ifdef __LITTLE_ENDIAN__
@@ -2876,9 +2876,10 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 #ifdef __VSX__
 #define vec_ctf(__a, __b) \
   _Generic((__a), vector int \
-           : (vector float)__builtin_altivec_vcfsx((__a), (__b)), \
+           : (vector float)__builtin_altivec_vcfsx((vector int)(__a), (__b)), \
            vector unsigned int \
-           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)), \
+           : (vector float)__builtin_altivec_vcfux((vector unsigned int)(__a), \
+                                                    (__b)), \
            vector unsigned long long \
            : (__builtin_convertvector((vector unsigned long long)(__a), \
                                       vector double) * \
@@ -2892,9 +2893,10 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 #else
 #define vec_ctf(__a, __b) \
   _Generic((__a), vector int \
-           : (vector float)__builtin_altivec_vcfsx((__a), (__b)), \
+           : (vector float)__builtin_altivec_vcfsx((vector int)(__a), (__b)), \
            vector unsigned int \
-           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
+           : (vector float)__builtin_altivec_vcfux((vector unsigned int)(__a), \
+                                                    (__b)))
 #endif
 
 /* vec_vcfsx */
@@ -2910,10 +2912,11 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 #ifdef __VSX__
 #define vec_cts(__a, __b) \
   _Generic((__a), vector float \
-           : __builtin_altivec_vctsxs((__a), (__b)), vector double \
+           : __builtin_altivec_vctsxs((vector float)(__a), (__b)), \
+             vector double \
            : __extension__({ \
              vector double __ret = \
-                 (__a) * \
+                 (vector double)(__a) * \
                  (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
                                                             << 52); \
              __builtin_convertvector(__ret, vector signed long long); \
@@ -2931,10 +2934,11 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 #ifdef __VSX__
 #define vec_ctu(__a, __b) \
   _Generic((__a), vector float \
-           : __builtin_altivec_vctuxs((__a), (__b)), vector double \
+           : __builtin_altivec_vctuxs((vector float)(__a), (__b)), \
+             vector double \
            : __extension__({ \
              vector double __ret = \
-                 (__a) * \
+                 (vector double)(__a) * \
                  (vector double)(vector unsigned long long)((0x3ffULL + __b) \
                                                             << 52); \
              __builtin_convertvector(__ret, vector unsigned long long); \
@@ -3286,9 +3290,7 @@ static __inline__ vector double __ATTRS_o_ai vec_div(vector double __a,
 
 /* vec_dss */
 
-static __inline__ void __attribute__((__always_inline__)) vec_dss(int __a) {
-  __builtin_altivec_dss(__a);
-}
+#define vec_dss __builtin_altivec_dss
 
 /* vec_dssall */
 
@@ -6301,19 +6303,20 @@ static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a,
 #ifdef __VSX__
 static __inline__ vector double __ATTRS_o_ai vec_or(vector bool long long __a,
                                                     vector double __b) {
-  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)((vector unsigned long long)__a |
+                         (vector unsigned long long)__b);
 }
 
 static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
                                                     vector bool long long __b) {
-  return (vector unsigned long long)__a | (vector unsigned long long)__b;
+  return (vector double)((vector unsigned long long)__a |
+                         (vector unsigned long long)__b);
 }
 
 static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a,
                                                     vector double __b) {
-  vector unsigned long long __res =
-      (vector unsigned long long)__a | (vector unsigned long long)__b;
-  return (vector double)__res;
+  return (vector double)((vector unsigned long long)__a |
+                         (vector unsigned long long)__b);
 }
 
 static __inline__ vector signed long long __ATTRS_o_ai
@@ -14781,7 +14784,7 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a,
 static __inline__ int __ATTRS_o_ai vec_all_ne(vector float __a,
                                               vector float __b) {
 #ifdef __VSX__
-  return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b);
+  return __builtin_vsx_xvcmpeqsp_p(__CR6_EQ, __a, __b);
 #else
   return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b);
 #endif
@@ -16361,27 +16364,32 @@ vec_xl(signed long long __offset, unsigned char *__ptr) {
 
 static inline __ATTRS_o_ai vector signed short vec_xl(signed long long __offset,
                                                       signed short *__ptr) {
-  return *(unaligned_vec_sshort *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_sshort *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned short
 vec_xl(signed long long __offset, unsigned short *__ptr) {
-  return *(unaligned_vec_ushort *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_ushort *)__addr;
 }
 
 static inline __ATTRS_o_ai vector signed int vec_xl(signed long long __offset,
                                                     signed int *__ptr) {
-  return *(unaligned_vec_sint *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_sint *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned int vec_xl(signed long long __offset,
                                                       unsigned int *__ptr) {
-  return *(unaligned_vec_uint *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_uint *)__addr;
 }
 
 static inline __ATTRS_o_ai vector float vec_xl(signed long long __offset,
                                                float *__ptr) {
-  return *(unaligned_vec_float *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_float *)__addr;
 }
 
 #ifdef __VSX__
@@ -16391,17 +16399,20 @@ typedef vector double unaligned_vec_double __attribute__((aligned(1)));
 
 static inline __ATTRS_o_ai vector signed long long
 vec_xl(signed long long __offset, signed long long *__ptr) {
-  return *(unaligned_vec_sll *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_sll *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned long long
 vec_xl(signed long long __offset, unsigned long long *__ptr) {
-  return *(unaligned_vec_ull *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_ull *)__addr;
 }
 
 static inline __ATTRS_o_ai vector double vec_xl(signed long long __offset,
                                                 double *__ptr) {
-  return *(unaligned_vec_double *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_double *)__addr;
 }
 #endif
 
@@ -16411,12 +16422,14 @@ typedef vector unsigned __int128 unaligned_vec_ui128
     __attribute__((aligned(1)));
 static inline __ATTRS_o_ai vector signed __int128
 vec_xl(signed long long __offset, signed __int128 *__ptr) {
-  return *(unaligned_vec_si128 *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_si128 *)__addr;
 }
 
 static inline __ATTRS_o_ai vector unsigned __int128
 vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
-  return *(unaligned_vec_ui128 *)(__ptr + __offset);
+  signed char *__addr = (signed char *)__ptr + __offset;
+  return *(unaligned_vec_ui128 *)__addr;
 }
 #endif
 
@@ -16425,27 +16438,27 @@ vec_xl(signed long long __offset, unsigned __int128 *__ptr) {
 #ifdef __LITTLE_ENDIAN__
 static __inline__ vector signed char __ATTRS_o_ai
 vec_xl_be(signed long long __offset, signed char *__ptr) {
-  vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  vector signed char __vec = (vector signed char)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
                                  13, 12, 11, 10, 9, 8);
 }
 
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_xl_be(signed long long __offset, unsigned char *__ptr) {
-  vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  vector unsigned char __vec = (vector unsigned char)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
                                  13, 12, 11, 10, 9, 8);
 }
 
 static __inline__ vector signed short __ATTRS_o_ai
 vec_xl_be(signed long long __offset, signed short *__ptr) {
-  vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  vector signed short __vec = (vector signed short)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
 }
 
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_xl_be(signed long long __offset, unsigned short *__ptr) {
-  vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr);
+  vector unsigned short __vec = (vector unsigned short)__builtin_vsx_lxvd2x_be(__offset, __ptr);
   return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
 }
 
@@ -16513,50 +16526,58 @@ static inline __ATTRS_o_ai void vec_xst(vector unsigned char __vec,
 static inline __ATTRS_o_ai void vec_xst(vector signed short __vec,
                                         signed long long __offset,
                                         signed short *__ptr) {
-  *(unaligned_vec_sshort *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_sshort *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector unsigned short __vec,
                                         signed long long __offset,
                                         unsigned short *__ptr) {
-  *(unaligned_vec_ushort *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_ushort *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector signed int __vec,
                                         signed long long __offset,
                                         signed int *__ptr) {
-  *(unaligned_vec_sint *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_sint *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector unsigned int __vec,
                                         signed long long __offset,
                                         unsigned int *__ptr) {
-  *(unaligned_vec_uint *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_uint *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector float __vec,
                                         signed long long __offset,
                                         float *__ptr) {
-  *(unaligned_vec_float *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_float *)__addr = __vec;
 }
 
 #ifdef __VSX__
 static inline __ATTRS_o_ai void vec_xst(vector signed long long __vec,
                                         signed long long __offset,
                                         signed long long *__ptr) {
-  *(unaligned_vec_sll *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_sll *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector unsigned long long __vec,
                                         signed long long __offset,
                                         unsigned long long *__ptr) {
-  *(unaligned_vec_ull *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_ull *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector double __vec,
                                         signed long long __offset,
                                         double *__ptr) {
-  *(unaligned_vec_double *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_double *)__addr = __vec;
 }
 #endif
 
@@ -16564,13 +16585,15 @@ static inline __ATTRS_o_ai void vec_xst(vector double __vec,
 static inline __ATTRS_o_ai void vec_xst(vector signed __int128 __vec,
                                         signed long long __offset,
                                         signed __int128 *__ptr) {
-  *(unaligned_vec_si128 *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_si128 *)__addr = __vec;
 }
 
 static inline __ATTRS_o_ai void vec_xst(vector unsigned __int128 __vec,
                                         signed long long __offset,
                                         unsigned __int128 *__ptr) {
-  *(unaligned_vec_ui128 *)(__ptr + __offset) = __vec;
+  signed char *__addr = (signed char *)__ptr + __offset;
+  *(unaligned_vec_ui128 *)__addr = __vec;
 }
 #endif
 
@@ -16583,7 +16606,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec,
   vector signed char __tmp =
       __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
                               13, 12, 11, 10, 9, 8);
-  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+  typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double;
+  __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
@@ -16592,7 +16616,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec,
   vector unsigned char __tmp =
       __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14,
                               13, 12, 11, 10, 9, 8);
-  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+  typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double;
+  __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
@@ -16600,7 +16625,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec,
                                                signed short *__ptr) {
   vector signed short __tmp =
       __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
-  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+  typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double;
+  __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
@@ -16608,7 +16634,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec,
                                                unsigned short *__ptr) {
   vector unsigned short __tmp =
       __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4);
-  __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr);
+  typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double;
+  __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
@@ -16620,32 +16647,32 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec,
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec,
                                                signed long long __offset,
                                                unsigned int *__ptr) {
-  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+  __builtin_vsx_stxvw4x_be((vector int)__vec, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec,
                                                signed long long __offset,
                                                float *__ptr) {
-  __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr);
+  __builtin_vsx_stxvw4x_be((vector int)__vec, __offset, __ptr);
 }
 
 #ifdef __VSX__
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec,
                                                signed long long __offset,
                                                signed long long *__ptr) {
-  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+  __builtin_vsx_stxvd2x_be((vector double)__vec, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec,
                                                signed long long __offset,
                                                unsigned long long *__ptr) {
-  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+  __builtin_vsx_stxvd2x_be((vector double)__vec, __offset, __ptr);
 }
 
 static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec,
                                                signed long long __offset,
                                                double *__ptr) {
-  __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr);
+  __builtin_vsx_stxvd2x_be((vector double)__vec, __offset, __ptr);
 }
 #endif
 
@@ -16668,12 +16695,12 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec,
 
 #ifdef __POWER9_VECTOR__
 #define vec_test_data_class(__a, __b) \
-  _Generic((__a), \
-           vector float: \
-             (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)), \
-           vector double: \
-             (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b)) \
-          )
+  _Generic( \
+      (__a), vector float \
+      : (vector bool int)__builtin_vsx_xvtstdcsp((vector float)(__a), (__b)), \
+        vector double \
+      : (vector bool long long)__builtin_vsx_xvtstdcdp((vector double)(__a), \
+                                                       (__b)))
 
 #endif /* #ifdef __POWER9_VECTOR__ */
 
lib/include/arm_acle.h (50 changes, vendored)
@@ -90,9 +90,11 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
 #endif
 
 /* 8.7 NOP */
+#if !defined(_MSC_VER) || !defined(__aarch64__)
 static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
   __builtin_arm_nop();
 }
+#endif
 
 /* 9 DATA-PROCESSING INTRINSICS */
 /* 9.2 Miscellaneous data-processing intrinsics */
@@ -139,6 +141,26 @@ __clzll(uint64_t __t) {
   return __builtin_clzll(__t);
 }
 
+/* CLS */
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__cls(uint32_t __t) {
+  return __builtin_arm_cls(__t);
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clsl(unsigned long __t) {
+#if __SIZEOF_LONG__ == 4
+  return __builtin_arm_cls(__t);
+#else
+  return __builtin_arm_cls64(__t);
+#endif
+}
+
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__clsll(uint64_t __t) {
+  return __builtin_arm_cls64(__t);
+}
+
 /* REV */
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
 __rev(uint32_t __t) {
@@ -609,11 +631,15 @@ __jcvt(double __a) {
 #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
 #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
 #define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
+#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
+#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
 #define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
 #define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
 #define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
+#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
+#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
 
-// Memory Tagging Extensions (MTE) Intrinsics
+/* Memory Tagging Extensions (MTE) Intrinsics */
 #if __ARM_FEATURE_MEMORY_TAGGING
 #define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
 #define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
@@ -623,6 +649,28 @@ __jcvt(double __a) {
 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
 #endif
 
+/* Transactional Memory Extension (TME) Intrinsics */
+#if __ARM_FEATURE_TME
+
+#define _TMFAILURE_REASON 0x00007fffu
+#define _TMFAILURE_RTRY 0x00008000u
+#define _TMFAILURE_CNCL 0x00010000u
+#define _TMFAILURE_MEM 0x00020000u
+#define _TMFAILURE_IMP 0x00040000u
+#define _TMFAILURE_ERR 0x00080000u
+#define _TMFAILURE_SIZE 0x00100000u
+#define _TMFAILURE_NEST 0x00200000u
+#define _TMFAILURE_DBG 0x00400000u
+#define _TMFAILURE_INT 0x00800000u
+#define _TMFAILURE_TRIVIAL 0x01000000u
+
+#define __tstart() __builtin_arm_tstart()
+#define __tcommit() __builtin_arm_tcommit()
+#define __tcancel(__arg) __builtin_arm_tcancel(__arg)
+#define __ttest() __builtin_arm_ttest()
+
+#endif /* __ARM_FEATURE_TME */
+
 #if defined(__cplusplus)
 }
 #endif
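As an illustrative sketch only (not part of this commit), the TME macros added above could drive a simple transaction retry loop; the function name is hypothetical and this assumes a target built with the TME feature enabled:

static int run_transaction(void) {
  for (;;) {
    uint64_t status = __tstart();
    if (status == 0) {
      /* transaction started: do the speculative work here */
      __tcommit();
      return 0;
    }
    if (!(status & _TMFAILURE_RTRY))
      return -1; /* permanent failure: fall back to a non-transactional path */
    /* otherwise retry */
  }
}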
lib/include/arm_cmse.h (217 changes, vendored, new file)
@@ -0,0 +1,217 @@
+//===---- arm_cmse.h - Arm CMSE support -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __ARM_CMSE_H
+#define __ARM_CMSE_H
+
+#if (__ARM_FEATURE_CMSE & 0x1)
+#include <stddef.h>
+#include <stdint.h>
+
+#define __ARM_CMSE_SECURE_MODE (__ARM_FEATURE_CMSE & 0x2)
+#define CMSE_MPU_READWRITE 1 /* checks if readwrite_ok field is set */
+#define CMSE_AU_NONSECURE 2 /* checks if permissions have secure field unset */
+#define CMSE_MPU_UNPRIV 4 /* sets T flag on TT insrtuction */
+#define CMSE_MPU_READ 8 /* checks if read_ok field is set */
+#define CMSE_MPU_NONSECURE 16 /* sets A flag, checks if secure field unset */
+#define CMSE_NONSECURE (CMSE_AU_NONSECURE | CMSE_MPU_NONSECURE)
+
+#define cmse_check_pointed_object(p, f) \
+  cmse_check_address_range((p), sizeof(*(p)), (f))
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+typedef union {
+  struct cmse_address_info {
+#ifdef __ARM_BIG_ENDIAN
+    /* __ARM_BIG_ENDIAN */
+#if (__ARM_CMSE_SECURE_MODE)
+    unsigned idau_region : 8;
+    unsigned idau_region_valid : 1;
+    unsigned secure : 1;
+    unsigned nonsecure_readwrite_ok : 1;
+    unsigned nonsecure_read_ok : 1;
+#else
+    unsigned : 12;
+#endif
+    unsigned readwrite_ok : 1;
+    unsigned read_ok : 1;
+#if (__ARM_CMSE_SECURE_MODE)
+    unsigned sau_region_valid : 1;
+#else
+    unsigned : 1;
+#endif
+    unsigned mpu_region_valid : 1;
+#if (__ARM_CMSE_SECURE_MODE)
+    unsigned sau_region : 8;
+#else
+    unsigned : 8;
+#endif
+    unsigned mpu_region : 8;
+
+#else /* __ARM_LITTLE_ENDIAN */
+    unsigned mpu_region : 8;
+#if (__ARM_CMSE_SECURE_MODE)
+    unsigned sau_region : 8;
+#else
+    unsigned : 8;
+#endif
+    unsigned mpu_region_valid : 1;
+#if (__ARM_CMSE_SECURE_MODE)
+    unsigned sau_region_valid : 1;
+#else
+    unsigned : 1;
+#endif
+    unsigned read_ok : 1;
+    unsigned readwrite_ok : 1;
+#if (__ARM_CMSE_SECURE_MODE)
+    unsigned nonsecure_read_ok : 1;
+    unsigned nonsecure_readwrite_ok : 1;
+    unsigned secure : 1;
+    unsigned idau_region_valid : 1;
+    unsigned idau_region : 8;
+#else
+    unsigned : 12;
+#endif
+#endif /*__ARM_LITTLE_ENDIAN */
+  } flags;
+  unsigned value;
+} cmse_address_info_t;
+
+static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
+cmse_TT(void *__p) {
+  cmse_address_info_t __u;
+  __u.value = __builtin_arm_cmse_TT(__p);
+  return __u;
+}
+static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
+cmse_TTT(void *__p) {
+  cmse_address_info_t __u;
+  __u.value = __builtin_arm_cmse_TTT(__p);
+  return __u;
+}
+
+#if __ARM_CMSE_SECURE_MODE
+static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
+cmse_TTA(void *__p) {
+  cmse_address_info_t __u;
+  __u.value = __builtin_arm_cmse_TTA(__p);
+  return __u;
+}
+static cmse_address_info_t __attribute__((__always_inline__, __nodebug__))
+cmse_TTAT(void *__p) {
+  cmse_address_info_t __u;
+  __u.value = __builtin_arm_cmse_TTAT(__p);
+  return __u;
+}
+#endif
+
+#define cmse_TT_fptr(p) cmse_TT(__builtin_bit_cast(void *, (p)))
+#define cmse_TTT_fptr(p) cmse_TTT(__builtin_bit_cast(void *, (p)))
+
+#if __ARM_CMSE_SECURE_MODE
+#define cmse_TTA_fptr(p) cmse_TTA(__builtin_bit_cast(void *, (p)))
+#define cmse_TTAT_fptr(p) cmse_TTAT(__builtin_bit_cast(void *, (p)))
+#endif
+
+static void *__attribute__((__always_inline__))
+cmse_check_address_range(void *__pb, size_t __s, int __flags) {
+  uintptr_t __begin = (uintptr_t)__pb;
+  uintptr_t __end = __begin + __s - 1;
+
+  if (__end < __begin)
+    return NULL; /* wrap around check */
+
+  /* Check whether the range crosses a 32-bytes aligned address */
+  const int __single_check = (__begin ^ __end) < 0x20u;
+
+  /* execute the right variant of the TT instructions */
+  void *__pe = (void *)__end;
+  cmse_address_info_t __permb, __perme;
+  switch (__flags & (CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
+  case 0:
+    __permb = cmse_TT(__pb);
+    __perme = __single_check ? __permb : cmse_TT(__pe);
+    break;
+  case CMSE_MPU_UNPRIV:
+    __permb = cmse_TTT(__pb);
+    __perme = __single_check ? __permb : cmse_TTT(__pe);
+    break;
+#if __ARM_CMSE_SECURE_MODE
+  case CMSE_MPU_NONSECURE:
+    __permb = cmse_TTA(__pb);
+    __perme = __single_check ? __permb : cmse_TTA(__pe);
+    break;
+  case CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE:
+    __permb = cmse_TTAT(__pb);
+    __perme = __single_check ? __permb : cmse_TTAT(__pe);
+    break;
+#endif
+  /* if CMSE_NONSECURE is specified w/o __ARM_CMSE_SECURE_MODE */
+  default:
+    return NULL;
+  }
+
+  /* check that the range does not cross MPU, SAU, or IDAU region boundaries */
+  if (__permb.value != __perme.value)
+    return NULL;
+#if !(__ARM_CMSE_SECURE_MODE)
+  /* CMSE_AU_NONSECURE is only supported when __ARM_FEATURE_CMSE & 0x2 */
+  if (__flags & CMSE_AU_NONSECURE)
+    return NULL;
+#endif
+
+  /* check the permission on the range */
+  switch (__flags & ~(CMSE_MPU_UNPRIV | CMSE_MPU_NONSECURE)) {
+#if (__ARM_CMSE_SECURE_MODE)
+  case CMSE_MPU_READ | CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
+  case CMSE_MPU_READWRITE | CMSE_AU_NONSECURE:
+    return __permb.flags.nonsecure_readwrite_ok ? __pb : NULL;
+
+  case CMSE_MPU_READ | CMSE_AU_NONSECURE:
+    return __permb.flags.nonsecure_read_ok ? __pb : NULL;
+
+  case CMSE_AU_NONSECURE:
+    return __permb.flags.secure ? NULL : __pb;
+#endif
+  case CMSE_MPU_READ | CMSE_MPU_READWRITE:
+  case CMSE_MPU_READWRITE:
+    return __permb.flags.readwrite_ok ? __pb : NULL;
+
+  case CMSE_MPU_READ:
+    return __permb.flags.read_ok ? __pb : NULL;
+
+  default:
+    return NULL;
+  }
+}
+
+#if __ARM_CMSE_SECURE_MODE
+static int __attribute__((__always_inline__, __nodebug__))
+cmse_nonsecure_caller(void) {
+  return !((uintptr_t)__builtin_return_address(0) & 1);
+}
+
+#define cmse_nsfptr_create(p) \
+  __builtin_bit_cast(__typeof__(p), \
+                     (__builtin_bit_cast(uintptr_t, p) & ~(uintptr_t)1))
+
+#define cmse_is_nsfptr(p) ((__builtin_bit_cast(uintptr_t, p) & 1) == 0)
+
+#endif /* __ARM_CMSE_SECURE_MODE */
+
+void __attribute__((__noreturn__)) cmse_abort(void);
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* (__ARM_FEATURE_CMSE & 0x1) */
+
+#endif /* __ARM_CMSE_H */
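As an illustrative sketch only (not part of this commit), secure-side code could use cmse_check_pointed_object from the new header to validate a pointer handed over by the non-secure world before dereferencing it; the function name is hypothetical and this assumes an Armv8-M target built with CMSE support:

#include <arm_cmse.h>

int secure_read(int *ns_obj) {
  int *p = cmse_check_pointed_object(ns_obj, CMSE_NONSECURE | CMSE_MPU_READ);
  if (p == NULL)
    return -1; /* object is not readable by the non-secure caller */
  return *p;
}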
lib/include/arm_fp16.h (983 changes, vendored): file diff suppressed because it is too large
lib/include/arm_mve.h (12563 changes, vendored, new file): file diff suppressed because it is too large
lib/include/arm_neon.h (16829 changes, vendored): file diff suppressed because it is too large
lib/include/avx512bwintrin.h (12 changes, vendored)
@@ -1731,13 +1731,13 @@ _mm512_loadu_epi16 (void const *__P)
   struct __loadu_epi16 {
     __m512i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_epi16*)__P)->__v;
+  return ((const struct __loadu_epi16*)__P)->__v;
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
                                                      (__v32hi) __W,
                                                      (__mmask32) __U);
 }
@@ -1745,7 +1745,7 @@ _mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P,
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
                                                      (__v32hi)
                                                      _mm512_setzero_si512 (),
                                                      (__mmask32) __U);
@@ -1757,13 +1757,13 @@ _mm512_loadu_epi8 (void const *__P)
   struct __loadu_epi8 {
     __m512i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_epi8*)__P)->__v;
+  return ((const struct __loadu_epi8*)__P)->__v;
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
                                                      (__v64qi) __W,
                                                      (__mmask64) __U);
 }
@@ -1771,7 +1771,7 @@ _mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
 {
-  return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P,
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
                                                      (__v64qi)
                                                      _mm512_setzero_si512 (),
                                                      (__mmask64) __U);
lib/include/avx512fintrin.h (55 changes, vendored)
@@ -4305,7 +4305,7 @@ _mm512_loadu_si512 (void const *__P)
   struct __loadu_si512 {
     __m512i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_si512*)__P)->__v;
+  return ((const struct __loadu_si512*)__P)->__v;
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -4314,7 +4314,7 @@ _mm512_loadu_epi32 (void const *__P)
   struct __loadu_epi32 {
     __m512i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_epi32*)__P)->__v;
+  return ((const struct __loadu_epi32*)__P)->__v;
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -4341,7 +4341,7 @@ _mm512_loadu_epi64 (void const *__P)
   struct __loadu_epi64 {
     __m512i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_epi64*)__P)->__v;
+  return ((const struct __loadu_epi64*)__P)->__v;
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
@@ -4401,7 +4401,7 @@ _mm512_loadu_pd(void const *__p)
   struct __loadu_pd {
     __m512d_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_pd*)__p)->__v;
+  return ((const struct __loadu_pd*)__p)->__v;
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
@@ -4410,13 +4410,13 @@ _mm512_loadu_ps(void const *__p)
   struct __loadu_ps {
     __m512_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_ps*)__p)->__v;
+  return ((const struct __loadu_ps*)__p)->__v;
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_load_ps(void const *__p)
 {
-  return *(__m512*)__p;
+  return *(const __m512*)__p;
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
@@ -4439,7 +4439,7 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
 static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_load_pd(void const *__p)
 {
-  return *(__m512d*)__p;
+  return *(const __m512d*)__p;
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS512
@@ -4462,19 +4462,19 @@ _mm512_maskz_load_pd(__mmask8 __U, void const *__P)
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_load_si512 (void const *__P)
 {
-  return *(__m512i *) __P;
+  return *(const __m512i *) __P;
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_load_epi32 (void const *__P)
 {
-  return *(__m512i *) __P;
+  return *(const __m512i *) __P;
 }
 
 static __inline __m512i __DEFAULT_FN_ATTRS512
 _mm512_load_epi64 (void const *__P)
 {
-  return *(__m512i *) __P;
+  return *(const __m512i *) __P;
 }
 
 /* SIMD store ops */
@@ -7658,13 +7658,13 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
 #define _mm512_i32gather_ps(index, addr, scale) \
   (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
                                        (void const *)(addr), \
-                                       (__v16sf)(__m512)(index), \
+                                       (__v16si)(__m512)(index), \
                                        (__mmask16)-1, (int)(scale))
 
 #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
   (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
                                        (void const *)(addr), \
-                                       (__v16sf)(__m512)(index), \
+                                       (__v16si)(__m512)(index), \
                                        (__mmask16)(mask), (int)(scale))
 
 #define _mm512_i32gather_epi32(index, addr, scale) \
@@ -8436,7 +8436,7 @@ _store_mask16(__mmask16 *__A, __mmask16 __B) {
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_si512 (__m512i * __P, __m512i __A)
+_mm512_stream_si512 (void * __P, __m512i __A)
 {
   typedef __v8di __v8di_aligned __attribute__((aligned(64)));
   __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
@@ -8450,14 +8450,14 @@ _mm512_stream_load_si512 (void const *__P)
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_pd (double *__P, __m512d __A)
+_mm512_stream_pd (void *__P, __m512d __A)
 {
   typedef __v8df __v8df_aligned __attribute__((aligned(64)));
   __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_ps (float *__P, __m512 __A)
+_mm512_stream_ps (void *__P, __m512 __A)
 {
   typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
   __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
@@ -8724,13 +8724,13 @@ _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
                                                 (__v4sf)_mm_setzero_ps(),
                                                 0, 4, 4, 4);
 
-  return (__m128) __builtin_ia32_loadss128_mask ((__v4sf *) __A, src, __U & 1);
+  return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maskz_load_ss (__mmask8 __U, const float* __A)
 {
-  return (__m128)__builtin_ia32_loadss128_mask ((__v4sf *) __A,
+  return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
                                                 (__v4sf) _mm_setzero_ps(),
                                                 __U & 1);
 }
@@ -8742,13 +8742,13 @@ _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
                                                  (__v2df)_mm_setzero_pd(),
                                                  0, 2);
 
-  return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A, src, __U & 1);
+  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maskz_load_sd (__mmask8 __U, const double* __A)
 {
-  return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A,
+  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
                                                   (__v2df) _mm_setzero_pd(),
                                                   __U & 1);
 }
@@ -9659,6 +9659,23 @@ _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
 }
 #undef _mm512_mask_reduce_operator
 
+/// Moves the least significant 32 bits of a vector of [16 x i32] to a
+/// 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __A
+///    A vector of [16 x i32]. The least significant 32 bits are moved to the
+///    destination.
+/// \returns A 32-bit signed integer containing the moved value.
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_cvtsi512_si32(__m512i __A) {
+  __v16si __b = (__v16si)__A;
+  return __b[0];
+}
+
 #undef __DEFAULT_FN_ATTRS512
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS
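As an illustrative sketch only (not part of this commit), the newly added _mm512_cvtsi512_si32 intrinsic extracts the low 32-bit lane of a 512-bit integer vector; the helper name below is hypothetical and assumes AVX-512F is enabled:

#include <immintrin.h>

static int low_lane_of_sum(__m512i a, __m512i b) {
  /* add lane-wise, then move the least significant 32-bit element to a scalar */
  return _mm512_cvtsi512_si32(_mm512_add_epi32(a, b));
}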
lib/include/avx512vlbwintrin.h (24 changes, vendored)
@@ -2289,13 +2289,13 @@ _mm_loadu_epi16 (void const *__P)
   struct __loadu_epi16 {
     __m128i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_epi16*)__P)->__v;
+  return ((const struct __loadu_epi16*)__P)->__v;
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
 {
-  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
                                                      (__v8hi) __W,
                                                      (__mmask8) __U);
 }
@@ -2303,7 +2303,7 @@ _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
 {
-  return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P,
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
                                                      (__v8hi)
                                                      _mm_setzero_si128 (),
                                                      (__mmask8) __U);
@@ -2315,13 +2315,13 @@ _mm256_loadu_epi16 (void const *__P)
   struct __loadu_epi16 {
     __m256i_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_epi16*)__P)->__v;
+  return ((const struct __loadu_epi16*)__P)->__v;
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
 {
-  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
                                                      (__v16hi) __W,
                                                      (__mmask16) __U);
 }
@@ -2329,7 +2329,7 @@ _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
 {
-  return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P,
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
                                                      (__v16hi)
                                                      _mm256_setzero_si256 (),
                                                      (__mmask16) __U);
@ -2341,13 +2341,13 @@ _mm_loadu_epi8 (void const *__P)
|
|||||||
struct __loadu_epi8 {
|
struct __loadu_epi8 {
|
||||||
__m128i_u __v;
|
__m128i_u __v;
|
||||||
} __attribute__((__packed__, __may_alias__));
|
} __attribute__((__packed__, __may_alias__));
|
||||||
return ((struct __loadu_epi8*)__P)->__v;
|
return ((const struct __loadu_epi8*)__P)->__v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
|
_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
|
||||||
{
|
{
|
||||||
return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
|
return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
|
||||||
(__v16qi) __W,
|
(__v16qi) __W,
|
||||||
(__mmask16) __U);
|
(__mmask16) __U);
|
||||||
}
|
}
|
||||||
@ -2355,7 +2355,7 @@ _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
|
|||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
|
_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
|
||||||
{
|
{
|
||||||
return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P,
|
return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
|
||||||
(__v16qi)
|
(__v16qi)
|
||||||
_mm_setzero_si128 (),
|
_mm_setzero_si128 (),
|
||||||
(__mmask16) __U);
|
(__mmask16) __U);
|
||||||
@ -2367,13 +2367,13 @@ _mm256_loadu_epi8 (void const *__P)
|
|||||||
struct __loadu_epi8 {
|
struct __loadu_epi8 {
|
||||||
__m256i_u __v;
|
__m256i_u __v;
|
||||||
} __attribute__((__packed__, __may_alias__));
|
} __attribute__((__packed__, __may_alias__));
|
||||||
return ((struct __loadu_epi8*)__P)->__v;
|
return ((const struct __loadu_epi8*)__P)->__v;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
|
_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
|
||||||
{
|
{
|
||||||
return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
|
return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
|
||||||
(__v32qi) __W,
|
(__v32qi) __W,
|
||||||
(__mmask32) __U);
|
(__mmask32) __U);
|
||||||
}
|
}
|
||||||
@ -2381,7 +2381,7 @@ _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
|
|||||||
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
static __inline__ __m256i __DEFAULT_FN_ATTRS256
|
||||||
_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
|
_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
|
||||||
{
|
{
|
||||||
return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P,
|
return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
|
||||||
(__v32qi)
|
(__v32qi)
|
||||||
_mm256_setzero_si256 (),
|
_mm256_setzero_si256 (),
|
||||||
(__mmask32) __U);
|
(__mmask32) __U);
|
||||||
|
|||||||
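A usage sketch of the byte/word masked loads touched above (my own illustration with a hypothetical helper; assumes AVX512BW+VL). Passing a pointer-to-const source is exactly the case the added const casts keep warning-free:

/* Sketch only: lanes with a clear mask bit are zeroed and not read. */
#include <immintrin.h>

static __m128i load_masked_words(const short *src, __mmask8 m) {
  return _mm_maskz_loadu_epi16(m, src);
}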
112 lib/include/avx512vlintrin.h vendored

@@ -2505,7 +2505,7 @@ _mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
-  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
@@ -2513,7 +2513,7 @@ _mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
-  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
@@ -2522,7 +2522,7 @@ _mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
-  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
@@ -2530,7 +2530,7 @@ _mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
-  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
@@ -2539,7 +2539,7 @@ _mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
@@ -2547,7 +2547,7 @@ _mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
@@ -2557,7 +2557,7 @@ _mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
-  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
@@ -2565,7 +2565,7 @@ _mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
-  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
@@ -2574,14 +2574,14 @@ _mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
-  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
-  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
@@ -2590,14 +2590,14 @@ _mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
-  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
-  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
@@ -2606,7 +2606,7 @@ _mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
@@ -2614,7 +2614,7 @@ _mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
-  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
@@ -2623,7 +2623,7 @@ _mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
-  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
@@ -2631,7 +2631,7 @@ _mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
-  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
@@ -5073,13 +5073,13 @@ _mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
-  return *(__m128i *) __P;
+  return *(const __m128i *) __P;
-  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
@@ -5088,7 +5088,7 @@ _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
-  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
@@ -5098,13 +5098,13 @@ _mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
-  return *(__m256i *) __P;
+  return *(const __m256i *) __P;
-  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
@@ -5113,7 +5113,7 @@ _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
-  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
@@ -5183,13 +5183,13 @@ _mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
-  return *(__m128i *) __P;
+  return *(const __m128i *) __P;
-  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
@@ -5198,7 +5198,7 @@ _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
-  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
@@ -5208,13 +5208,13 @@ _mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
-  return *(__m256i *) __P;
+  return *(const __m256i *) __P;
-  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
@@ -5223,7 +5223,7 @@ _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
-  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
@@ -5430,7 +5430,7 @@ _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
-  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+  return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
@@ -5438,7 +5438,7 @@ _mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
-  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+  return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
@@ -5447,7 +5447,7 @@ _mm_maskz_load_pd (__mmask8 __U, void const *__P)
-  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+  return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
@@ -5455,7 +5455,7 @@ _mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
-  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+  return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
@@ -5464,7 +5464,7 @@ _mm256_maskz_load_pd (__mmask8 __U, void const *__P)
-  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+  return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
@@ -5472,7 +5472,7 @@ _mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
-  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+  return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
@@ -5481,7 +5481,7 @@ _mm_maskz_load_ps (__mmask8 __U, void const *__P)
-  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+  return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
@@ -5489,7 +5489,7 @@ _mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
-  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+  return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
@@ -5501,13 +5501,13 @@ _mm_loadu_epi64 (void const *__P)
-  return ((struct __loadu_epi64*)__P)->__v;
+  return ((const struct __loadu_epi64*)__P)->__v;
-  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
@@ -5515,7 +5515,7 @@ _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
-  return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P,
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
@@ -5527,13 +5527,13 @@ _mm256_loadu_epi64 (void const *__P)
-  return ((struct __loadu_epi64*)__P)->__v;
+  return ((const struct __loadu_epi64*)__P)->__v;
-  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
@@ -5541,7 +5541,7 @@ _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
-  return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P,
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
@@ -5553,13 +5553,13 @@ _mm_loadu_epi32 (void const *__P)
-  return ((struct __loadu_epi32*)__P)->__v;
+  return ((const struct __loadu_epi32*)__P)->__v;
-  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
@@ -5567,7 +5567,7 @@ _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
-  return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P,
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
@@ -5579,13 +5579,13 @@ _mm256_loadu_epi32 (void const *__P)
-  return ((struct __loadu_epi32*)__P)->__v;
+  return ((const struct __loadu_epi32*)__P)->__v;
-  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
@@ -5593,7 +5593,7 @@ _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
-  return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P,
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
@@ -5602,7 +5602,7 @@ _mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
-  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+  return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
@@ -5610,7 +5610,7 @@ _mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
-  return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P,
+  return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
@@ -5619,7 +5619,7 @@ _mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
-  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+  return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
@@ -5627,7 +5627,7 @@ _mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
-  return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P,
+  return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
@@ -5636,7 +5636,7 @@ _mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
-  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+  return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
@@ -5644,7 +5644,7 @@ _mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
-  return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P,
+  return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
@@ -5653,7 +5653,7 @@ _mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
-  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+  return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
@@ -5661,7 +5661,7 @@ _mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
-  return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P,
+  return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
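A usage sketch of one of the masked loads whose pointer cast gained const above (my own illustration with a hypothetical helper; assumes AVX512VL):

/* Sketch only: lanes selected by m come from memory, the rest keep fallback. */
#include <immintrin.h>

static __m256d blend_from_memory(__m256d fallback, __mmask8 m, const double *p) {
  return _mm256_mask_loadu_pd(fallback, m, p);
}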
10 lib/include/avxintrin.h vendored

@@ -3069,7 +3069,7 @@ _mm256_broadcast_ps(__m128 const *__a)
-  return *(__m256d *)__p;
+  return *(const __m256d *)__p;
@@ -3085,7 +3085,7 @@ _mm256_load_pd(double const *__p)
-  return *(__m256 *)__p;
+  return *(const __m256 *)__p;
@@ -3105,7 +3105,7 @@ _mm256_loadu_pd(double const *__p)
-  return ((struct __loadu_pd*)__p)->__v;
+  return ((const struct __loadu_pd*)__p)->__v;
@@ -3125,7 +3125,7 @@ _mm256_loadu_ps(float const *__p)
-  return ((struct __loadu_ps*)__p)->__v;
+  return ((const struct __loadu_ps*)__p)->__v;
@@ -3161,7 +3161,7 @@ _mm256_loadu_si256(__m256i_u const *__p)
-  return ((struct __loadu_si256*)__p)->__v;
+  return ((const struct __loadu_si256*)__p)->__v;
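A usage sketch for the AVX loads above (my own illustration with a hypothetical helper; assumes an AVX target). Loading through a pointer-to-const is the pattern the added const casts keep clean:

/* Sketch only: unaligned loads from read-only buffers. */
#include <immintrin.h>

static __m256 sum_eight(const float *a, const float *b) {
  return _mm256_add_ps(_mm256_loadu_ps(a), _mm256_loadu_ps(b));
}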
175 lib/include/bmiintrin.h vendored

@@ -14,27 +14,13 @@
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
-#define _tzcnt_u16(a) (__tzcnt_u16((a)))
-
-#define _andn_u32(a, b) (__andn_u32((a), (b)))
-
-/* _bextr_u32 != __bextr_u32 */
-#define _blsi_u32(a) (__blsi_u32((a)))
-
-#define _blsmsk_u32(a) (__blsmsk_u32((a)))
-
-#define _blsr_u32(a) (__blsr_u32((a)))
-
-#define _tzcnt_u32(a) (__tzcnt_u32((a)))
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
-
 /* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
    instruction behaves as BSF on non-BMI targets, there is code that expects
    to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#define _tzcnt_u16(a) (__tzcnt_u16((a)))
@@ -51,6 +37,94 @@ __tzcnt_u16(unsigned short __X)
   return __builtin_ia32_tzcnt_u16(__X);
 }
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+  return __builtin_ia32_tzcnt_u32(__X);
+}
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An 32-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+  return __builtin_ia32_tzcnt_u32(__X);
+}
+
+#define _tzcnt_u32(a) (__tzcnt_u32((a)))
+
+#ifdef __x86_64__
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+///    bits in the operand.
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+  return __builtin_ia32_tzcnt_u64(__X);
+}
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+///    An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An 64-bit integer containing the number of trailing zero bits in
+///    the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+  return __builtin_ia32_tzcnt_u64(__X);
+}
+
+#define _tzcnt_u64(a) (__tzcnt_u64((a)))
+
+#endif /* __x86_64__ */
+
+#undef __RELAXED_FN_ATTRS
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+#define _andn_u32(a, b) (__andn_u32((a), (b)))
+
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a) (__blsi_u32((a)))
+
+#define _blsmsk_u32(a) (__blsmsk_u32((a)))
+
+#define _blsr_u32(a) (__blsr_u32((a)))
+
 /// Performs a bitwise AND of the second operand with the one's
 /// complement of the first operand.
@@ -169,38 +243,6 @@ __blsr_u32(unsigned int __X)
   return __X & (__X - 1);
 }
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of trailing zero
-///    bits in the operand.
-static __inline__ unsigned int __RELAXED_FN_ATTRS
-__tzcnt_u32(unsigned int __X)
-{
-  return __builtin_ia32_tzcnt_u32(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An 32-bit integer containing the number of trailing zero bits in
-///    the operand.
-static __inline__ int __RELAXED_FN_ATTRS
-_mm_tzcnt_32(unsigned int __X)
-{
-  return __builtin_ia32_tzcnt_u32(__X);
-}
-
 #ifdef __x86_64__
 #define _andn_u64(a, b) (__andn_u64((a), (b)))
@@ -212,8 +254,6 @@ _mm_tzcnt_32(unsigned int __X)
 #define _blsr_u64(a) (__blsr_u64((a)))
-
-#define _tzcnt_u64(a) (__tzcnt_u64((a)))
@@ -332,41 +372,10 @@ __blsr_u64(unsigned long long __X)
   return __X & (__X - 1);
 }
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of trailing zero
-///    bits in the operand.
-static __inline__ unsigned long long __RELAXED_FN_ATTRS
-__tzcnt_u64(unsigned long long __X)
-{
-  return __builtin_ia32_tzcnt_u64(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An 64-bit integer containing the number of trailing zero bits in
-///    the operand.
-static __inline__ long long __RELAXED_FN_ATTRS
-_mm_tzcnt_64(unsigned long long __X)
-{
-  return __builtin_ia32_tzcnt_u64(__X);
-}
-
 #endif /* __x86_64__ */
 #undef __DEFAULT_FN_ATTRS
-#undef __RELAXED_FN_ATTRS
+
+#endif /* !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) */
 #endif /* __BMIINTRIN_H */
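With the tzcnt helpers moved out of the BMI-gated region, they can be called even in builds without -mbmi. A sketch of that use (my own illustration with a hypothetical helper):

/* Sketch only: on non-BMI CPUs TZCNT executes as BSF, so guard the zero case. */
#include <x86intrin.h>

static unsigned lowest_set_bit_index(unsigned mask) {
  return mask ? __tzcnt_u32(mask) : 32u;
}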
4 lib/include/cpuid.h vendored

@@ -38,8 +38,8 @@
 #define signature_TM2_ecx 0x3638784d
 /* NSC: "Geode by NSC" */
 #define signature_NSC_ebx 0x646f6547
-#define signature_NSC_edx 0x43534e20
+#define signature_NSC_edx 0x79622065
-#define signature_NSC_ecx 0x79622065
+#define signature_NSC_ecx 0x43534e20
 /* NEXGEN: "NexGenDriven" */
 #define signature_NEXGEN_ebx 0x4778654e
 #define signature_NEXGEN_edx 0x72446e65
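The swap matters because CPUID vendor-id characters come back in EBX, EDX, ECX order. A sketch of a vendor check using these constants (my own illustration with a hypothetical helper):

/* Sketch only: leaf 0 returns "Geod" in EBX, "e by" in EDX, " NSC" in ECX. */
#include <cpuid.h>

static int is_geode_by_nsc(void) {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
    return 0;
  return ebx == signature_NSC_ebx && edx == signature_NSC_edx &&
         ecx == signature_NSC_ecx;
}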
28 lib/include/emmintrin.h vendored

@@ -1578,7 +1578,7 @@ _mm_cvtsd_f64(__m128d __a)
-  return *(__m128d*)__dp;
+  return *(const __m128d*)__dp;
@@ -1599,7 +1599,7 @@ _mm_load1_pd(double const *__dp)
-  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
+  double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
@@ -1622,7 +1622,7 @@ _mm_load1_pd(double const *__dp)
-  __m128d __u = *(__m128d*)__dp;
+  __m128d __u = *(const __m128d*)__dp;
@@ -1643,7 +1643,7 @@ _mm_loadu_pd(double const *__dp)
-  return ((struct __loadu_pd*)__dp)->__v;
+  return ((const struct __loadu_pd*)__dp)->__v;
@@ -1663,7 +1663,7 @@ _mm_loadu_si64(void const *__a)
-  long long __u = ((struct __loadu_si64*)__a)->__v;
+  long long __u = ((const struct __loadu_si64*)__a)->__v;
@@ -1684,7 +1684,7 @@ _mm_loadu_si32(void const *__a)
-  int __u = ((struct __loadu_si32*)__a)->__v;
+  int __u = ((const struct __loadu_si32*)__a)->__v;
@@ -1705,7 +1705,7 @@ _mm_loadu_si16(void const *__a)
-  short __u = ((struct __loadu_si16*)__a)->__v;
+  short __u = ((const struct __loadu_si16*)__a)->__v;
@@ -1726,7 +1726,7 @@ _mm_load_sd(double const *__dp)
-  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
+  double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
@@ -1753,7 +1753,7 @@ _mm_loadh_pd(__m128d __a, double const *__dp)
-  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
+  double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
@@ -1780,7 +1780,7 @@ _mm_loadl_pd(__m128d __a, double const *__dp)
-  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
+  double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
@@ -2288,7 +2288,7 @@ _mm_adds_epu16(__m128i __a, __m128i __b)
-/// Computes the rounded avarages of corresponding elements of two
+/// Computes the rounded averages of corresponding elements of two
@@ -2308,7 +2308,7 @@ _mm_avg_epu8(__m128i __a, __m128i __b)
-/// Computes the rounded avarages of corresponding elements of two
+/// Computes the rounded averages of corresponding elements of two
@@ -3550,7 +3550,7 @@ _mm_loadu_si128(__m128i_u const *__p)
-  return ((struct __loadu_si128*)__p)->__v;
+  return ((const struct __loadu_si128*)__p)->__v;
@@ -3571,7 +3571,7 @@ _mm_loadl_epi64(__m128i_u const *__p)
-  return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+  return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
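A usage sketch for the SSE2 loads touched above (my own illustration with a hypothetical helper): a const source buffer now flows through the unaligned-load helpers without the headers casting const away internally.

/* Sketch only: load one int into the low lane, upper lanes zeroed. */
#include <emmintrin.h>

static __m128i widen_one_int(const int *p) {
  return _mm_loadu_si32(p); /* { *p, 0, 0, 0 } */
}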
68
lib/include/ia32intrin.h
vendored
68
lib/include/ia32intrin.h
vendored
@ -195,6 +195,74 @@ __writeeflags(unsigned int __f)
 }
 #endif /* !__x86_64__ */
 
+/** Cast a 32-bit float value to a 32-bit unsigned integer value
+ *
+ *  \headerfile <x86intrin.h>
+ *  This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction in x86_64,
+ *  and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32.
+ *
+ *  \param __A
+ *     A 32-bit float value.
+ *  \returns a 32-bit unsigned integer containing the converted value.
+ */
+static __inline__ unsigned int __attribute__((__always_inline__))
+_castf32_u32(float __A) {
+  unsigned int D;
+  __builtin_memcpy(&D, &__A, sizeof(__A));
+  return D;
+}
+
+/** Cast a 64-bit float value to a 64-bit unsigned integer value
+ *
+ *  \headerfile <x86intrin.h>
+ *  This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
+ *  and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32.
+ *
+ *  \param __A
+ *     A 64-bit float value.
+ *  \returns a 64-bit unsigned integer containing the converted value.
+ */
+static __inline__ unsigned long long __attribute__((__always_inline__))
+_castf64_u64(double __A) {
+  unsigned long long D;
+  __builtin_memcpy(&D, &__A, sizeof(__A));
+  return D;
+}
+
+/** Cast a 32-bit unsigned integer value to a 32-bit float value
+ *
+ *  \headerfile <x86intrin.h>
+ *  This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
+ *  and corresponds to the <c> FLDS </c> instruction in ia32.
+ *
+ *  \param __A
+ *     A 32-bit unsigned integer value.
+ *  \returns a 32-bit float value containing the converted value.
+ */
+static __inline__ float __attribute__((__always_inline__))
+_castu32_f32(unsigned int __A) {
+  float D;
+  __builtin_memcpy(&D, &__A, sizeof(__A));
+  return D;
+}
+
+/** Cast a 64-bit unsigned integer value to a 64-bit float value
+ *
+ *  \headerfile <x86intrin.h>
+ *  This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
+ *  and corresponds to the <c> FLDL </c> instruction in ia32.
+ *
+ *  \param __A
+ *     A 64-bit unsigned integer value.
+ *  \returns a 64-bit float value containing the converted value.
+ */
+static __inline__ double __attribute__((__always_inline__))
+_castu64_f64(unsigned long long __A) {
+  double D;
+  __builtin_memcpy(&D, &__A, sizeof(__A));
+  return D;
+}
+
 /** Adds the unsigned integer operand to the CRC-32C checksum of the
  *  unsigned char operand.
  *
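The four `_cast*` intrinsics added above are bit-pattern casts implemented with `__builtin_memcpy`, so they are well defined even under strict aliasing. A small illustrative sketch (assumes an x86 target; the variable names are made up):

#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  unsigned int bits = _castf32_u32(1.0f);             /* IEEE-754 pattern 0x3f800000 */
  float negated = _castu32_f32(bits ^ 0x80000000u);   /* flip the sign bit: -1.0f */
  printf("%#x %f\n", bits, negated);
  return 0;
}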
9
lib/include/immintrin.h
vendored
@ -64,9 +64,8 @@
 #include <vpclmulqdqintrin.h>
 #endif
 
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
+/* No feature check desired due to internal checks */
 #include <bmiintrin.h>
-#endif
 
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
 #include <bmi2intrin.h>
@ -302,7 +301,7 @@ _loadbe_i16(void const * __P) {
   struct __loadu_i16 {
     short __v;
   } __attribute__((__packed__, __may_alias__));
-  return __builtin_bswap16(((struct __loadu_i16*)__P)->__v);
+  return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
@ -318,7 +317,7 @@ _loadbe_i32(void const * __P) {
   struct __loadu_i32 {
     int __v;
   } __attribute__((__packed__, __may_alias__));
-  return __builtin_bswap32(((struct __loadu_i32*)__P)->__v);
+  return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
@ -335,7 +334,7 @@ _loadbe_i64(void const * __P) {
   struct __loadu_i64 {
     long long __v;
   } __attribute__((__packed__, __may_alias__));
-  return __builtin_bswap64(((struct __loadu_i64*)__P)->__v);
+  return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
 }
 
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
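The `_loadbe_*` helpers wrap a packed, may_alias struct load plus a byte swap; with the const-qualified casts above they no longer discard qualifiers when given a pointer to const data. Illustrative sketch (assumes a little-endian x86 target built with -mmovbe; `read_be32` is a made-up name):

#include <immintrin.h>
#include <stdint.h>

/* Read a big-endian 32-bit value from an unaligned, read-only buffer. */
static uint32_t read_be32(const void *p) {
  return (uint32_t)_loadbe_i32(p);
}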
26
lib/include/intrin.h
vendored
@ -36,6 +36,12 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 
+#if __x86_64__
+#define __LPTRINT_TYPE__ __int64
+#else
+#define __LPTRINT_TYPE__ long
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -94,8 +100,7 @@ void __outword(unsigned short, unsigned short);
 void __outwordstring(unsigned short, unsigned short *, unsigned long);
 unsigned long __readcr0(void);
 unsigned long __readcr2(void);
-static __inline__
-unsigned long __readcr3(void);
+unsigned __LPTRINT_TYPE__ __readcr3(void);
 unsigned long __readcr4(void);
 unsigned long __readcr8(void);
 unsigned int __readdr(unsigned int);
@ -132,7 +137,7 @@ void __vmx_vmptrst(unsigned __int64 *);
 void __wbinvd(void);
 void __writecr0(unsigned int);
 static __inline__
-void __writecr3(unsigned int);
+void __writecr3(unsigned __INTPTR_TYPE__);
 void __writecr4(unsigned int);
 void __writecr8(unsigned int);
 void __writedr(unsigned int, unsigned int);
@ -164,7 +169,6 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
 long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
 __int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
 __int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
-void __cdecl _invpcid(unsigned int, void *);
 static __inline__ void
 __attribute__((__deprecated__("use other intrinsics or C++11 atomics instead")))
 _ReadBarrier(void);
@ -565,24 +569,26 @@ __readmsr(unsigned long __register) {
   __asm__ ("rdmsr" : "=d"(__edx), "=a"(__eax) : "c"(__register));
   return (((unsigned __int64)__edx) << 32) | (unsigned __int64)__eax;
 }
+#endif
 
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
+static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS
 __readcr3(void) {
-  unsigned long __cr3_val;
-  __asm__ __volatile__ ("mov %%cr3, %0" : "=q"(__cr3_val) : : "memory");
+  unsigned __LPTRINT_TYPE__ __cr3_val;
+  __asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
   return __cr3_val;
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
-__writecr3(unsigned int __cr3_val) {
-  __asm__ ("mov %0, %%cr3" : : "q"(__cr3_val) : "memory");
+__writecr3(unsigned __INTPTR_TYPE__ __cr3_val) {
+  __asm__ ("mov %0, %%cr3" : : "r"(__cr3_val) : "memory");
 }
-#endif
 
 #ifdef __cplusplus
 }
 #endif
 
+#undef __LPTRINT_TYPE__
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif /* __INTRIN_H */
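`__LPTRINT_TYPE__` makes __readcr3/__writecr3 traffic in a pointer-width integer (`__int64` on x86_64, `long` on 32-bit x86), so a 64-bit CR3 value is no longer truncated through `unsigned long` on LLP64 targets. CR3 is a privileged register, so the sketch below is ring-0 only and purely illustrative (`current_cr3` is a made-up name):

#include <intrin.h>

/* Illustrative, kernel-mode only: fetch the current page-table base from CR3. */
static unsigned __int64 current_cr3(void) {
  return __readcr3();   /* 64 bits wide when targeting x86_64 with clang's intrin.h */
}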
4
lib/include/mwaitxintrin.h
vendored
@ -17,9 +17,9 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
 static __inline__ void __DEFAULT_FN_ATTRS
-_mm_monitorx(void const * __p, unsigned __extensions, unsigned __hints)
+_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
 {
-  __builtin_ia32_monitorx((void *)__p, __extensions, __hints);
+  __builtin_ia32_monitorx(__p, __extensions, __hints);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
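_mm_monitorx now takes a plain `void *`, matching `__builtin_ia32_monitorx` directly instead of casting the const away. Illustrative busy-wait sketch (assumes an AMD target built with -mmwaitx; `wait_for_flag` is a made-up helper):

#include <x86intrin.h>

static void wait_for_flag(volatile int *flag) {
  while (*flag == 0) {
    _mm_monitorx((void *)flag, 0, 0);  /* arm the address monitor on this line */
    if (*flag == 0)
      _mm_mwaitx(0, 0, 0);             /* sleep until the monitored line is written */
  }
}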
2
lib/include/opencl-c-base.h
vendored
@ -406,7 +406,7 @@ typedef enum memory_order
 #define CLK_OUT_OF_RESOURCES                  -5
 
 #define CLK_NULL_QUEUE                        0
-#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
+#define CLK_NULL_EVENT (__builtin_astype(((__SIZE_MAX__)), clk_event_t))
 
 // execution model related definitions
 #define CLK_ENQUEUE_FLAGS_NO_WAIT             0x0
2
lib/include/pmmintrin.h
vendored
@ -263,7 +263,7 @@ _mm_movedup_pd(__m128d __a)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
 {
-  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
+  __builtin_ia32_monitor(__p, __extensions, __hints);
 }
 
 /// Used with the MONITOR instruction to wait while the processor is in
@ -35,6 +35,8 @@
 #ifndef EMMINTRIN_H_
 #define EMMINTRIN_H_
 
+#if defined(__linux__) && defined(__ppc64__)
+
 #include <altivec.h>
 
 /* We need definitions from the SSE header files. */
@ -1747,7 +1749,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B)
   lshift = vec_splat ((__v2du) __B, 0);
   shmask = vec_cmplt (lshift, shmax);
   result = vec_sl ((__v2du) __A, lshift);
-  result = vec_sel ((__v2du) shmask, result, shmask);
+  result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
 
   return (__m128i) result;
 }
@ -1841,7 +1843,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B)
   rshift = vec_splat ((__v2du) __B, 0);
   shmask = vec_cmplt (rshift, shmax);
   result = vec_sr ((__v2du) __A, rshift);
-  result = vec_sel ((__v2du) shmask, result, shmask);
+  result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
 
   return (__m128i) result;
 }
@ -2315,4 +2317,8 @@ _mm_castsi128_pd(__m128i __A)
   return (__m128d) __A;
 }
 
+#else
+#include_next <emmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
 #endif /* EMMINTRIN_H_ */
@ -10,6 +10,8 @@
 #ifndef _MM_MALLOC_H_INCLUDED
 #define _MM_MALLOC_H_INCLUDED
 
+#if defined(__linux__) && defined(__ppc64__)
+
 #include <stdlib.h>
 
 /* We can't depend on <stdlib.h> since the prototype of posix_memalign
@ -41,4 +43,8 @@ _mm_free (void * ptr)
   free (ptr);
 }
 
+#else
+#include_next <mm_malloc.h>
+#endif
+
 #endif /* _MM_MALLOC_H_INCLUDED */
@ -35,6 +35,8 @@
 #ifndef _MMINTRIN_H_INCLUDED
 #define _MMINTRIN_H_INCLUDED
 
+#if defined(__linux__) && defined(__ppc64__)
+
 #include <altivec.h>
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components. */
@ -1440,4 +1442,9 @@ extern __inline __m64
   return (res.as_m64);
 #endif
 }
+
+#else
+#include_next <mmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
 #endif /* _MMINTRIN_H_INCLUDED */
150
lib/include/ppc_wrappers/pmmintrin.h
Normal file
@ -0,0 +1,150 @@
/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However the Horizontal
   add/sub requires the data pairs be permuted into a separate
   registers with vertical even/odd alignment for the operation.
   And the addsub operation requires the sign of only the even numbered
   elements be flipped (xored with -0.0).
   For larger blocks of code using these intrinsic implementations,
   the compiler be should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions there are
   no direct equivalent in the PowerISA at this time.  So those
   intrinsics are not implemented.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif

#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps (__m128 __X, __m128 __Y)
{
  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128) vec_add (__X, even_neg_Y);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd (__m128d __X, __m128d __Y)
{
  const __v2df even_n0 = {-0.0, 0.0};
  __v2df even_neg_Y = vec_xor(__Y, even_n0);
  return (__m128d) vec_add (__X, even_neg_Y);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps (__m128 __X, __m128 __Y)
{
  __vector unsigned char xform2 = {
      0x00, 0x01, 0x02, 0x03,
      0x08, 0x09, 0x0A, 0x0B,
      0x10, 0x11, 0x12, 0x13,
      0x18, 0x19, 0x1A, 0x1B
    };
  __vector unsigned char xform1 = {
      0x04, 0x05, 0x06, 0x07,
      0x0C, 0x0D, 0x0E, 0x0F,
      0x14, 0x15, 0x16, 0x17,
      0x1C, 0x1D, 0x1E, 0x1F
    };
  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
                           vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
                            vec_mergel ((__v2df) __X, (__v2df)__Y));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd (__m128d __X, __m128d __Y)
{
  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
                            vec_mergel ((__v2df) __X, (__v2df)__Y));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
  return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
  return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
{
  return (__m128d) vec_splats (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd (__m128d __X)
{
  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128 (__m128i const *__P)
{
  return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */

#else
#include_next <pmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* PMMINTRIN_H_ */
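The new wrapper maps SSE3 onto VMX/VSX: horizontal add/sub becomes vec_perm into even/odd lanes followed by a vertical vec_add/vec_sub, as the header comment describes. Illustrative porting sketch (assumes a powerpc64le-linux build with -maltivec -mvsx -DNO_WARN_X86_INTRINSICS; `horizontal_sum4` is a made-up helper):

#include <pmmintrin.h>

static float horizontal_sum4(__m128 v) {
  __m128 t = _mm_hadd_ps(v, v);   /* pairwise sums */
  t = _mm_hadd_ps(t, t);          /* total replicated across lanes */
  float out[4];
  _mm_storeu_ps(out, t);
  return out[0];
}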
85
lib/include/ppc_wrappers/smmintrin.h
Normal file
@ -0,0 +1,85 @@
/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerp64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>
#include <emmintrin.h>

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
}

#else
#include_next <smmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* _SMMINTRIN_H_ */
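Here _mm_blendv_epi8 reduces to a vec_sra that smears each mask byte's sign bit plus a vec_sel. A small illustrative sketch of how it composes with the SSE2 wrapper it includes (same powerpc64le / -DNO_WARN_X86_INTRINSICS assumptions as above; `max_i8` is a made-up name):

#include <smmintrin.h>

/* Per-byte maximum via compare + blend (signed compare, fine for a demo). */
static __m128i max_i8(__m128i a, __m128i b) {
  __m128i gt = _mm_cmpgt_epi8(b, a);
  return _mm_blendv_epi8(a, b, gt);
}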
495
lib/include/ppc_wrappers/tmmintrin.h
Normal file
@ -0,0 +1,495 @@
/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#endif

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>

/* We need definitions from the SSE header files. */
#include <pmmintrin.h>

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu zero = { 0 };
          return (__m128i) zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* TMMINTRIN_H_ */
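In this wrapper _mm_shuffle_epi8 is a vec_perm followed by a vec_sel that zeroes lanes whose control byte has its high bit set, mirroring PSHUFB. Illustrative sketch (same powerpc64le / -DNO_WARN_X86_INTRINSICS assumptions as above; `bswap32x4` is a made-up helper):

#include <tmmintrin.h>

/* Byte-swap each 32-bit lane with a PSHUFB-style control vector. */
static __m128i bswap32x4(__m128i v) {
  const __m128i ctl = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
                                   4, 5, 6, 7, 0, 1, 2, 3);
  return _mm_shuffle_epi8(v, ctl);
}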
@ -34,6 +34,8 @@
 #ifndef _XMMINTRIN_H_INCLUDED
 #define _XMMINTRIN_H_INCLUDED
 
+#if defined(__linux__) && defined(__ppc64__)
+
 /* Define four value permute mask */
 #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
 
@ -1835,4 +1837,8 @@ do { \
 /* For backward source compatibility. */
 //# include <emmintrin.h>
 
+#else
+#include_next <xmmintrin.h>
+#endif /* defined(__linux__) && defined(__ppc64__) */
+
 #endif /* _XMMINTRIN_H_INCLUDED */
14
lib/include/xmmintrin.h
vendored
@ -1627,7 +1627,7 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p)
   struct __mm_loadh_pi_struct {
     __mm_loadh_pi_v2f32 __u;
   } __attribute__((__packed__, __may_alias__));
-  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
+  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
 }
@ -1654,7 +1654,7 @@ _mm_loadl_pi(__m128 __a, const __m64 *__p)
   struct __mm_loadl_pi_struct {
     __mm_loadl_pi_v2f32 __u;
   } __attribute__((__packed__, __may_alias__));
-  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
+  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
 }
@ -1680,7 +1680,7 @@ _mm_load_ss(const float *__p)
   struct __mm_load_ss_struct {
     float __u;
   } __attribute__((__packed__, __may_alias__));
-  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
+  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
   return __extension__ (__m128){ __u, 0, 0, 0 };
 }
 
@ -1702,7 +1702,7 @@ _mm_load1_ps(const float *__p)
   struct __mm_load1_ps_struct {
     float __u;
   } __attribute__((__packed__, __may_alias__));
-  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
+  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
   return __extension__ (__m128){ __u, __u, __u, __u };
 }
 
@ -1722,7 +1722,7 @@ _mm_load1_ps(const float *__p)
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_load_ps(const float *__p)
 {
-  return *(__m128*)__p;
+  return *(const __m128*)__p;
 }
 
 /// Loads a 128-bit floating-point vector of [4 x float] from an
@ -1742,7 +1742,7 @@ _mm_loadu_ps(const float *__p)
   struct __loadu_ps {
     __m128_u __v;
   } __attribute__((__packed__, __may_alias__));
-  return ((struct __loadu_ps*)__p)->__v;
+  return ((const struct __loadu_ps*)__p)->__v;
 }
 
 /// Loads four packed float values, in reverse order, from an aligned
@ -2100,7 +2100,7 @@ _mm_storer_ps(float *__p, __m128 __a)
 ///    be generated. \n
 ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
 ///    be generated.
-#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
+#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                               ((sel) >> 2) & 1, (sel) & 0x3))
 #endif
 
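With the `(const void *)` cast, _mm_prefetch can be handed a pointer to const data without casting qualifiers away. Illustrative sketch (a dot product that issues a software prefetch a little way ahead; `dot` is a made-up helper and the prefetch distance is arbitrary):

#include <xmmintrin.h>

static float dot(const float *a, const float *b, int n) {
  float acc = 0.0f;
  for (int i = 0; i < n; ++i) {
    if ((i & 15) == 0)
      _mm_prefetch(a + i + 64, _MM_HINT_T0);  /* fine: a points to const floats */
    acc += a[i] * b[i];
  }
  return acc;
}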