Mirror of https://github.com/ziglang/zig.git (synced 2025-12-06 14:23:09 +00:00)
update C headers to LLVM 15
release/15.x 37007475ca1b345b4c5d340e228bcd7a62732d81
This commit is contained in:
parent adb4a95302
commit d3389eadf4
lib/include/__clang_cuda_intrinsics.h (vendored): 12 changed lines

@@ -71,8 +71,8 @@
 } \
 inline __device__ unsigned long long __FnName( \
 unsigned long long __val, __Type __offset, int __width = warpSize) { \
-return static_cast<unsigned long long>(::__FnName( \
-static_cast<unsigned long long>(__val), __offset, __width)); \
+return static_cast<unsigned long long>( \
+::__FnName(static_cast<long long>(__val), __offset, __width)); \
 } \
 inline __device__ double __FnName(double __val, __Type __offset, \
 int __width = warpSize) { \
@@ -139,8 +139,8 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
 inline __device__ unsigned long long __FnName( \
 unsigned int __mask, unsigned long long __val, __Type __offset, \
 int __width = warpSize) { \
-return static_cast<unsigned long long>(::__FnName( \
-__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
+return static_cast<unsigned long long>( \
+::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
 } \
 inline __device__ long __FnName(unsigned int __mask, long __val, \
 __Type __offset, int __width = warpSize) { \
@@ -234,7 +234,7 @@ inline __device__ unsigned int __match32_any_sync(unsigned int mask,
 return __nvvm_match_any_sync_i32(mask, value);
 }

-inline __device__ unsigned long long
+inline __device__ unsigned int
 __match64_any_sync(unsigned int mask, unsigned long long value) {
 return __nvvm_match_any_sync_i64(mask, value);
 }
@@ -244,7 +244,7 @@ __match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
 return __nvvm_match_all_sync_i32p(mask, value, pred);
 }

-inline __device__ unsigned long long
+inline __device__ unsigned int
 __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
 return __nvvm_match_all_sync_i64p(mask, value, pred);
 }
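Note: the change above makes the unsigned 64-bit shuffle wrapper convert its argument to signed long long before delegating, so the inner call resolves to the signed overload instead of re-entering the unsigned wrapper, and only the result is converted back. A minimal sketch of that convert-call-convert pattern, with hypothetical function names standing in for the macro-generated overloads:

#include <stdio.h>

/* Hypothetical stand-in for the signed 64-bit shuffle primitive
 * (in the real header this is the long long overload of __FnName). */
static long long shuffle_ll(long long val) {
    return val; /* a real implementation would move data between lanes */
}

/* Unsigned wrapper: cast the value to signed for the call, then cast the
 * result back, mirroring the fixed macro body:
 *   static_cast<unsigned long long>(::__FnName(static_cast<long long>(__val), ...)) */
static unsigned long long shuffle_ull(unsigned long long val) {
    return (unsigned long long)shuffle_ll((long long)val);
}

int main(void) {
    printf("%llu\n", shuffle_ull(42ULL));
    return 0;
}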
lib/include/__wmmintrin_pclmul.h (vendored): 20 changed lines

@@ -22,23 +22,23 @@
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
 ///
-/// \param __X
+/// \param X
 /// A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __Y
+/// \param Y
 /// A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __I
+/// \param I
 /// An immediate value specifying which 64-bit values to select from the
-/// operands. Bit 0 is used to select a value from operand \a __X, and bit
-/// 4 is used to select a value from operand \a __Y: \n
-/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
-/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
-/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
-/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// operands. Bit 0 is used to select a value from operand \a X, and bit
+/// 4 is used to select a value from operand \a Y: \n
+/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
+/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
+/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
+/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
 /// \returns The 128-bit integer vector containing the result of the carry-less
 /// multiplication of the selected 64-bit values.
 #define _mm_clmulepi64_si128(X, Y, I) \
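Note: the documentation now spells the parameters as X, Y, and I, matching the actual macro definition. As a usage sketch of the documented signature (assumes an x86-64 compiler with PCLMUL support, e.g. clang -mpclmul):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Two [2 x i64] vectors; I = 0x00 selects the low 64-bit lane of each. */
    __m128i x = _mm_set_epi64x(0, 0x3); /* low qword: 0b11  */
    __m128i y = _mm_set_epi64x(0, 0x5); /* low qword: 0b101 */
    __m128i r = _mm_clmulepi64_si128(x, y, 0x00);

    /* Carry-less 0b11 * 0b101 = 0b1111. */
    printf("0x%llx\n", (unsigned long long)_mm_cvtsi128_si64(r));
    return 0;
}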
lib/include/altivec.h (vendored): 653 changed lines (file diff suppressed because it is too large)
lib/include/amxintrin.h (vendored): 2 changed lines

@@ -439,8 +439,6 @@ static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
 ///
 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 ///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
 /// \param base
 /// A pointer to base address.
 /// \param stride
lib/include/arm_sve.h (vendored): 122 changed lines

@@ -2407,15 +2407,15 @@ svuint64_t svcnt_s64_z(svbool_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_z)))
 svuint16_t svcnt_s16_z(svbool_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb)))
-uint64_t svcntb();
+uint64_t svcntb(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb_pat)))
 uint64_t svcntb_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd)))
-uint64_t svcntd();
+uint64_t svcntd(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd_pat)))
 uint64_t svcntd_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth)))
-uint64_t svcnth();
+uint64_t svcnth(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth_pat)))
 uint64_t svcnth_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b8)))
@@ -2427,7 +2427,7 @@ uint64_t svcntp_b64(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b16)))
 uint64_t svcntp_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw)))
-uint64_t svcntw();
+uint64_t svcntw(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw_pat)))
 uint64_t svcntw_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u32)))
@@ -6521,7 +6521,7 @@ int64_t svorv_s64(svbool_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
 int16_t svorv_s16(svbool_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
-svbool_t svpfalse_b();
+svbool_t svpfalse_b(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
 svbool_t svpfirst_b(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b8)))
@@ -6627,13 +6627,13 @@ svbool_t svptrue_pat_b64(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b16)))
 svbool_t svptrue_pat_b16(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b8)))
-svbool_t svptrue_b8();
+svbool_t svptrue_b8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b32)))
-svbool_t svptrue_b32();
+svbool_t svptrue_b32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b64)))
-svbool_t svptrue_b64();
+svbool_t svptrue_b64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b16)))
-svbool_t svptrue_b16();
+svbool_t svptrue_b16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
 svint8_t svqadd_n_s8(svint8_t, int8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
@@ -7011,7 +7011,7 @@ svint64_t svrbit_s64_z(svbool_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_z)))
 svint16_t svrbit_s16_z(svbool_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr)))
-svbool_t svrdffr();
+svbool_t svrdffr(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr_z)))
 svbool_t svrdffr_z(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f64)))
@@ -7411,7 +7411,7 @@ svint64x4_t svset4_s64(svint64x4_t, uint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s16)))
 svint16x4_t svset4_s16(svint16x4_t, uint64_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsetffr)))
-void svsetffr();
+void svsetffr(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u8)))
 svuint8_t svsplice_u8(svbool_t, svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u32)))
@@ -8285,93 +8285,93 @@ svfloat32_t svtssel_f32(svfloat32_t, svuint32_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f16)))
 svfloat16_t svtssel_f16(svfloat16_t, svuint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u8)))
-svuint8x2_t svundef2_u8();
+svuint8x2_t svundef2_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u32)))
-svuint32x2_t svundef2_u32();
+svuint32x2_t svundef2_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u64)))
-svuint64x2_t svundef2_u64();
+svuint64x2_t svundef2_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u16)))
-svuint16x2_t svundef2_u16();
+svuint16x2_t svundef2_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s8)))
-svint8x2_t svundef2_s8();
+svint8x2_t svundef2_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f64)))
-svfloat64x2_t svundef2_f64();
+svfloat64x2_t svundef2_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f32)))
-svfloat32x2_t svundef2_f32();
+svfloat32x2_t svundef2_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f16)))
-svfloat16x2_t svundef2_f16();
+svfloat16x2_t svundef2_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s32)))
-svint32x2_t svundef2_s32();
+svint32x2_t svundef2_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s64)))
-svint64x2_t svundef2_s64();
+svint64x2_t svundef2_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s16)))
-svint16x2_t svundef2_s16();
+svint16x2_t svundef2_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u8)))
-svuint8x3_t svundef3_u8();
+svuint8x3_t svundef3_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u32)))
-svuint32x3_t svundef3_u32();
+svuint32x3_t svundef3_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u64)))
-svuint64x3_t svundef3_u64();
+svuint64x3_t svundef3_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u16)))
-svuint16x3_t svundef3_u16();
+svuint16x3_t svundef3_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s8)))
-svint8x3_t svundef3_s8();
+svint8x3_t svundef3_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f64)))
-svfloat64x3_t svundef3_f64();
+svfloat64x3_t svundef3_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f32)))
-svfloat32x3_t svundef3_f32();
+svfloat32x3_t svundef3_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f16)))
-svfloat16x3_t svundef3_f16();
+svfloat16x3_t svundef3_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s32)))
-svint32x3_t svundef3_s32();
+svint32x3_t svundef3_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s64)))
-svint64x3_t svundef3_s64();
+svint64x3_t svundef3_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s16)))
-svint16x3_t svundef3_s16();
+svint16x3_t svundef3_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u8)))
-svuint8x4_t svundef4_u8();
+svuint8x4_t svundef4_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u32)))
-svuint32x4_t svundef4_u32();
+svuint32x4_t svundef4_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u64)))
-svuint64x4_t svundef4_u64();
+svuint64x4_t svundef4_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u16)))
-svuint16x4_t svundef4_u16();
+svuint16x4_t svundef4_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s8)))
-svint8x4_t svundef4_s8();
+svint8x4_t svundef4_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f64)))
-svfloat64x4_t svundef4_f64();
+svfloat64x4_t svundef4_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f32)))
-svfloat32x4_t svundef4_f32();
+svfloat32x4_t svundef4_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f16)))
-svfloat16x4_t svundef4_f16();
+svfloat16x4_t svundef4_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s32)))
-svint32x4_t svundef4_s32();
+svint32x4_t svundef4_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s64)))
-svint64x4_t svundef4_s64();
+svint64x4_t svundef4_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s16)))
-svint16x4_t svundef4_s16();
+svint16x4_t svundef4_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u8)))
-svuint8_t svundef_u8();
+svuint8_t svundef_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u32)))
-svuint32_t svundef_u32();
+svuint32_t svundef_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u64)))
-svuint64_t svundef_u64();
+svuint64_t svundef_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u16)))
-svuint16_t svundef_u16();
+svuint16_t svundef_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s8)))
-svint8_t svundef_s8();
+svint8_t svundef_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f64)))
-svfloat64_t svundef_f64();
+svfloat64_t svundef_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f32)))
-svfloat32_t svundef_f32();
+svfloat32_t svundef_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f16)))
-svfloat16_t svundef_f16();
+svfloat16_t svundef_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s32)))
-svint32_t svundef_s32();
+svint32_t svundef_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s64)))
-svint64_t svundef_s64();
+svint64_t svundef_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s16)))
-svint16_t svundef_s16();
+svint16_t svundef_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_b)))
 svbool_t svunpkhi_b(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s32)))
@@ -13830,8 +13830,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s64)))
 int64_t svorv(svbool_t, svint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
 int16_t svorv(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
-svbool_t svpfalse();
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
+svbool_t svpfalse(void);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
 svbool_t svpfirst(svbool_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
@@ -23456,13 +23456,13 @@ svbfloat16_t svtrn1_bf16(svbfloat16_t, svbfloat16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
 svbfloat16_t svtrn2_bf16(svbfloat16_t, svbfloat16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_bf16)))
-svbfloat16x2_t svundef2_bf16();
+svbfloat16x2_t svundef2_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_bf16)))
-svbfloat16x3_t svundef3_bf16();
+svbfloat16x3_t svundef3_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_bf16)))
-svbfloat16x4_t svundef4_bf16();
+svbfloat16x4_t svundef4_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_bf16)))
-svbfloat16_t svundef_bf16();
+svbfloat16_t svundef_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
 svbfloat16_t svuzp1_bf16(svbfloat16_t, svbfloat16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
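Note: every niladic SVE declaration above gains an explicit (void). In C before C23, an empty parameter list declares a function with unspecified parameters rather than a true prototype. A small standalone illustration of the difference, using hypothetical declarations rather than the SVE header itself:

#include <stdint.h>
#include <stdio.h>

/* Old style: uint64_t f(); is not a prototype in C (before C23); call sites
 * are not argument-checked and -Wstrict-prototypes warns about it. */
uint64_t count_bytes_old();

/* New style: an explicit (void) declares a real prototype with no parameters. */
uint64_t count_bytes_new(void);

uint64_t count_bytes_old() { return 16; }
uint64_t count_bytes_new(void) { return 16; }

int main(void) {
    /* count_bytes_old(123) would compile silently with the old declaration;
     * count_bytes_new(123) is rejected because the prototype takes no arguments. */
    printf("%llu\n", (unsigned long long)count_bytes_new());
    return 0;
}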
lib/include/avx2intrin.h (vendored): 16 changed lines

@@ -92,25 +92,25 @@ _mm256_add_epi64(__m256i __a, __m256i __b)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epi8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epi16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epu8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epu16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
 }

 #define _mm256_alignr_epi8(a, b, n) \
@@ -628,25 +628,25 @@ _mm256_sub_epi64(__m256i __a, __m256i __b)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epi8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epi16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epu8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epu16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
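Note: the AVX2 saturating add/subtract intrinsics above now lower through the generic __builtin_elementwise_add_sat / __builtin_elementwise_sub_sat builtins instead of target-specific ia32 builtins; the observable behavior is unchanged. For reference, a scalar sketch of what signed 8-bit saturating addition means (my own illustration, not taken from the header):

#include <stdio.h>

/* Signed 8-bit add that clamps to [-128, 127] instead of wrapping,
 * i.e. the per-lane operation behind _mm256_adds_epi8. */
static signed char add_sat_i8(signed char a, signed char b) {
    int sum = (int)a + (int)b;
    if (sum > 127)  return 127;
    if (sum < -128) return -128;
    return (signed char)sum;
}

int main(void) {
    printf("%d\n", add_sat_i8(100, 100));   /* 127, not -56 */
    printf("%d\n", add_sat_i8(-100, -100)); /* -128, not 56 */
    return 0;
}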
lib/include/avx512bwintrin.h (vendored): 24 changed lines

@@ -617,7 +617,7 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epi8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
+return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -639,7 +639,7 @@ _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epi16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
+return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -661,7 +661,7 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epu8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
+return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -683,7 +683,7 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epu16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
+return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -950,7 +950,7 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epi8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
+return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -972,7 +972,7 @@ _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epi16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
+return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -994,7 +994,7 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epu8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
+return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1016,7 +1016,7 @@ _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epu16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
+return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1506,7 +1506,7 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi16(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1598,7 +1598,7 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srai_epi16(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1643,7 +1643,7 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srli_epi16(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1659,7 +1659,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
 {
 return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-(__v32hi)_mm512_srli_epi16(__A, __B),
+(__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B),
 (__v32hi)_mm512_setzero_si512());
 }

lib/include/avx512fintrin.h (vendored): 127 changed lines

@@ -1780,7 +1780,7 @@ _mm512_floor_ps(__m512 __A)
 {
 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
 _MM_FROUND_FLOOR,
-(__v16sf) __A, -1,
+(__v16sf) __A, (unsigned short)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

@@ -1798,7 +1798,7 @@ _mm512_floor_pd(__m512d __A)
 {
 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
 _MM_FROUND_FLOOR,
-(__v8df) __A, -1,
+(__v8df) __A, (unsigned char)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

@@ -1825,7 +1825,7 @@ _mm512_ceil_ps(__m512 __A)
 {
 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
 _MM_FROUND_CEIL,
-(__v16sf) __A, -1,
+(__v16sf) __A, (unsigned short)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

@@ -1834,7 +1834,7 @@ _mm512_ceil_pd(__m512d __A)
 {
 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
 _MM_FROUND_CEIL,
-(__v8df) __A, -1,
+(__v8df) __A, (unsigned char)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

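Note: in the floor/ceil helpers above, the all-lanes mask argument -1 is now written with an explicit cast to the mask's width ((unsigned short)-1 for 16 float lanes, (unsigned char)-1 for 8 double lanes), which keeps the all-ones value while avoiding an implicit signed-to-unsigned conversion. A tiny standalone sketch, not header code, of why -1 with a narrowing cast still selects every lane:

#include <stdio.h>

int main(void) {
    /* Converting -1 to an unsigned 16-bit type yields 0xFFFF: one bit set
     * per lane of a 16-element vector, i.e. "apply the operation everywhere". */
    unsigned short all_lanes16 = (unsigned short)-1;
    unsigned char  all_lanes8  = (unsigned char)-1;
    printf("0x%04x 0x%02x\n", all_lanes16, all_lanes8); /* 0xffff 0xff */
    return 0;
}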
@@ -5117,7 +5117,7 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi32(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5139,7 +5139,7 @@ _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi64(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5161,7 +5161,7 @@ _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srli_epi32(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5183,7 +5183,7 @@ _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srli_epi64(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5929,41 +5929,44 @@ _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
 (__v8di)_mm512_setzero_si512());
 }

+/// \enum _MM_TERNLOG_ENUM
+/// A helper to represent the ternary logic operations among vector \a A,
+/// \a B and \a C. The representation is passed to \a imm.
+typedef enum {
+_MM_TERNLOG_A = 0xF0,
+_MM_TERNLOG_B = 0xCC,
+_MM_TERNLOG_C = 0xAA
+} _MM_TERNLOG_ENUM;
+
 #define _mm512_ternarylogic_epi32(A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
-(__v16si)(__m512i)(B), \
-(__v16si)(__m512i)(C), (int)(imm), \
-(__mmask16)-1))
+((__m512i)__builtin_ia32_pternlogd512_mask( \
+(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+(unsigned char)(imm), (__mmask16)-1))

 #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
-(__v16si)(__m512i)(B), \
-(__v16si)(__m512i)(C), (int)(imm), \
-(__mmask16)(U)))
+((__m512i)__builtin_ia32_pternlogd512_mask( \
+(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+(unsigned char)(imm), (__mmask16)(U)))

 #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
-(__v16si)(__m512i)(B), \
-(__v16si)(__m512i)(C), \
-(int)(imm), (__mmask16)(U)))
+((__m512i)__builtin_ia32_pternlogd512_maskz( \
+(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+(unsigned char)(imm), (__mmask16)(U)))

 #define _mm512_ternarylogic_epi64(A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
-(__v8di)(__m512i)(B), \
-(__v8di)(__m512i)(C), (int)(imm), \
-(__mmask8)-1))
+((__m512i)__builtin_ia32_pternlogq512_mask( \
+(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+(unsigned char)(imm), (__mmask8)-1))

 #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
-(__v8di)(__m512i)(B), \
-(__v8di)(__m512i)(C), (int)(imm), \
-(__mmask8)(U)))
+((__m512i)__builtin_ia32_pternlogq512_mask( \
+(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+(unsigned char)(imm), (__mmask8)(U)))

 #define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
-(__v8di)(__m512i)(B), \
-(__v8di)(__m512i)(C), (int)(imm), \
-(__mmask8)(U)))
+((__m512i)__builtin_ia32_pternlogq512_maskz( \
+(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+(unsigned char)(imm), (__mmask8)(U)))

 #ifdef __x86_64__
 #define _mm_cvt_roundsd_i64(A, R) \
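Note: the new _MM_TERNLOG_ENUM constants encode each input's truth-table column (A = 0xF0, B = 0xCC, C = 0xAA), so an imm8 for the ternarylogic intrinsics can be written as a bitwise expression over them. A usage sketch; the macro names below are my own stand-ins with the same values, and the constant folding shown is plain integer arithmetic:

#include <stdio.h>

/* Same values the header now defines in _MM_TERNLOG_ENUM. */
#define TERNLOG_A 0xF0
#define TERNLOG_B 0xCC
#define TERNLOG_C 0xAA

int main(void) {
    /* imm8 for the operation (A & B) | C, usable as
     * _mm512_ternarylogic_epi32(a, b, c, (TERNLOG_A & TERNLOG_B) | TERNLOG_C). */
    unsigned imm = (TERNLOG_A & TERNLOG_B) | TERNLOG_C;
    printf("0x%02x\n", imm); /* 0xea */
    return 0;
}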
@@ -6603,7 +6606,7 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srai_epi32(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -6626,7 +6629,7 @@ _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srai_epi64(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -9316,11 +9319,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
 */

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
-return __builtin_ia32_reduce_add_q512(__W);
+return __builtin_reduce_add((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
-return __builtin_ia32_reduce_mul_q512(__W);
+return __builtin_reduce_mul((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
@@ -9334,18 +9337,18 @@ static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i
 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
 __W = _mm512_maskz_mov_epi64(__M, __W);
-return __builtin_ia32_reduce_add_q512(__W);
+return __builtin_reduce_add((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
 __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
-return __builtin_ia32_reduce_mul_q512(__W);
+return __builtin_reduce_mul((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
-__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
+__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
 return __builtin_reduce_and((__v8di)__W);
 }

@@ -9380,12 +9383,12 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_epi32(__m512i __W) {
-return __builtin_ia32_reduce_add_d512((__v16si)__W);
+return __builtin_reduce_add((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_mul_epi32(__m512i __W) {
-return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+return __builtin_reduce_mul((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
@@ -9401,18 +9404,18 @@ _mm512_reduce_or_epi32(__m512i __W) {
 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
 __W = _mm512_maskz_mov_epi32(__M, __W);
-return __builtin_ia32_reduce_add_d512((__v16si)__W);
+return __builtin_reduce_add((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
 __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
-return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+return __builtin_reduce_mul((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
-__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
+__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
 return __builtin_reduce_and((__v16si)__W);
 }

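Note: the reductions above now go through the generic __builtin_reduce_add / __builtin_reduce_mul builtins on the vector directly instead of the ia32 reduce builtins. As the surrounding code shows, the masked variants first replace inactive lanes with the operation's identity (0 for add, 1 for mul, all-ones for and) and then reduce. A scalar sketch of that mask-then-reduce shape (illustration only, not the header code):

#include <stdio.h>

/* Reduce eight 64-bit lanes with addition, treating lanes whose mask bit is
 * clear as the identity element 0, which is the shape of
 * _mm512_mask_reduce_add_epi64. */
static long long mask_reduce_add(const long long lanes[8], unsigned char mask) {
    long long sum = 0;
    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            sum += lanes[i];
    return sum;
}

int main(void) {
    long long v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("%lld\n", mask_reduce_add(v, 0x0F)); /* 1+2+3+4 = 10 */
    return 0;
}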
@ -9484,7 +9487,7 @@ _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
|
|||||||
|
|
||||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
||||||
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
|
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
|
||||||
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
|
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
|
||||||
return __builtin_reduce_min((__v8du)__V);
|
return __builtin_reduce_min((__v8du)__V);
|
||||||
}
|
}
|
||||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||||
@ -9527,7 +9530,7 @@ _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
|
|||||||
|
|
||||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
||||||
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
|
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
|
||||||
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
|
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
|
||||||
return __builtin_reduce_min((__v16su)__V);
|
return __builtin_reduce_min((__v16su)__V);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -9598,7 +9601,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9606,7 +9609,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_i32logather_pd(vindex, base_addr, scale) \
@@ -9618,7 +9621,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9630,7 +9633,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
@@ -9641,7 +9644,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9649,7 +9652,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_i32logather_epi64(vindex, base_addr, scale) \
@@ -9660,7 +9663,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9672,7 +9675,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
@@ -9683,14 +9686,14 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
@@ -9702,7 +9705,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9711,7 +9714,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
@@ -9722,14 +9725,14 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
@@ -9741,7 +9744,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9750,7 +9753,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
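A minimal usage sketch of the low-index gather macro documented above (illustrative only; assumes an AVX-512F target and a caller-provided `base` array of doubles):

#include <immintrin.h>

/* Gather 8 doubles through the 32-bit indices held in the low 256 bits of
   `vindex`; the scale argument is the byte stride between elements. */
static inline __m512d gather8_doubles(const double *base, __m512i vindex) {
  return _mm512_i32logather_pd(vindex, base, 8);
}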
2 lib/include/avx512vlbf16intrin.h vendored

@@ -417,7 +417,7 @@ static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
   __v4sf __V = {__A, 0, 0, 0};
   __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
       (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-  return __R[0];
+  return (__bfloat16)__R[0];
 }

 /// Convert Packed BF16 Data to Packed float Data.
16 lib/include/avx512vlbwintrin.h vendored

@@ -1942,7 +1942,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_slli_epi16(__A, __B),
+(__v8hi)_mm_slli_epi16(__A, (int)__B),
@@ -1950,7 +1950,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_slli_epi16(__A, __B),
+(__v8hi)_mm_slli_epi16(__A, (int)__B),
@@ -1959,7 +1959,7 @@ _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
-(__v16hi)_mm256_slli_epi16(__A, __B),
+(__v16hi)_mm256_slli_epi16(__A, (int)__B),
@@ -1967,7 +1967,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-(__v16hi)_mm256_slli_epi16(__A, __B),
+(__v16hi)_mm256_slli_epi16(__A, (int)__B),
@@ -2095,7 +2095,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_srai_epi16(__A, __B),
+(__v8hi)_mm_srai_epi16(__A, (int)__B),
@@ -2103,7 +2103,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_srai_epi16(__A, __B),
+(__v8hi)_mm_srai_epi16(__A, (int)__B),
@@ -2112,7 +2112,7 @@ _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
-(__v16hi)_mm256_srai_epi16(__A, __B),
+(__v16hi)_mm256_srai_epi16(__A, (int)__B),
@@ -2120,7 +2120,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-(__v16hi)_mm256_srai_epi16(__A, __B),
+(__v16hi)_mm256_srai_epi16(__A, (int)__B),
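A minimal usage sketch of the masked word-shift intrinsics touched above (illustrative only; assumes a target compiled with AVX-512BW and AVX-512VL support):

#include <immintrin.h>

/* Shift the 16-bit lanes selected by mask `k` left by `count`; unselected
   lanes keep the value from `src`. The public shift-count parameter is
   `unsigned int`; the header now casts it to `int` internally. */
static inline __m128i shift_selected_words(__m128i src, __mmask8 k,
                                           __m128i v, unsigned int count) {
  return _mm_mask_slli_epi16(src, k, v, count);
}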
130 lib/include/avx512vlintrin.h vendored

@@ -4525,7 +4525,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_slli_epi32(__A, __B),
+(__v4si)_mm_slli_epi32(__A, (int)__B),
@@ -4533,7 +4533,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_slli_epi32(__A, __B),
+(__v4si)_mm_slli_epi32(__A, (int)__B),
@@ -4541,7 +4541,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_slli_epi32(__A, __B),
+(__v8si)_mm256_slli_epi32(__A, (int)__B),
@@ -4549,7 +4549,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_slli_epi32(__A, __B),
+(__v8si)_mm256_slli_epi32(__A, (int)__B),
@@ -4589,7 +4589,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_slli_epi64(__A, __B),
+(__v2di)_mm_slli_epi64(__A, (int)__B),
@@ -4597,7 +4597,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_slli_epi64(__A, __B),
+(__v2di)_mm_slli_epi64(__A, (int)__B),
@@ -4605,7 +4605,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_slli_epi64(__A, __B),
+(__v4di)_mm256_slli_epi64(__A, (int)__B),
@@ -4613,7 +4613,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_slli_epi64(__A, __B),
+(__v4di)_mm256_slli_epi64(__A, (int)__B),
@@ -4869,7 +4869,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srli_epi32(__A, __B),
+(__v4si)_mm_srli_epi32(__A, (int)__B),
@@ -4877,7 +4877,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srli_epi32(__A, __B),
+(__v4si)_mm_srli_epi32(__A, (int)__B),
@@ -4885,7 +4885,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srli_epi32(__A, __B),
+(__v8si)_mm256_srli_epi32(__A, (int)__B),
@@ -4893,7 +4893,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srli_epi32(__A, __B),
+(__v8si)_mm256_srli_epi32(__A, (int)__B),
@@ -4933,7 +4933,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_srli_epi64(__A, __B),
+(__v2di)_mm_srli_epi64(__A, (int)__B),
@@ -4941,7 +4941,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_srli_epi64(__A, __B),
+(__v2di)_mm_srli_epi64(__A, (int)__B),
@@ -4949,7 +4949,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_srli_epi64(__A, __B),
+(__v4di)_mm256_srli_epi64(__A, (int)__B),
@@ -4957,7 +4957,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_srli_epi64(__A, __B),
+(__v4di)_mm256_srli_epi64(__A, (int)__B),
@@ -6408,7 +6408,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srai_epi32(__A, __B),
+(__v4si)_mm_srai_epi32(__A, (int)__B),
@@ -6416,7 +6416,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srai_epi32(__A, __B),
+(__v4si)_mm_srai_epi32(__A, (int)__B),
@@ -6424,7 +6424,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srai_epi32(__A, __B),
+(__v8si)_mm256_srai_epi32(__A, (int)__B),
@@ -6432,7 +6432,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srai_epi32(__A, __B),
+(__v8si)_mm256_srai_epi32(__A, (int)__B),
@@ -6483,7 +6483,7 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 _mm_srai_epi64(__m128i __A, unsigned int __imm)
-  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, (int)__imm);
@@ -6505,7 +6505,7 @@ _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
 _mm256_srai_epi64(__m256i __A, unsigned int __imm)
-  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, (int)__imm);
@@ -6526,78 +6526,64 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
 #define _mm_ternarylogic_epi32(A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogd128_mask( \
-(__v4si)(__m128i)(B), \
+(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
-(__v4si)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogd128_mask( \
-(__v4si)(__m128i)(B), \
+(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
-(__v4si)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogd128_maskz( \
-(__v4si)(__m128i)(B), \
+(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
-(__v4si)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_ternarylogic_epi32(A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogd256_mask( \
-(__v8si)(__m256i)(B), \
+(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
-(__v8si)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogd256_mask( \
-(__v8si)(__m256i)(B), \
+(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
-(__v8si)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogd256_maskz( \
-(__v8si)(__m256i)(B), \
+(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
-(__v8si)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm_ternarylogic_epi64(A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogq128_mask( \
-(__v2di)(__m128i)(B), \
+(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
-(__v2di)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogq128_mask( \
-(__v2di)(__m128i)(B), \
+(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
-(__v2di)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogq128_maskz( \
-(__v2di)(__m128i)(B), \
+(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
-(__v2di)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_ternarylogic_epi64(A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogq256_mask( \
-(__v4di)(__m256i)(B), \
+(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
-(__v4di)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogq256_mask( \
-(__v4di)(__m256i)(B), \
+(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
-(__v4di)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogq256_maskz( \
-(__v4di)(__m256i)(B), \
+(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
-(__v4di)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_shuffle_f32x4(A, B, imm) \
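A minimal usage sketch of the reformatted ternary-logic macros (illustrative only; assumes AVX-512F and AVX-512VL support; 0x96 is the 8-bit truth table for a three-way XOR):

#include <immintrin.h>

/* Compute a ^ b ^ c per 32-bit lane; the immediate is a compile-time
   truth table over the three inputs and is now passed as unsigned char. */
static inline __m128i xor3_epi32(__m128i a, __m128i b, __m128i c) {
  return _mm_ternarylogic_epi32(a, b, c, 0x96);
}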
32 lib/include/avx512vlvnniintrin.h vendored

@@ -25,7 +25,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -34,7 +34,7 @@
-/// \endoperation
+/// \endcode
 #define _mm256_dpbusd_epi32(S, A, B) \
@@ -45,7 +45,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -54,7 +54,7 @@
-/// \endoperation
+/// \endcode
 #define _mm256_dpbusds_epi32(S, A, B) \
@@ -65,14 +65,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm256_dpwssd_epi32(S, A, B) \
@@ -83,14 +83,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm256_dpwssds_epi32(S, A, B) \
@@ -101,7 +101,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -110,7 +110,7 @@
-/// \endoperation
+/// \endcode
 #define _mm_dpbusd_epi32(S, A, B) \
@@ -121,7 +121,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -130,7 +130,7 @@
-/// \endoperation
+/// \endcode
 #define _mm_dpbusds_epi32(S, A, B) \
@@ -141,14 +141,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm_dpwssd_epi32(S, A, B) \
@@ -159,14 +159,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm_dpwssds_epi32(S, A, B) \
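A minimal usage sketch of the VNNI dot-product accumulation described by the pseudocode above (illustrative only; assumes AVX-512VNNI with AVX-512VL support):

#include <immintrin.h>

/* Each 32-bit lane of the result is the corresponding lane of `acc` plus the
   dot product of four unsigned bytes of `a` with four signed bytes of `b`. */
static inline __m256i dot_accumulate_u8s8(__m256i acc, __m256i a, __m256i b) {
  return _mm256_dpbusd_epi32(acc, a, b);
}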
95 lib/include/avxintrin.h vendored

@@ -1504,7 +1504,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
-/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
+/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
+/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+/// <c>[b6, b4, b2, b0]</c>.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
@@ -1953,12 +1956,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// int _mm256_extract_epi32(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -1971,12 +1978,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// int _mm256_extract_epi16(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -1990,12 +2001,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// int _mm256_extract_epi8(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -2010,12 +2025,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// long long _mm256_extract_epi64(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -2030,18 +2049,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -2053,18 +2076,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -2075,18 +2102,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -2098,18 +2129,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi64(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -3177,7 +3212,7 @@ _mm256_loadu_si256(__m256i_u const *__p)
-_mm256_lddqu_si256(__m256i const *__p)
+_mm256_lddqu_si256(__m256i_u const *__p)
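A minimal usage sketch of the insert/extract prototypes documented above (illustrative only; the lane index must be a compile-time constant):

#include <immintrin.h>

/* Replace lane 3 of an [8 x i32] vector with `value`, then read it back. */
static inline int replace_and_read_lane3(__m256i v, int value) {
  __m256i t = _mm256_insert_epi32(v, value, 3);
  return _mm256_extract_epi32(t, 3);
}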
32 lib/include/avxvnniintrin.h vendored

@@ -50,7 +50,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -59,7 +59,7 @@
-/// \endoperation
+/// \endcode
 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -73,7 +73,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -82,7 +82,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \endoperation
+/// \endcode
 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -96,14 +96,14 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -117,14 +117,14 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -138,7 +138,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -147,7 +147,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \endoperation
+/// \endcode
 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
@@ -161,7 +161,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -170,7 +170,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \endoperation
+/// \endcode
 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
@@ -184,14 +184,14 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
@@ -205,14 +205,14 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///
|
///
|
||||||
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
||||||
///
|
///
|
||||||
/// \operation
|
/// \code{.operation}
|
||||||
/// FOR j := 0 to 3
|
/// FOR j := 0 to 3
|
||||||
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
||||||
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
||||||
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
|
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
|
||||||
/// ENDFOR
|
/// ENDFOR
|
||||||
/// DST[MAX:128] := 0
|
/// DST[MAX:128] := 0
|
||||||
/// \endoperation
|
/// \endcode
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||||
{
|
{
|
||||||
|
|||||||
8  lib/include/bmiintrin.h  vendored

@@ -47,6 +47,7 @@ __tzcnt_u16(unsigned short __X)
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
+/// \see _mm_tzcnt_32
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
@@ -63,10 +64,11 @@ __tzcnt_u32(unsigned int __X)
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An 32-bit integer containing the number of trailing zero bits in
/// the operand.
+/// \see __tzcnt_u32
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
-return __builtin_ia32_tzcnt_u32(__X);
+return (int)__builtin_ia32_tzcnt_u32(__X);
}

#define _tzcnt_u32(a) (__tzcnt_u32((a)))
@@ -83,6 +85,7 @@ _mm_tzcnt_32(unsigned int __X)
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
+/// \see _mm_tzcnt_64
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
@@ -99,10 +102,11 @@ __tzcnt_u64(unsigned long long __X)
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An 64-bit integer containing the number of trailing zero bits in
/// the operand.
+/// \see __tzcnt_u64
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
-return __builtin_ia32_tzcnt_u64(__X);
+return (long long)__builtin_ia32_tzcnt_u64(__X);
}

#define _tzcnt_u64(a) (__tzcnt_u64((a)))
18  lib/include/cetintrin.h  vendored

@@ -19,7 +19,7 @@
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))

static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
-__builtin_ia32_incsspd(__a);
+__builtin_ia32_incsspd((unsigned int)__a);
}

#ifdef __x86_64__
@@ -34,7 +34,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
-__builtin_ia32_incsspd((int)__a);
+__builtin_ia32_incsspd(__a);
}
#endif /* __x86_64__ */

@@ -42,9 +42,12 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}

-static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wuninitialized"
unsigned int t;
return __builtin_ia32_rdsspd(t);
+#pragma clang diagnostic pop
}

#ifdef __x86_64__
@@ -52,9 +55,12 @@ static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long lo
return __builtin_ia32_rdsspq(__a);
}

-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wuninitialized"
unsigned long long t;
return __builtin_ia32_rdsspq(t);
+#pragma clang diagnostic pop
}
#endif /* __x86_64__ */

@@ -68,7 +74,7 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
}
#endif /* __x86_64__ */

-static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
+static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
__builtin_ia32_saveprevssp();
}

@@ -96,7 +102,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void *
}
#endif /* __x86_64__ */

-static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
+static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
__builtin_ia32_setssbsy();
}

1286  lib/include/emmintrin.h  vendored
File diff suppressed because it is too large

6  lib/include/f16cintrin.h  vendored

@@ -65,9 +65,9 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm) \
-((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-(imm)))[0]))
+#define _cvtss_sh(a, imm) __extension__ ({ \
+(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+(imm)))[0]); })

/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
15  lib/include/hlsl.h  vendored  Normal file

@@ -0,0 +1,15 @@
+//===----- hlsl.h - HLSL definitions --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HLSL_H_
+#define _HLSL_H_
+
+#include "hlsl/hlsl_basic_types.h"
+#include "hlsl/hlsl_intrinsics.h"
+
+#endif //_HLSL_H_

64  lib/include/hlsl_basic_types.h  vendored  Normal file

@@ -0,0 +1,64 @@
+//===----- hlsl_basic_types.h - HLSL definitions for basic types ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HLSL_HLSL_BASIC_TYPES_H_
+#define _HLSL_HLSL_BASIC_TYPES_H_
+
+// built-in scalar data types:
+
+#ifdef __HLSL_ENABLE_16_BIT
+// 16-bit integer.
+typedef unsigned short uint16_t;
+typedef short int16_t;
+#endif
+
+// unsigned 32-bit integer.
+typedef unsigned int uint;
+
+// 64-bit integer.
+typedef unsigned long uint64_t;
+typedef long int64_t;
+
+// built-in vector data types:
+
+#ifdef __HLSL_ENABLE_16_BIT
+typedef vector<int16_t, 2> int16_t2;
+typedef vector<int16_t, 3> int16_t3;
+typedef vector<int16_t, 4> int16_t4;
+typedef vector<uint16_t, 2> uint16_t2;
+typedef vector<uint16_t, 3> uint16_t3;
+typedef vector<uint16_t, 4> uint16_t4;
+#endif
+
+typedef vector<int, 2> int2;
+typedef vector<int, 3> int3;
+typedef vector<int, 4> int4;
+typedef vector<uint, 2> uint2;
+typedef vector<uint, 3> uint3;
+typedef vector<uint, 4> uint4;
+typedef vector<int64_t, 2> int64_t2;
+typedef vector<int64_t, 3> int64_t3;
+typedef vector<int64_t, 4> int64_t4;
+typedef vector<uint64_t, 2> uint64_t2;
+typedef vector<uint64_t, 3> uint64_t3;
+typedef vector<uint64_t, 4> uint64_t4;
+
+#ifdef __HLSL_ENABLE_16_BIT
+typedef vector<half, 2> half2;
+typedef vector<half, 3> half3;
+typedef vector<half, 4> half4;
+#endif
+
+typedef vector<float, 2> float2;
+typedef vector<float, 3> float3;
+typedef vector<float, 4> float4;
+typedef vector<double, 2> double2;
+typedef vector<double, 3> double3;
+typedef vector<double, 4> double4;
+
+#endif //_HLSL_HLSL_BASIC_TYPES_H_

15  lib/include/hlsl_intrinsics.h  vendored  Normal file

@@ -0,0 +1,15 @@
+//===----- hlsl_intrinsics.h - HLSL definitions for intrinsics ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HLSL_HLSL_INTRINSICS_H_
+#define _HLSL_HLSL_INTRINSICS_H_
+
+__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
+WaveActiveCountBits(bool bBit);
+
+#endif //_HLSL_HLSL_INTRINSICS_H_
4  lib/include/hresetintrin.h  vendored

@@ -25,7 +25,7 @@
///
/// This intrinsic corresponds to the <c> HRESET </c> instruction.
///
-/// \operation
+/// \code{.operation}
/// IF __eax == 0
/// // nop
/// ELSE
@@ -35,7 +35,7 @@
/// FI
/// ENDFOR
/// FI
-/// \endoperation
+/// \endcode
static __inline void __DEFAULT_FN_ATTRS
_hreset(int __eax)
{
22  lib/include/ia32intrin.h  vendored

@@ -40,7 +40,7 @@
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfd(int __A) {
-return __builtin_ctz(__A);
+return __builtin_ctz((unsigned int)__A);
}

/** Find the first set bit starting from the msb. Result is undefined if
@@ -57,7 +57,7 @@ __bsfd(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrd(int __A) {
-return 31 - __builtin_clz(__A);
+return 31 - __builtin_clz((unsigned int)__A);
}

/** Swaps the bytes in the input. Converting little endian to big endian or
@@ -73,12 +73,12 @@ __bsrd(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapd(int __A) {
-return __builtin_bswap32(__A);
+return (int)__builtin_bswap32((unsigned int)__A);
}

static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_bswap(int __A) {
-return __builtin_bswap32(__A);
+return (int)__builtin_bswap32((unsigned int)__A);
}

#define _bit_scan_forward(A) __bsfd((A))
@@ -99,7 +99,7 @@ _bswap(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfq(long long __A) {
-return __builtin_ctzll(__A);
+return (long long)__builtin_ctzll((unsigned long long)__A);
}

/** Find the first set bit starting from the msb. Result is undefined if
@@ -116,7 +116,7 @@ __bsfq(long long __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrq(long long __A) {
-return 63 - __builtin_clzll(__A);
+return 63 - __builtin_clzll((unsigned long long)__A);
}

/** Swaps the bytes in the input. Converting little endian to big endian or
@@ -132,7 +132,7 @@ __bsrq(long long __A) {
*/
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapq(long long __A) {
-return __builtin_bswap64(__A);
+return (long long)__builtin_bswap64((unsigned long long)__A);
}

#define _bswap64(A) __bswapq((A))
@@ -395,23 +395,23 @@ __rorw(unsigned short __X, int __C) {

static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rold(unsigned int __X, int __C) {
-return __builtin_rotateleft32(__X, __C);
+return __builtin_rotateleft32(__X, (unsigned int)__C);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rord(unsigned int __X, int __C) {
-return __builtin_rotateright32(__X, __C);
+return __builtin_rotateright32(__X, (unsigned int)__C);
}

#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rolq(unsigned long long __X, int __C) {
-return __builtin_rotateleft64(__X, __C);
+return __builtin_rotateleft64(__X, (unsigned long long)__C);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rorq(unsigned long long __X, int __C) {
-return __builtin_rotateright64(__X, __C);
+return __builtin_rotateright64(__X, (unsigned long long)__C);
}
#endif /* __x86_64__ */

30  lib/include/immintrin.h  vendored

@@ -276,20 +276,20 @@ _rdpid_u32(void) {
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
-return __builtin_ia32_rdrand16_step(__p);
+return (int)__builtin_ia32_rdrand16_step(__p);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
-return __builtin_ia32_rdrand32_step(__p);
+return (int)__builtin_ia32_rdrand32_step(__p);
}

#ifdef __x86_64__
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
-return __builtin_ia32_rdrand64_step(__p);
+return (int)__builtin_ia32_rdrand64_step(__p);
}
#endif
#endif /* __RDRND__ */
@@ -360,50 +360,50 @@ _writegsbase_u64(unsigned long long __V)
static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P) {
struct __loadu_i16 {
-short __v;
+unsigned short __v;
} __attribute__((__packed__, __may_alias__));
-return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
+return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D) {
struct __storeu_i16 {
-short __v;
+unsigned short __v;
} __attribute__((__packed__, __may_alias__));
-((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D);
+((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P) {
struct __loadu_i32 {
-int __v;
+unsigned int __v;
} __attribute__((__packed__, __may_alias__));
-return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
+return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D) {
struct __storeu_i32 {
-int __v;
+unsigned int __v;
} __attribute__((__packed__, __may_alias__));
-((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D);
+((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
}

#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P) {
struct __loadu_i64 {
-long long __v;
+unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
-return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
+return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D) {
struct __storeu_i64 {
-long long __v;
+unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
-((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D);
+((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
}
#endif
#endif /* __MOVBE */
33  lib/include/intrin.h  vendored

@@ -534,27 +534,6 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|* Misc
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
-#if defined(__i386__)
-#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
-__asm("cpuid" \
-: "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx) \
-: "0"(__leaf), "2"(__count))
-#else
-/* x86-64 uses %rbx as the base register, so preserve it. */
-#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
-__asm("xchg{q} {%%rbx, %q1|%q1, rbx}\n" \
-"cpuid\n" \
-"xchg{q} {%%rbx, %q1|%q1, rbx}" \
-: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
-: "0"(__leaf), "2"(__count))
-#endif
-static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
-__cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
-}
-static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
-int __ecx) {
-__cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
-}
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");
}
@@ -581,6 +560,18 @@ unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val);

__int64 __mulh(__int64 __a, __int64 __b);
unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b);
+
+void __break(int);
+
+void __writex18byte(unsigned long offset, unsigned char data);
+void __writex18word(unsigned long offset, unsigned short data);
+void __writex18dword(unsigned long offset, unsigned long data);
+void __writex18qword(unsigned long offset, unsigned __int64 data);
+
+unsigned char __readx18byte(unsigned long offset);
+unsigned short __readx18word(unsigned long offset);
+unsigned long __readx18dword(unsigned long offset);
+unsigned __int64 __readx18qword(unsigned long offset);
#endif

/*----------------------------------------------------------------------------*\
44  lib/include/keylockerintrin.h  vendored

@@ -46,7 +46,7 @@
///
/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode)
/// GP (0)
/// FI
@@ -91,7 +91,7 @@
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
__m128i __enkey_lo, __m128i __enkey_hi) {
@@ -106,7 +106,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
///
/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// InputKey[127:0] := __key[127:0]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
@@ -126,7 +126,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
@@ -141,7 +141,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
///
/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// InputKey[127:0] := __key_lo[127:0]
/// InputKey[255:128] := __key_hi[255:128]
/// KeyMetadata[2:0] := __htype[2:0]
@@ -163,7 +163,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
void *__h) {
@@ -179,7 +179,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
///
/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -202,7 +202,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -216,7 +216,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -241,7 +241,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -255,7 +255,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -280,7 +280,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -294,7 +294,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -319,7 +319,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -346,7 +346,7 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -377,7 +377,7 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
@@ -392,7 +392,7 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -423,7 +423,7 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
@@ -438,7 +438,7 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -469,7 +469,7 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
@@ -484,7 +484,7 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -515,7 +515,7 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
6  lib/include/mm_malloc.h  vendored

@@ -28,9 +28,9 @@ extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size

#if !(defined(_WIN32) && defined(_mm_malloc))
static __inline__ void *__attribute__((__always_inline__, __nodebug__,
-__malloc__))
-_mm_malloc(size_t __size, size_t __align)
-{
+__malloc__, __alloc_size__(1),
+__alloc_align__(2)))
+_mm_malloc(size_t __size, size_t __align) {
if (__align == 1) {
return malloc(__size);
}
10  lib/include/opencl-c-base.h  vendored

@@ -21,6 +21,7 @@
#define cl_khr_subgroup_shuffle 1
#define cl_khr_subgroup_shuffle_relative 1
#define cl_khr_subgroup_clustered_reduce 1
+#define cl_khr_subgroup_rotate 1
#define cl_khr_extended_bit_ops 1
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
@@ -67,6 +68,7 @@
#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
// For the SPIR and SPIR-V target all features are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
+#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
@@ -80,6 +82,11 @@
#define __opencl_c_named_address_space_builtins 1
#endif // !defined(__opencl_c_generic_address_space)

+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
+// Internal feature macro to provide subgroup builtins.
+#define __opencl_subgroup_builtins 1
+#endif
+
// built-in scalar data types:

/**
@@ -197,6 +204,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif

+// An internal alias for half, for use by OpenCLBuiltins.td.
+#define __half half
+
#if defined(__OPENCL_CPP_VERSION__)
#define NULL nullptr
#elif defined(__OPENCL_C_VERSION__)

13494  lib/include/opencl-c.h  vendored
File diff suppressed because it is too large
2  lib/include/pmmintrin.h  vendored

@@ -35,7 +35,7 @@
/// A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_lddqu_si128(__m128i const *__p)
+_mm_lddqu_si128(__m128i_u const *__p)
{
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}
134  lib/include/ppc_wrappers/bmi2intrin.h  vendored  Normal file

@@ -0,0 +1,134 @@
+/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------===
+*
+* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*
+*===-----------------------------------------------------------------------===
+*/
+
+#if !defined X86GPRINTRIN_H_
+#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef BMI2INTRIN_H_
+#define BMI2INTRIN_H_
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32(unsigned int __X, unsigned int __Y) {
+return ((__X << (32 - __Y)) >> (32 - __Y));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
+unsigned long long __res = (unsigned long long)__X * __Y;
+*__P = (unsigned int)(__res >> 32);
+return (unsigned int)__res;
+}
+
+#ifdef __PPC64__
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
+return ((__X << (64 - __Y)) >> (64 - __Y));
+}
+
+/* __int128 requires base 64-bit. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64(unsigned long long __X, unsigned long long __Y,
+unsigned long long *__P) {
+unsigned __int128 __res = (unsigned __int128)__X * __Y;
+*__P = (unsigned long long)(__res >> 64);
+return (unsigned long long)__res;
+}
+
+#ifdef _ARCH_PWR7
+/* popcount and bpermd require power7 minimum. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64(unsigned long long __X, unsigned long long __M) {
+unsigned long __result = 0x0UL;
+const unsigned long __mask = 0x8000000000000000UL;
+unsigned long __m = __M;
+unsigned long __c, __t;
+unsigned long __p;
+
+/* The pop-count of the mask gives the number of the bits from
+source to process. This is also needed to shift bits from the
+source into the correct position for the result. */
+__p = 64 - __builtin_popcountl(__M);
+
+/* The loop is for the number of '1' bits in the mask and clearing
+each mask bit as it is processed. */
+while (__m != 0) {
+__c = __builtin_clzl(__m);
+__t = __X << (__p - __c);
+__m ^= (__mask >> __c);
+__result |= (__t & (__mask >> __c));
+__p++;
+}
+return __result;
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64(unsigned long long __X, unsigned long long __M) {
+unsigned long __p = 0x4040404040404040UL; // initial bit permute control
+const unsigned long __mask = 0x8000000000000000UL;
+unsigned long __m = __M;
+unsigned long __c;
+unsigned long __result;
+
+/* if the mask is constant and selects 8 bits or less we can use
+the Power8 Bit permute instruction. */
+if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
+/* Also if the pext mask is constant, then the popcount is
+constant, we can evaluate the following loop at compile
+time and use a constant bit permute vector. */
+long __i;
+for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
+__c = __builtin_clzl(__m);
+__p = (__p << 8) | __c;
+__m ^= (__mask >> __c);
+}
+__result = __builtin_bpermd(__p, __X);
+} else {
+__p = 64 - __builtin_popcountl(__M);
+__result = 0;
+/* We could a use a for loop here, but that combined with
+-funroll-loops can expand to a lot of code. The while
+loop avoids unrolling and the compiler commons the xor
+from clearing the mask bit with the (m != 0) test. The
+result is a more compact loop setup and body. */
+while (__m != 0) {
+unsigned long __t;
+__c = __builtin_clzl(__m);
+__t = (__X & (__mask >> __c)) >> (__p - __c);
+__m ^= (__mask >> __c);
+__result |= (__t);
+__p++;
+}
+}
+return __result;
+}
+
+/* these 32-bit implementations depend on 64-bit pdep/pext
+which depend on _ARCH_PWR7. */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32(unsigned int __X, unsigned int __Y) {
+return _pdep_u64(__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32(unsigned int __X, unsigned int __Y) {
+return _pext_u64(__X, __Y);
+}
+#endif /* _ARCH_PWR7 */
+#endif /* __PPC64__ */
+
+#endif /* BMI2INTRIN_H_ */
165
lib/include/ppc_wrappers/bmiintrin.h
vendored
Normal file
165
lib/include/ppc_wrappers/bmiintrin.h
vendored
Normal file
@ -0,0 +1,165 @@
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined X86GPRINTRIN_H_
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef BMIINTRIN_H_
#define BMIINTRIN_H_

extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u16(unsigned short __X) {
  return __builtin_ctz(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u32(unsigned int __X, unsigned int __Y) {
  return (~__X & __Y);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
  return ((__X << (32 - (__L + __P))) >> (32 - __L));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u32(unsigned int __X, unsigned int __Y) {
  unsigned int __P, __L;
  __P = __Y & 0xFF;
  __L = (__Y >> 8) & 0xFF;
  return (_bextr_u32(__X, __P, __L));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u32(unsigned int __X) {
  return (__X & -__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u32(unsigned int __X) {
  return __blsi_u32(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u32(unsigned int __X) {
  return (__X ^ (__X - 1));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u32(unsigned int __X) {
  return __blsmsk_u32(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u32(unsigned int __X) {
  return (__X & (__X - 1));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u32(unsigned int __X) {
  return __blsr_u32(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u32(unsigned int __X) {
  return __builtin_ctz(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u32(unsigned int __X) {
  return __builtin_ctz(__X);
}

/* use the 64-bit shift, rotate, and count leading zeros instructions
   for long long. */
#ifdef __PPC64__
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u64(unsigned long long __X, unsigned long long __Y) {
  return (~__X & __Y);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
  return ((__X << (64 - (__L + __P))) >> (64 - __L));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u64(unsigned long long __X, unsigned long long __Y) {
  unsigned int __P, __L;
  __P = __Y & 0xFF;
  __L = (__Y & 0xFF00) >> 8;
  return (_bextr_u64(__X, __P, __L));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u64(unsigned long long __X) {
  return __X & -__X;
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u64(unsigned long long __X) {
  return __blsi_u64(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u64(unsigned long long __X) {
  return (__X ^ (__X - 1));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u64(unsigned long long __X) {
  return __blsmsk_u64(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u64(unsigned long long __X) {
  return (__X & (__X - 1));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u64(unsigned long long __X) {
  return __blsr_u64(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u64(unsigned long long __X) {
  return __builtin_ctzll(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u64(unsigned long long __X) {
  return __builtin_ctzll(__X);
}
#endif /* __PPC64__ */

#endif /* BMIINTRIN_H_ */
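As a usage note (an illustrative sketch, not from the header; test values are assumptions): _bextr_u32(__X, __P, __L) extracts the __L-bit field of __X starting at bit __P, which matches the plain shift-and-mask form when 0 < __P + __L <= 32.

/* Illustrative check of the bit-field extraction formula above:
   both expressions extract 5 bits starting at bit 8 and yield 0x16 here. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint32_t x = 0x12345678u;
  uint32_t p = 8, l = 5;
  uint32_t via_shift = ((x << (32 - (l + p))) >> (32 - l)); /* header formula */
  uint32_t via_mask = (x >> p) & ((1u << l) - 1);           /* equivalent form */
  assert(via_shift == via_mask);
  return 0;
}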
2841
lib/include/ppc_wrappers/emmintrin.h
vendored
2841
lib/include/ppc_wrappers/emmintrin.h
vendored
File diff suppressed because it is too large
27
lib/include/ppc_wrappers/immintrin.h
vendored
Normal file
27
lib/include/ppc_wrappers/immintrin.h
vendored
Normal file
@ -0,0 +1,27 @@
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef IMMINTRIN_H_
#define IMMINTRIN_H_

#include <x86gprintrin.h>

#include <mmintrin.h>

#include <xmmintrin.h>

#include <emmintrin.h>

#include <pmmintrin.h>

#include <tmmintrin.h>

#include <smmintrin.h>

#endif /* IMMINTRIN_H_ */
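A hedged usage sketch (file name and compile invocation are assumptions, not from this commit): on a powerpc64le target these wrapper headers are reached through the usual x86 umbrella include, with the compatibility warning silenced by defining NO_WARN_X86_INTRINSICS, for example clang --target=powerpc64le-linux-gnu -DNO_WARN_X86_INTRINSICS -O2 demo.c.

/* demo.c -- illustrative only; exercises a few wrapped SSE intrinsics. */
#include <immintrin.h>

int main(void) {
  __m128i a = _mm_set1_epi32(7);
  __m128i b = _mm_add_epi32(a, a);
  return _mm_extract_epi32(b, 0) == 14 ? 0 : 1;
}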
29
lib/include/ppc_wrappers/mm_malloc.h
vendored
29
lib/include/ppc_wrappers/mm_malloc.h
vendored
@ -10,38 +10,33 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#if defined(__ppc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <stdlib.h>

/* We can't depend on <stdlib.h> since the prototype of posix_memalign
   may not be visible. */
#ifndef __cplusplus
extern int posix_memalign (void **, size_t, size_t);
extern int posix_memalign(void **, size_t, size_t);
#else
extern "C" int posix_memalign (void **, size_t, size_t);
extern "C" int posix_memalign(void **, size_t, size_t);
#endif

static __inline void *
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
_mm_malloc (size_t size, size_t alignment)
{
  /* PowerPC64 ELF V2 ABI requires quadword alignment. */
  size_t vec_align = sizeof (__vector float);
  size_t __vec_align = sizeof(__vector float);
  void *ptr;
  void *__ptr;

  if (alignment < vec_align)
  if (__alignment < __vec_align)
    alignment = vec_align;
    __alignment = __vec_align;
  if (posix_memalign (&ptr, alignment, size) == 0)
  if (posix_memalign(&__ptr, __alignment, __size) == 0)
    return ptr;
    return __ptr;
  else
    return NULL;
}

static __inline void
static __inline void _mm_free(void *__ptr) { free(__ptr); }
_mm_free (void * ptr)
{
  free (ptr);
}

#else
#include_next <mm_malloc.h>
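A short usage sketch of the allocator pair above (buffer size and alignment are illustrative): the requested alignment is rounded up to at least quadword alignment, and memory obtained from _mm_malloc must be released with _mm_free.

/* Illustrative use of _mm_malloc/_mm_free: 64 floats, 16-byte aligned. */
#include <mm_malloc.h>
#include <stdint.h>

int main(void) {
  float *buf = (float *)_mm_malloc(64 * sizeof(float), 16);
  if (!buf)
    return 1;
  int misaligned = ((uintptr_t)buf % 16) != 0; /* expect 0 */
  _mm_free(buf);
  return misaligned;
}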
774
lib/include/ppc_wrappers/mmintrin.h
vendored
774
lib/include/ppc_wrappers/mmintrin.h
vendored
File diff suppressed because it is too large
162
lib/include/ppc_wrappers/pmmintrin.h
vendored
162
lib/include/ppc_wrappers/pmmintrin.h
vendored
@ -32,120 +32,114 @@
|
|||||||
In the specific case of the monitor and mwait instructions there are
|
In the specific case of the monitor and mwait instructions there are
|
||||||
no direct equivalent in the PowerISA at this time. So those
|
no direct equivalent in the PowerISA at this time. So those
|
||||||
intrinsics are not implemented. */
|
intrinsics are not implemented. */
|
||||||
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
|
#error \
|
||||||
|
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef PMMINTRIN_H_
|
#ifndef PMMINTRIN_H_
|
||||||
#define PMMINTRIN_H_
|
#define PMMINTRIN_H_
|
||||||
|
|
||||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
#if defined(__ppc64__) && \
|
||||||
|
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
|
||||||
|
|
||||||
/* We need definitions from the SSE2 and SSE header files*/
|
/* We need definitions from the SSE2 and SSE header files*/
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128
|
||||||
_mm_addsub_ps (__m128 __X, __m128 __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_addsub_ps(__m128 __X, __m128 __Y) {
|
||||||
const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
|
const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
|
||||||
__v4sf even_neg_Y = vec_xor(__Y, even_n0);
|
__v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
|
||||||
return (__m128) vec_add (__X, even_neg_Y);
|
return (__m128)vec_add(__X, __even_neg_Y);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_addsub_pd (__m128d __X, __m128d __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_addsub_pd(__m128d __X, __m128d __Y) {
|
||||||
const __v2df even_n0 = {-0.0, 0.0};
|
const __v2df __even_n0 = {-0.0, 0.0};
|
||||||
__v2df even_neg_Y = vec_xor(__Y, even_n0);
|
__v2df __even_neg_Y = vec_xor(__Y, __even_n0);
|
||||||
return (__m128d) vec_add (__X, even_neg_Y);
|
return (__m128d)vec_add(__X, __even_neg_Y);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128
|
||||||
_mm_hadd_ps (__m128 __X, __m128 __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hadd_ps(__m128 __X, __m128 __Y) {
|
||||||
__vector unsigned char xform2 = {
|
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
|
||||||
0x00, 0x01, 0x02, 0x03,
|
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
|
||||||
0x08, 0x09, 0x0A, 0x0B,
|
0x18, 0x19, 0x1A, 0x1B};
|
||||||
0x10, 0x11, 0x12, 0x13,
|
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
|
||||||
0x18, 0x19, 0x1A, 0x1B
|
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
|
||||||
};
|
0x1C, 0x1D, 0x1E, 0x1F};
|
||||||
__vector unsigned char xform1 = {
|
return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
|
||||||
0x04, 0x05, 0x06, 0x07,
|
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
|
||||||
0x0C, 0x0D, 0x0E, 0x0F,
|
|
||||||
0x14, 0x15, 0x16, 0x17,
|
|
||||||
0x1C, 0x1D, 0x1E, 0x1F
|
|
||||||
};
|
|
||||||
return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
|
|
||||||
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128
|
||||||
_mm_hsub_ps (__m128 __X, __m128 __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hsub_ps(__m128 __X, __m128 __Y) {
|
||||||
__vector unsigned char xform2 = {
|
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
|
||||||
0x00, 0x01, 0x02, 0x03,
|
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
|
||||||
0x08, 0x09, 0x0A, 0x0B,
|
0x18, 0x19, 0x1A, 0x1B};
|
||||||
0x10, 0x11, 0x12, 0x13,
|
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
|
||||||
0x18, 0x19, 0x1A, 0x1B
|
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
|
||||||
};
|
0x1C, 0x1D, 0x1E, 0x1F};
|
||||||
__vector unsigned char xform1 = {
|
return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
|
||||||
0x04, 0x05, 0x06, 0x07,
|
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
|
||||||
0x0C, 0x0D, 0x0E, 0x0F,
|
|
||||||
0x14, 0x15, 0x16, 0x17,
|
|
||||||
0x1C, 0x1D, 0x1E, 0x1F
|
|
||||||
};
|
|
||||||
return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
|
|
||||||
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_hadd_pd (__m128d __X, __m128d __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hadd_pd(__m128d __X, __m128d __Y) {
|
||||||
return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
|
return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
|
||||||
vec_mergel ((__v2df) __X, (__v2df)__Y));
|
vec_mergel((__v2df)__X, (__v2df)__Y));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_hsub_pd (__m128d __X, __m128d __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hsub_pd(__m128d __X, __m128d __Y) {
|
||||||
return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
|
return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
|
||||||
vec_mergel ((__v2df) __X, (__v2df)__Y));
|
vec_mergel((__v2df)__X, (__v2df)__Y));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
#ifdef _ARCH_PWR8
|
||||||
_mm_movehdup_ps (__m128 __X)
|
extern __inline __m128
|
||||||
{
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
|
_mm_movehdup_ps(__m128 __X) {
|
||||||
|
return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_moveldup_ps(__m128 __X) {
|
||||||
|
return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_loaddup_pd(double const *__P) {
|
||||||
|
return (__m128d)vec_splats(*__P);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_moveldup_ps (__m128 __X)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_movedup_pd(__m128d __X) {
|
||||||
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
|
return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128i
|
||||||
_mm_loaddup_pd (double const *__P)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_lddqu_si128(__m128i const *__P) {
|
||||||
return (__m128d) vec_splats (*__P);
|
return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
|
||||||
}
|
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
||||||
_mm_movedup_pd (__m128d __X)
|
|
||||||
{
|
|
||||||
return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
|
|
||||||
}
|
|
||||||
|
|
||||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
||||||
_mm_lddqu_si128 (__m128i const *__P)
|
|
||||||
{
|
|
||||||
return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
|
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#include_next <pmmintrin.h>
|
#include_next <pmmintrin.h>
|
||||||
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
#endif /* defined(__ppc64__) &&
|
||||||
*/
|
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
|
||||||
|
|
||||||
#endif /* PMMINTRIN_H_ */
|
#endif /* PMMINTRIN_H_ */
|
||||||
|
|||||||
587
lib/include/ppc_wrappers/smmintrin.h
vendored
587
lib/include/ppc_wrappers/smmintrin.h
vendored
@ -29,11 +29,254 @@
|
|||||||
#ifndef SMMINTRIN_H_
|
#ifndef SMMINTRIN_H_
|
||||||
#define SMMINTRIN_H_
|
#define SMMINTRIN_H_
|
||||||
|
|
||||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
#if defined(__ppc64__) && \
|
||||||
|
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
|
||||||
|
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
#include <tmmintrin.h>
|
#include <tmmintrin.h>
|
||||||
|
|
||||||
|
/* Rounding mode macros. */
|
||||||
|
#define _MM_FROUND_TO_NEAREST_INT 0x00
|
||||||
|
#define _MM_FROUND_TO_ZERO 0x01
|
||||||
|
#define _MM_FROUND_TO_POS_INF 0x02
|
||||||
|
#define _MM_FROUND_TO_NEG_INF 0x03
|
||||||
|
#define _MM_FROUND_CUR_DIRECTION 0x04
|
||||||
|
|
||||||
|
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
|
||||||
|
|
||||||
|
#define _MM_FROUND_RAISE_EXC 0x00
|
||||||
|
#define _MM_FROUND_NO_EXC 0x08
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_pd(__m128d __A, int __rounding) {
|
||||||
|
__v2df __r;
|
||||||
|
union {
|
||||||
|
double __fr;
|
||||||
|
long long __fpscr;
|
||||||
|
} __enables_save, __fpscr_save;
|
||||||
|
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Save enabled exceptions, disable all exceptions,
|
||||||
|
and preserve the rounding mode. */
|
||||||
|
#ifdef _ARCH_PWR9
|
||||||
|
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
#else
|
||||||
|
__fpscr_save.__fr = __builtin_mffs();
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
__fpscr_save.__fpscr &= ~0xf8;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
#endif
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (__rounding) {
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT:
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__attribute__((fallthrough));
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
||||||
|
__builtin_set_fpscr_rn(0b00);
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
|
||||||
|
__r = vec_rint((__v2df)__A);
|
||||||
|
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_NEG_INF:
|
||||||
|
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_floor((__v2df)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_POS_INF:
|
||||||
|
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_ceil((__v2df)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_ZERO:
|
||||||
|
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_trunc((__v2df)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_CUR_DIRECTION:
|
||||||
|
__r = vec_rint((__v2df)__A);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
/* Restore enabled exceptions. */
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
}
|
||||||
|
return (__m128d)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
|
||||||
|
__B = _mm_round_pd(__B, __rounding);
|
||||||
|
__v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
|
||||||
|
return (__m128d)__r;
|
||||||
|
}
|
||||||
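For orientation, a small sketch (inputs assumed, compiled like the earlier demo) of how the rounding-mode macros above compose: _MM_FROUND_TO_ZERO truncates toward zero, and _MM_FROUND_NO_EXC asks the implementation above to save and restore the FPSCR so no new exceptions are enabled while rounding.

/* Illustrative use of _mm_round_pd with the macros defined above. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(-1.5, 2.7);
  __m128d t = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  double out[2];
  _mm_storeu_pd(out, t);
  printf("%g %g\n", out[0], out[1]); /* prints: 2 -1 */
  return 0;
}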
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_ps(__m128 __A, int __rounding) {
|
||||||
|
__v4sf __r;
|
||||||
|
union {
|
||||||
|
double __fr;
|
||||||
|
long long __fpscr;
|
||||||
|
} __enables_save, __fpscr_save;
|
||||||
|
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Save enabled exceptions, disable all exceptions,
|
||||||
|
and preserve the rounding mode. */
|
||||||
|
#ifdef _ARCH_PWR9
|
||||||
|
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
#else
|
||||||
|
__fpscr_save.__fr = __builtin_mffs();
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
__fpscr_save.__fpscr &= ~0xf8;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
#endif
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (__rounding) {
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT:
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__attribute__((fallthrough));
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
||||||
|
__builtin_set_fpscr_rn(0b00);
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
|
||||||
|
__r = vec_rint((__v4sf)__A);
|
||||||
|
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_NEG_INF:
|
||||||
|
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_floor((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_POS_INF:
|
||||||
|
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_ceil((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_ZERO:
|
||||||
|
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_trunc((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_CUR_DIRECTION:
|
||||||
|
__r = vec_rint((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
/* Restore enabled exceptions. */
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
}
|
||||||
|
return (__m128)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
|
||||||
|
__B = _mm_round_ps(__B, __rounding);
|
||||||
|
__v4sf __r = (__v4sf)__A;
|
||||||
|
__r[0] = ((__v4sf)__B)[0];
|
||||||
|
return (__m128)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
|
||||||
|
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
|
||||||
|
|
||||||
|
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
|
||||||
|
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)
|
||||||
|
|
||||||
|
#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
|
||||||
|
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
|
||||||
|
|
||||||
|
#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
|
||||||
|
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
||||||
|
__v16qi __result = (__v16qi)__A;
|
||||||
|
|
||||||
|
__result[__N & 0xf] = __D;
|
||||||
|
|
||||||
|
return (__m128i)__result;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
||||||
|
__v4si __result = (__v4si)__A;
|
||||||
|
|
||||||
|
__result[__N & 3] = __D;
|
||||||
|
|
||||||
|
return (__m128i)__result;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
||||||
|
__v2di __result = (__v2di)__A;
|
||||||
|
|
||||||
|
__result[__N & 1] = __D;
|
||||||
|
|
||||||
|
return (__m128i)__result;
|
||||||
|
}
|
||||||
|
|
||||||
extern __inline int
|
extern __inline int
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_extract_epi8(__m128i __X, const int __N) {
|
_mm_extract_epi8(__m128i __X, const int __N) {
|
||||||
@ -58,6 +301,7 @@ extern __inline int
|
|||||||
return ((__v4si)__X)[__N & 3];
|
return ((__v4si)__X)[__N & 3];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
|
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
|
||||||
@ -69,42 +313,351 @@ extern __inline __m128i
|
|||||||
#endif
|
#endif
|
||||||
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
|
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
|
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
|
||||||
|
#else
|
||||||
const __v16qu __seven = vec_splats((unsigned char)0x07);
|
const __v16qu __seven = vec_splats((unsigned char)0x07);
|
||||||
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
|
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
|
||||||
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
|
return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
|
||||||
|
__v16qu __pcv[] = {
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
};
|
||||||
|
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
|
||||||
|
return (__m128)__r;
|
||||||
|
}
|
||||||
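A brief illustration (inputs assumed) of the immediate encoding that the permute table above implements: bit i of __imm8 selects lane i from __B, while a clear bit keeps the lane from __A.

/* Illustrative check of the blend immediate: 0x5 takes lanes 0 and 2 from B. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);    /* lanes 0..3 = 0,1,2,3 */
  __m128 b = _mm_set_ps(30.0f, 20.0f, 10.0f, 0.5f); /* lanes 0..3 = .5,10,20,30 */
  __m128 r = _mm_blend_ps(a, b, 0x5);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0.5 1 20 3 */
  return 0;
}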
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
|
||||||
|
#else
|
||||||
|
const __v4si __zero = {0};
|
||||||
|
const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
|
||||||
|
return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
|
||||||
|
__v16qu __pcv[] = {
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
|
||||||
|
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
|
||||||
|
return (__m128d)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
|
||||||
|
#else
|
||||||
|
const __v2di __zero = {0};
|
||||||
|
const __vector __bool long long __boolmask =
|
||||||
|
vec_cmplt((__v2di)__mask, __zero);
|
||||||
|
return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline int
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_testz_si128(__m128i __A, __m128i __B) {
|
||||||
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline int
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_testc_si128(__m128i __A, __m128i __B) {
|
||||||
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
|
||||||
|
return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline int
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_testnzc_si128(__m128i __A, __m128i __B) {
|
||||||
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||||
|
return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
|
||||||
|
|
||||||
|
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
|
||||||
|
|
||||||
|
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_min_epi8(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
_mm_min_epu16(__m128i __X, __m128i __Y) {
|
||||||
__v16qi result = (__v16qi)__A;
|
return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
|
||||||
result[__N & 0xf] = __D;
|
|
||||||
return (__m128i)result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
_mm_min_epi32(__m128i __X, __m128i __Y) {
|
||||||
__v4si result = (__v4si)__A;
|
return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
|
||||||
result[__N & 3] = __D;
|
|
||||||
return (__m128i)result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
_mm_min_epu32(__m128i __X, __m128i __Y) {
|
||||||
__v2di result = (__v2di)__A;
|
return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
|
||||||
result[__N & 1] = __D;
|
|
||||||
return (__m128i)result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epi8(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epu16(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epu32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_mullo_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_mul_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi8_epi16(__m128i __A) {
|
||||||
|
return (__m128i)vec_unpackh((__v16qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi8_epi32(__m128i __A) {
|
||||||
|
__A = (__m128i)vec_unpackh((__v16qi)__A);
|
||||||
|
return (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi8_epi64(__m128i __A) {
|
||||||
|
__A = (__m128i)vec_unpackh((__v16qi)__A);
|
||||||
|
__A = (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
return (__m128i)vec_unpackh((__v4si)__A);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi16_epi32(__m128i __A) {
|
||||||
|
return (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi16_epi64(__m128i __A) {
|
||||||
|
__A = (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
return (__m128i)vec_unpackh((__v4si)__A);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi32_epi64(__m128i __A) {
|
||||||
|
return (__m128i)vec_unpackh((__v4si)__A);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu8_epi16(__m128i __A) {
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu8_epi32(__m128i __A) {
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu8_epi64(__m128i __A) {
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu16_epi32(__m128i __A) {
|
||||||
|
const __v8hu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu16_epi64(__m128i __A) {
|
||||||
|
const __v8hu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu32_epi64(__m128i __A) {
|
||||||
|
const __v4su __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__A, __zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return horizontal packed word minimum and its index in bits [15:0]
|
||||||
|
and bits [18:16] respectively. */
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_minpos_epu16(__m128i __A) {
|
||||||
|
union __u {
|
||||||
|
__m128i __m;
|
||||||
|
__v8hu __uh;
|
||||||
|
};
|
||||||
|
union __u __u = {.__m = __A}, __r = {.__m = {0}};
|
||||||
|
unsigned short __ridx = 0;
|
||||||
|
unsigned short __rmin = __u.__uh[__ridx];
|
||||||
|
unsigned long __i;
|
||||||
|
for (__i = 1; __i < 8; __i++) {
|
||||||
|
if (__u.__uh[__i] < __rmin) {
|
||||||
|
__rmin = __u.__uh[__i];
|
||||||
|
__ridx = __i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__r.__uh[0] = __rmin;
|
||||||
|
__r.__uh[1] = __ridx;
|
||||||
|
return __r.__m;
|
||||||
|
}
|
||||||
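An illustrative sketch of the result layout produced above (inputs assumed): the minimum of the eight unsigned shorts lands in element 0 and its index in element 1, with the remaining elements zero.

/* Illustrative check of _mm_minpos_epu16: min value 3 sits at index 5. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi16(9, 8, 7, 6, 5, 3, 4, 10);
  __m128i r = _mm_minpos_epu16(v);
  printf("min=%d idx=%d\n", _mm_extract_epi16(r, 0), _mm_extract_epi16(r, 1));
  /* expected: min=3 idx=5 */
  return 0;
}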
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_packus_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#include_next <smmintrin.h>
|
#include_next <smmintrin.h>
|
||||||
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
#endif /* defined(__ppc64__) &&
|
||||||
*/
|
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
|
||||||
|
|
||||||
#endif /* _SMMINTRIN_H_ */
|
#endif /* SMMINTRIN_H_ */
|
||||||
|
|||||||
677
lib/include/ppc_wrappers/tmmintrin.h
vendored
677
lib/include/ppc_wrappers/tmmintrin.h
vendored
@ -25,7 +25,8 @@
|
|||||||
#ifndef TMMINTRIN_H_
|
#ifndef TMMINTRIN_H_
|
||||||
#define TMMINTRIN_H_
|
#define TMMINTRIN_H_
|
||||||
|
|
||||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
#if defined(__ppc64__) && \
|
||||||
|
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
|
||||||
|
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
|
||||||
@ -33,63 +34,55 @@
|
|||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_epi16 (__m128i __A)
|
_mm_abs_epi16(__m128i __A) {
|
||||||
{
|
return (__m128i)vec_abs((__v8hi)__A);
|
||||||
return (__m128i) vec_abs ((__v8hi) __A);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_epi32 (__m128i __A)
|
_mm_abs_epi32(__m128i __A) {
|
||||||
{
|
return (__m128i)vec_abs((__v4si)__A);
|
||||||
return (__m128i) vec_abs ((__v4si) __A);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_epi8 (__m128i __A)
|
_mm_abs_epi8(__m128i __A) {
|
||||||
{
|
return (__m128i)vec_abs((__v16qi)__A);
|
||||||
return (__m128i) vec_abs ((__v16qi) __A);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_pi16 (__m64 __A)
|
_mm_abs_pi16(__m64 __A) {
|
||||||
{
|
__v8hi __B = (__v8hi)(__v2du){__A, __A};
|
||||||
__v8hi __B = (__v8hi) (__v2du) { __A, __A };
|
return (__m64)((__v2du)vec_abs(__B))[0];
|
||||||
return (__m64) ((__v2du) vec_abs (__B))[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_pi32 (__m64 __A)
|
_mm_abs_pi32(__m64 __A) {
|
||||||
{
|
__v4si __B = (__v4si)(__v2du){__A, __A};
|
||||||
__v4si __B = (__v4si) (__v2du) { __A, __A };
|
return (__m64)((__v2du)vec_abs(__B))[0];
|
||||||
return (__m64) ((__v2du) vec_abs (__B))[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_pi8 (__m64 __A)
|
_mm_abs_pi8(__m64 __A) {
|
||||||
{
|
__v16qi __B = (__v16qi)(__v2du){__A, __A};
|
||||||
__v16qi __B = (__v16qi) (__v2du) { __A, __A };
|
return (__m64)((__v2du)vec_abs(__B))[0];
|
||||||
return (__m64) ((__v2du) vec_abs (__B))[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
|
_mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
|
||||||
{
|
if (__builtin_constant_p(__count) && __count < 16) {
|
||||||
if (__builtin_constant_p (__count) && __count < 16)
|
|
||||||
{
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
__A = (__m128i) vec_reve ((__v16qu) __A);
|
__A = (__m128i)vec_reve((__v16qu)__A);
|
||||||
__B = (__m128i) vec_reve ((__v16qu) __B);
|
__B = (__m128i)vec_reve((__v16qu)__B);
|
||||||
#endif
|
#endif
|
||||||
__A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
|
__A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
__A = (__m128i) vec_reve ((__v16qu) __A);
|
__A = (__m128i)vec_reve((__v16qu)__A);
|
||||||
#endif
|
#endif
|
||||||
return __A;
|
return __A;
|
||||||
}
|
}
|
||||||
@ -97,400 +90,364 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
|
|||||||
if (__count == 0)
|
if (__count == 0)
|
||||||
return __B;
|
return __B;
|
||||||
|
|
||||||
if (__count >= 16)
|
if (__count >= 16) {
|
||||||
{
|
if (__count >= 32) {
|
||||||
if (__count >= 32)
|
const __v16qu __zero = {0};
|
||||||
{
|
return (__m128i)__zero;
|
||||||
const __v16qu zero = { 0 };
|
} else {
|
||||||
return (__m128i) zero;
|
const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
const __v16qu __shift =
|
|
||||||
vec_splats ((unsigned char) ((__count - 16) * 8));
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
return (__m128i) vec_sro ((__v16qu) __A, __shift);
|
return (__m128i)vec_sro((__v16qu)__A, __shift);
|
||||||
#else
|
#else
|
||||||
return (__m128i) vec_slo ((__v16qu) __A, __shift);
|
return (__m128i)vec_slo((__v16qu)__A, __shift);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else
|
const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
|
||||||
{
|
const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
|
||||||
const __v16qu __shiftA =
|
|
||||||
vec_splats ((unsigned char) ((16 - __count) * 8));
|
|
||||||
const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
__A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
|
__A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
|
||||||
__B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
|
__B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
|
||||||
#else
|
#else
|
||||||
__A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
|
__A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
|
||||||
__B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
|
__B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
|
||||||
#endif
|
#endif
|
||||||
return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
|
return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
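A short illustration (inputs assumed) of the byte-alignment logic above: the 32-byte concatenation A:B is shifted right by __count bytes and the low 16 bytes are returned, with counts of 16 or more handled by the shift-only and all-zero paths.

/* Illustrative check of _mm_alignr_epi8 with __count = 4: the result is
   bytes 4..15 of B followed by bytes 0..3 of A. */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                            24, 25, 26, 27, 28, 29, 30, 31);
  __m128i b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                            8, 9, 10, 11, 12, 13, 14, 15);
  __m128i r = _mm_alignr_epi8(a, b, 4);
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 16; i++)
    printf("%d ", out[i]); /* 4 5 ... 15 16 17 18 19 */
  printf("\n");
  return 0;
}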
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
|
_mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
|
||||||
{
|
if (__count < 16) {
|
||||||
if (__count < 16)
|
__v2du __C = {__B, __A};
|
||||||
{
|
|
||||||
__v2du __C = { __B, __A };
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
const __v4su __shift = { __count << 3, 0, 0, 0 };
|
const __v4su __shift = {__count << 3, 0, 0, 0};
|
||||||
__C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
|
__C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
|
||||||
#else
|
#else
|
||||||
const __v4su __shift = { 0, 0, 0, __count << 3 };
|
const __v4su __shift = {0, 0, 0, __count << 3};
|
||||||
__C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
|
__C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
|
||||||
#endif
|
#endif
|
||||||
return (__m64) __C[0];
|
return (__m64)__C[0];
|
||||||
}
|
} else {
|
||||||
else
|
const __m64 __zero = {0};
|
||||||
{
|
|
||||||
const __m64 __zero = { 0 };
|
|
||||||
return __zero;
|
return __zero;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_epi16 (__m128i __A, __m128i __B)
|
_mm_hadd_epi16(__m128i __A, __m128i __B) {
|
||||||
{
|
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
|
||||||
const __v16qu __P =
|
16, 17, 20, 21, 24, 25, 28, 29};
|
||||||
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
|
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
|
||||||
const __v16qu __Q =
|
18, 19, 22, 23, 26, 27, 30, 31};
|
||||||
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
|
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
|
||||||
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
|
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
|
||||||
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
|
return (__m128i)vec_add(__C, __D);
|
||||||
return (__m128i) vec_add (__C, __D);
|
|
||||||
}
|
}
|
||||||
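For clarity on the lane layout produced by the horizontal adds above (inputs assumed): adjacent pairs of __A fill the low four lanes of the result and adjacent pairs of __B fill the high four.

/* Illustrative check of _mm_hadd_epi16 pairwise sums. */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  __m128i r = _mm_hadd_epi16(a, b);
  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]); /* 3 7 11 15 30 70 110 150 */
  printf("\n");
  return 0;
}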
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_epi32 (__m128i __A, __m128i __B)
|
_mm_hadd_epi32(__m128i __A, __m128i __B) {
|
||||||
{
|
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
|
||||||
const __v16qu __P =
|
16, 17, 18, 19, 24, 25, 26, 27};
|
||||||
{ 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
|
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
|
||||||
const __v16qu __Q =
|
20, 21, 22, 23, 28, 29, 30, 31};
|
||||||
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
|
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
|
||||||
__v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
|
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
|
||||||
__v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
|
return (__m128i)vec_add(__C, __D);
|
||||||
return (__m128i) vec_add (__C, __D);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_pi16 (__m64 __A, __m64 __B)
|
_mm_hadd_pi16(__m64 __A, __m64 __B) {
|
||||||
{
|
__v8hi __C = (__v8hi)(__v2du){__A, __B};
|
||||||
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
|
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
|
||||||
const __v16qu __P =
|
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
|
||||||
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
|
__v8hi __D = vec_perm(__C, __C, __Q);
|
||||||
const __v16qu __Q =
|
__C = vec_perm(__C, __C, __P);
|
||||||
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
|
__C = vec_add(__C, __D);
|
||||||
__v8hi __D = vec_perm (__C, __C, __Q);
|
return (__m64)((__v2du)__C)[1];
|
||||||
__C = vec_perm (__C, __C, __P);
|
|
||||||
__C = vec_add (__C, __D);
|
|
||||||
return (__m64) ((__v2du) __C)[1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_pi32 (__m64 __A, __m64 __B)
|
_mm_hadd_pi32(__m64 __A, __m64 __B) {
|
||||||
{
|
__v4si __C = (__v4si)(__v2du){__A, __B};
|
||||||
__v4si __C = (__v4si) (__v2du) { __A, __B };
|
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
|
||||||
const __v16qu __P =
|
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
|
||||||
{ 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
|
__v4si __D = vec_perm(__C, __C, __Q);
|
||||||
const __v16qu __Q =
|
__C = vec_perm(__C, __C, __P);
|
||||||
{ 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
|
__C = vec_add(__C, __D);
|
||||||
__v4si __D = vec_perm (__C, __C, __Q);
|
return (__m64)((__v2du)__C)[1];
|
||||||
__C = vec_perm (__C, __C, __P);
|
|
||||||
__C = vec_add (__C, __D);
|
|
||||||
return (__m64) ((__v2du) __C)[1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadds_epi16 (__m128i __A, __m128i __B)
|
_mm_hadds_epi16(__m128i __A, __m128i __B) {
|
||||||
{
|
__v4si __C = {0}, __D = {0};
|
||||||
__v4si __C = { 0 }, __D = { 0 };
|
__C = vec_sum4s((__v8hi)__A, __C);
|
||||||
__C = vec_sum4s ((__v8hi) __A, __C);
|
__D = vec_sum4s((__v8hi)__B, __D);
|
||||||
__D = vec_sum4s ((__v8hi) __B, __D);
|
__C = (__v4si)vec_packs(__C, __D);
|
||||||
__C = (__v4si) vec_packs (__C, __D);
|
return (__m128i)__C;
|
||||||
return (__m128i) __C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadds_pi16 (__m64 __A, __m64 __B)
|
_mm_hadds_pi16(__m64 __A, __m64 __B) {
|
||||||
{
|
const __v4si __zero = {0};
|
||||||
const __v4si __zero = { 0 };
|
__v8hi __C = (__v8hi)(__v2du){__A, __B};
|
||||||
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
|
__v4si __D = vec_sum4s(__C, __zero);
|
||||||
__v4si __D = vec_sum4s (__C, __zero);
|
__C = vec_packs(__D, __D);
|
||||||
__C = vec_packs (__D, __D);
|
return (__m64)((__v2du)__C)[1];
|
||||||
return (__m64) ((__v2du) __C)[1];
|
|
||||||
}
|
}
|
||||||

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */
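
A minimal usage sketch of the horizontal-add wrapper above (not taken from the commit): it assumes a powerpc64le toolchain with the ppc_wrappers directory on the include path, a VSX-capable -mcpu setting, and -DNO_WARN_X86_INTRINSICS; the variable names and values are illustrative.

/* demo.c: pairwise 16-bit sums via the vec_perm/vec_add emulation above. */
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 wrapper: _mm_set_epi16, _mm_storeu_si128 */
#include <tmmintrin.h>   /* SSSE3 wrapper shown above */

int main(void) {
  __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
  __m128i b = _mm_set_epi16(16, 15, 14, 13, 12, 11, 10, 9);
  short out[8];
  /* out[0..3] = pairwise sums of a, out[4..7] = pairwise sums of b. */
  _mm_storeu_si128((__m128i *)out, _mm_hadd_epi16(a, b));
  printf("%d %d\n", out[0], out[7]); /* expected: 3 31 */
  return 0;
}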

17 lib/include/ppc_wrappers/x86gprintrin.h vendored Normal file
@ -0,0 +1,17 @@
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef X86GPRINTRIN_H_
#define X86GPRINTRIN_H_

#include <bmiintrin.h>

#include <bmi2intrin.h>

#endif /* X86GPRINTRIN_H_ */
28 lib/include/ppc_wrappers/x86intrin.h vendored Normal file
@ -0,0 +1,28 @@
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef X86INTRIN_H_
#define X86INTRIN_H_

#ifdef __ALTIVEC__
#include <immintrin.h>
#endif /* __ALTIVEC__ */

#endif /* X86INTRIN_H_ */
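
For illustration only, one way a port might satisfy the #error gate above: either pass -DNO_WARN_X86_INTRINSICS on the command line or define the macro before the include. The snippet below is a hypothetical porting stub, not part of the commit.

/* Hypothetical powerpc64le port acknowledging the warning above. */
#define NO_WARN_X86_INTRINSICS 1
#include <x86intrin.h> /* pulls in immintrin.h when __ALTIVEC__ is defined */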
1989 lib/include/ppc_wrappers/xmmintrin.h vendored
File diff suppressed because it is too large.
57 lib/include/rdpruintrin.h vendored Normal file
@ -0,0 +1,57 @@
/*===---- rdpruintrin.h - RDPRU intrinsics ---------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H
#error "Never use <rdpruintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __RDPRUINTRIN_H
#define __RDPRUINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("rdpru")))


/// Reads the content of a processor register.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> RDPRU </c> instruction.
///
/// \param reg_id
///    A processor register identifier.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpru (int reg_id)
{
  return __builtin_ia32_rdpru(reg_id);
}

#define __RDPRU_MPERF 0
#define __RDPRU_APERF 1

/// Reads the content of processor register MPERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register MPERF.
#define __mperf() __builtin_ia32_rdpru(__RDPRU_MPERF)

/// Reads the content of processor register APERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register APERF.
#define __aperf() __builtin_ia32_rdpru(__RDPRU_APERF)

#undef __DEFAULT_FN_ATTRS

#endif /* __RDPRUINTRIN_H */
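
A small sketch (not from the commit) of how the new intrinsics might be used to estimate the APERF/MPERF ratio; it assumes an AMD CPU with RDPRU support (Zen 2 or newer), compilation with -mrdpru or an -march that implies it, and the busy-work placeholder is illustrative.

#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  unsigned long long mperf0 = __mperf(), aperf0 = __aperf();
  /* ... some busy work here ... */
  unsigned long long mperf1 = __mperf(), aperf1 = __aperf();
  printf("effective/base clock ratio ~= %f\n",
         (double)(aperf1 - aperf0) / (double)(mperf1 - mperf0));
  return 0;
}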
6 lib/include/rdseedintrin.h vendored
@ -20,20 +20,20 @@
 static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed16_step(unsigned short *__p)
 {
-  return __builtin_ia32_rdseed16_step(__p);
+  return (int) __builtin_ia32_rdseed16_step(__p);
 }

 static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed32_step(unsigned int *__p)
 {
-  return __builtin_ia32_rdseed32_step(__p);
+  return (int) __builtin_ia32_rdseed32_step(__p);
 }

 #ifdef __x86_64__
 static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed64_step(unsigned long long *__p)
 {
-  return __builtin_ia32_rdseed64_step(__p);
+  return (int) __builtin_ia32_rdseed64_step(__p);
 }
 #endif
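
Since _rdseed*_step reports entropy underflow by returning 0, callers normally retry; a hedged sketch (assumes RDSEED-capable hardware and -mrdseed), not part of the commit:

#include <immintrin.h>

static unsigned int get_seed32(void) {
  unsigned int value;
  while (!_rdseed32_step(&value))
    ; /* underflow: no seed available yet, try again */
  return value;
}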

95319 lib/include/riscv_vector.h vendored
File diff suppressed because it is too large.
2 lib/include/rtmintrin.h vendored
@ -29,7 +29,7 @@
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _xbegin(void)
 {
-  return __builtin_ia32_xbegin();
+  return (unsigned int)__builtin_ia32_xbegin();
 }

 static __inline__ void __DEFAULT_FN_ATTRS
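
The usual pattern around _xbegin() pairs it with _XBEGIN_STARTED and _xend() from the same header (reachable via <immintrin.h>) plus a non-transactional fallback; a sketch assuming RTM-capable hardware and -mrtm, not taken from the commit:

#include <immintrin.h>

int counter;

void increment(void) {
  if (_xbegin() == _XBEGIN_STARTED) {
    ++counter;                          /* runs inside the transaction */
    _xend();
  } else {
    __sync_fetch_and_add(&counter, 1);  /* fallback when the transaction aborts */
  }
}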

291 lib/include/smmintrin.h vendored
@ -17,7 +17,9 @@
 #include <tmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),        \
+                 __min_vector_width__(128)))

 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT 0x00
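
For context, the rounding-control macros in this hunk feed the SSE4.1 rounding intrinsics defined later in the same header; a short sketch assuming -msse4.1 (_MM_FROUND_NO_EXC comes from the same macro block), not part of the commit:

#include <smmintrin.h>

__m128 round_to_nearest(__m128 v) {
  /* Round each lane to the nearest integer without raising exceptions. */
  return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}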
@ -276,8 +278,8 @@
|
|||||||
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
|
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
|
||||||
/// values.
|
/// values.
|
||||||
#define _mm_round_ss(X, Y, M) \
|
#define _mm_round_ss(X, Y, M) \
|
||||||
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
|
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
|
||||||
(__v4sf)(__m128)(Y), (M)))
|
(M)))
|
||||||
|
|
||||||
/// Rounds each element of the 128-bit vector of [2 x double] to an
|
/// Rounds each element of the 128-bit vector of [2 x double] to an
|
||||||
/// integer value according to the rounding control specified by the second
|
/// integer value according to the rounding control specified by the second
|
||||||
@ -351,8 +353,8 @@
|
|||||||
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
|
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
|
||||||
/// values.
|
/// values.
|
||||||
#define _mm_round_sd(X, Y, M) \
|
#define _mm_round_sd(X, Y, M) \
|
||||||
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
|
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
|
||||||
(__v2df)(__m128d)(Y), (M)))
|
(M)))
|
||||||
|
|
||||||
/* SSE4 Packed Blending Intrinsics. */
|
/* SSE4 Packed Blending Intrinsics. */
|
||||||
/// Returns a 128-bit vector of [2 x double] where the values are
|
/// Returns a 128-bit vector of [2 x double] where the values are
|
||||||
@ -380,7 +382,7 @@
|
|||||||
/// is copied to the same position in the result.
|
/// is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
||||||
#define _mm_blend_pd(V1, V2, M) \
|
#define _mm_blend_pd(V1, V2, M) \
|
||||||
((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
|
((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
|
||||||
(__v2df)(__m128d)(V2), (int)(M)))
|
(__v2df)(__m128d)(V2), (int)(M)))
|
||||||
|
|
||||||
/// Returns a 128-bit vector of [4 x float] where the values are selected
|
/// Returns a 128-bit vector of [4 x float] where the values are selected
|
||||||
@ -408,8 +410,8 @@
|
|||||||
/// is copied to the same position in the result.
|
/// is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
||||||
#define _mm_blend_ps(V1, V2, M) \
|
#define _mm_blend_ps(V1, V2, M) \
|
||||||
((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
|
((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
|
||||||
(__v4sf)(__m128)(V2), (int)(M)))
|
(int)(M)))
|
||||||
|
|
||||||
/// Returns a 128-bit vector of [2 x double] where the values are
|
/// Returns a 128-bit vector of [2 x double] where the values are
|
||||||
/// selected from either the first or second operand as specified by the
|
/// selected from either the first or second operand as specified by the
|
||||||
@ -431,10 +433,10 @@
|
|||||||
/// position in the result. When a mask bit is 1, the corresponding 64-bit
|
/// position in the result. When a mask bit is 1, the corresponding 64-bit
|
||||||
/// element in operand \a __V2 is copied to the same position in the result.
|
/// element in operand \a __V2 is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
||||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
|
||||||
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
|
__m128d __V2,
|
||||||
{
|
__m128d __M) {
|
||||||
return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
|
return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
|
||||||
(__v2df)__M);
|
(__v2df)__M);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -458,10 +460,10 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
|
|||||||
/// position in the result. When a mask bit is 1, the corresponding 32-bit
|
/// position in the result. When a mask bit is 1, the corresponding 32-bit
|
||||||
/// element in operand \a __V2 is copied to the same position in the result.
|
/// element in operand \a __V2 is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
||||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
|
||||||
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
|
__m128 __V2,
|
||||||
{
|
__m128 __M) {
|
||||||
return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
|
return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
|
||||||
(__v4sf)__M);
|
(__v4sf)__M);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -485,10 +487,10 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
|
|||||||
/// position in the result. When a mask bit is 1, the corresponding 8-bit
|
/// position in the result. When a mask bit is 1, the corresponding 8-bit
|
||||||
/// element in operand \a __V2 is copied to the same position in the result.
|
/// element in operand \a __V2 is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [16 x i8] containing the copied values.
|
/// \returns A 128-bit vector of [16 x i8] containing the copied values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
|
||||||
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
__m128i __V2,
|
||||||
{
|
__m128i __M) {
|
||||||
return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
|
return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
|
||||||
(__v16qi)__M);
|
(__v16qi)__M);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -517,7 +519,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
|||||||
/// is copied to the same position in the result.
|
/// is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
|
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
|
||||||
#define _mm_blend_epi16(V1, V2, M) \
|
#define _mm_blend_epi16(V1, V2, M) \
|
||||||
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
|
((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
|
||||||
(__v8hi)(__m128i)(V2), (int)(M)))
|
(__v8hi)(__m128i)(V2), (int)(M)))
|
||||||
|
|
||||||
/* SSE4 Dword Multiply Instructions. */
|
/* SSE4 Dword Multiply Instructions. */
|
||||||
@ -534,10 +536,9 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit integer vector.
|
/// A 128-bit integer vector.
|
||||||
/// \returns A 128-bit integer vector containing the products of both operands.
|
/// \returns A 128-bit integer vector containing the products of both operands.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
|
||||||
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)((__v4su)__V1 * (__v4su)__V2);
|
||||||
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multiplies corresponding even-indexed elements of two 128-bit
|
/// Multiplies corresponding even-indexed elements of two 128-bit
|
||||||
@ -554,10 +555,9 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// A 128-bit vector of [4 x i32].
|
/// A 128-bit vector of [4 x i32].
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the products of both
|
/// \returns A 128-bit vector of [2 x i64] containing the products of both
|
||||||
/// operands.
|
/// operands.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
|
||||||
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
|
||||||
return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Floating Point Dot Product Instructions. */
|
/* SSE4 Floating Point Dot Product Instructions. */
|
||||||
@ -594,8 +594,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// in the corresponding element; otherwise that element is set to zero.
|
/// in the corresponding element; otherwise that element is set to zero.
|
||||||
/// \returns A 128-bit vector of [4 x float] containing the dot product.
|
/// \returns A 128-bit vector of [4 x float] containing the dot product.
|
||||||
#define _mm_dp_ps(X, Y, M) \
|
#define _mm_dp_ps(X, Y, M) \
|
||||||
((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
|
((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
|
||||||
(__v4sf)(__m128)(Y), (M)))
|
|
||||||
|
|
||||||
/// Computes the dot product of the two 128-bit vectors of [2 x double]
|
/// Computes the dot product of the two 128-bit vectors of [2 x double]
|
||||||
/// and returns it in the elements of the 128-bit result vector of
|
/// and returns it in the elements of the 128-bit result vector of
|
||||||
@ -629,8 +628,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// each [2 x double] vector. If a bit is set, the dot product is returned in
|
/// each [2 x double] vector. If a bit is set, the dot product is returned in
|
||||||
/// the corresponding element; otherwise that element is set to zero.
|
/// the corresponding element; otherwise that element is set to zero.
|
||||||
#define _mm_dp_pd(X, Y, M) \
|
#define _mm_dp_pd(X, Y, M) \
|
||||||
((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
|
((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
|
||||||
(__v2df)(__m128d)(Y), (M)))
|
(M)))
|
||||||
|
|
||||||
/* SSE4 Streaming Load Hint Instruction. */
|
/* SSE4 Streaming Load Hint Instruction. */
|
||||||
/// Loads integer values from a 128-bit aligned memory location to a
|
/// Loads integer values from a 128-bit aligned memory location to a
|
||||||
@ -646,9 +645,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \returns A 128-bit integer vector containing the data stored at the
|
/// \returns A 128-bit integer vector containing the data stored at the
|
||||||
/// specified memory location.
|
/// specified memory location.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
_mm_stream_load_si128 (__m128i const *__V)
|
_mm_stream_load_si128(__m128i const *__V) {
|
||||||
{
|
return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
|
||||||
return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Packed Integer Min/Max Instructions. */
|
/* SSE4 Packed Integer Min/Max Instructions. */
|
||||||
@ -665,10 +663,9 @@ _mm_stream_load_si128 (__m128i const *__V)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [16 x i8]
|
/// A 128-bit vector of [16 x i8]
|
||||||
/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
|
/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
|
||||||
_mm_min_epi8 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -684,10 +681,9 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [16 x i8].
|
/// A 128-bit vector of [16 x i8].
|
||||||
/// \returns A 128-bit vector of [16 x i8] containing the greater values.
|
/// \returns A 128-bit vector of [16 x i8] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
|
||||||
_mm_max_epi8 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -703,10 +699,9 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [8 x u16].
|
/// A 128-bit vector of [8 x u16].
|
||||||
/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
|
/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
|
||||||
_mm_min_epu16 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -722,10 +717,9 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [8 x u16].
|
/// A 128-bit vector of [8 x u16].
|
||||||
/// \returns A 128-bit vector of [8 x u16] containing the greater values.
|
/// \returns A 128-bit vector of [8 x u16] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
|
||||||
_mm_max_epu16 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -741,10 +735,9 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x i32].
|
/// A 128-bit vector of [4 x i32].
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
|
/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
|
||||||
_mm_min_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -760,10 +753,9 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x i32].
|
/// A 128-bit vector of [4 x i32].
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the greater values.
|
/// \returns A 128-bit vector of [4 x i32] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
|
||||||
_mm_max_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -779,10 +771,9 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x u32].
|
/// A 128-bit vector of [4 x u32].
|
||||||
/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
|
/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
|
||||||
_mm_min_epu32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -798,10 +789,9 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x u32].
|
/// A 128-bit vector of [4 x u32].
|
||||||
/// \returns A 128-bit vector of [4 x u32] containing the greater values.
|
/// \returns A 128-bit vector of [4 x u32] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
|
||||||
_mm_max_epu32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
|
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
|
||||||
@ -870,20 +860,23 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 11: Bits [127:96] of parameter \a X are returned.
|
/// 11: Bits [127:96] of parameter \a X are returned.
|
||||||
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
|
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
|
||||||
#define _mm_extract_ps(X, N) \
|
#define _mm_extract_ps(X, N) \
|
||||||
__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
|
__builtin_bit_cast( \
|
||||||
|
int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
|
||||||
|
|
||||||
/* Miscellaneous insert and extract macros. */
|
/* Miscellaneous insert and extract macros. */
|
||||||
/* Extract a single-precision float from X at index N into D. */
|
/* Extract a single-precision float from X at index N into D. */
|
||||||
#define _MM_EXTRACT_FLOAT(D, X, N) \
|
#define _MM_EXTRACT_FLOAT(D, X, N) \
|
||||||
do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
|
do { \
|
||||||
|
(D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
|
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
|
||||||
an index suitable for _mm_insert_ps. */
|
an index suitable for _mm_insert_ps. */
|
||||||
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
|
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
|
||||||
|
|
||||||
/* Extract a float from X at index N into the first index of the return. */
|
/* Extract a float from X at index N into the first index of the return. */
|
||||||
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
|
#define _MM_PICK_OUT_PS(X, N) \
|
||||||
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
|
_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
|
||||||
|
|
||||||
/* Insert int into packed integer array at index. */
|
/* Insert int into packed integer array at index. */
|
||||||
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
|
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
|
||||||
@ -927,8 +920,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 1111: Bits [127:120] of the result are used for insertion.
|
/// 1111: Bits [127:120] of the result are used for insertion.
|
||||||
/// \returns A 128-bit integer vector containing the constructed values.
|
/// \returns A 128-bit integer vector containing the constructed values.
|
||||||
#define _mm_insert_epi8(X, I, N) \
|
#define _mm_insert_epi8(X, I, N) \
|
||||||
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
|
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \
|
||||||
(int)(I), (int)(N)))
|
(int)(N)))
|
||||||
|
|
||||||
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
|
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
|
||||||
/// the 128-bit integer vector parameter, and then inserting the 32-bit
|
/// the 128-bit integer vector parameter, and then inserting the 32-bit
|
||||||
@ -959,8 +952,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 11: Bits [127:96] of the result are used for insertion.
|
/// 11: Bits [127:96] of the result are used for insertion.
|
||||||
/// \returns A 128-bit integer vector containing the constructed values.
|
/// \returns A 128-bit integer vector containing the constructed values.
|
||||||
#define _mm_insert_epi32(X, I, N) \
|
#define _mm_insert_epi32(X, I, N) \
|
||||||
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
|
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \
|
||||||
(int)(I), (int)(N)))
|
(int)(N)))
|
||||||
|
|
||||||
#ifdef __x86_64__
|
#ifdef __x86_64__
|
||||||
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
|
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
|
||||||
@ -990,8 +983,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 1: Bits [127:64] of the result are used for insertion. \n
|
/// 1: Bits [127:64] of the result are used for insertion. \n
|
||||||
/// \returns A 128-bit integer vector containing the constructed values.
|
/// \returns A 128-bit integer vector containing the constructed values.
|
||||||
#define _mm_insert_epi64(X, I, N) \
|
#define _mm_insert_epi64(X, I, N) \
|
||||||
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
|
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \
|
||||||
(long long)(I), (int)(N)))
|
(int)(N)))
|
||||||
#endif /* __x86_64__ */
|
#endif /* __x86_64__ */
|
||||||
|
|
||||||
/* Extract int from packed integer array at index. This returns the element
|
/* Extract int from packed integer array at index. This returns the element
|
||||||
@ -1061,7 +1054,6 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
#define _mm_extract_epi32(X, N) \
|
#define _mm_extract_epi32(X, N) \
|
||||||
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
|
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
|
||||||
|
|
||||||
#ifdef __x86_64__
|
|
||||||
/// Extracts a 64-bit element from the 128-bit integer vector of
|
/// Extracts a 64-bit element from the 128-bit integer vector of
|
||||||
/// [2 x i64], using the immediate value parameter \a N as a selector.
|
/// [2 x i64], using the immediate value parameter \a N as a selector.
|
||||||
///
|
///
|
||||||
@ -1071,7 +1063,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// long long _mm_extract_epi64(__m128i X, const int N);
|
/// long long _mm_extract_epi64(__m128i X, const int N);
|
||||||
/// \endcode
|
/// \endcode
|
||||||
///
|
///
|
||||||
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
|
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
|
||||||
|
/// in 64-bit mode.
|
||||||
///
|
///
|
||||||
/// \param X
|
/// \param X
|
||||||
/// A 128-bit integer vector.
|
/// A 128-bit integer vector.
|
||||||
@ -1083,7 +1076,6 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \returns A 64-bit integer.
|
/// \returns A 64-bit integer.
|
||||||
#define _mm_extract_epi64(X, N) \
|
#define _mm_extract_epi64(X, N) \
|
||||||
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
|
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
|
||||||
#endif /* __x86_64 */
|
|
||||||
|
|
||||||
/* SSE4 128-bit Packed Integer Comparisons. */
|
/* SSE4 128-bit Packed Integer Comparisons. */
|
||||||
/// Tests whether the specified bits in a 128-bit integer vector are all
|
/// Tests whether the specified bits in a 128-bit integer vector are all
|
||||||
@@ -1098,9 +1090,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 /// \param __V
 ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_testz_si128(__m128i __M, __m128i __V)
-{
+static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
+                                                         __m128i __V) {
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
@ -1116,9 +1107,8 @@ _mm_testz_si128(__m128i __M, __m128i __V)
|
|||||||
/// \param __V
|
/// \param __V
|
||||||
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
||||||
/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
|
/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
|
||||||
static __inline__ int __DEFAULT_FN_ATTRS
|
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
|
||||||
_mm_testc_si128(__m128i __M, __m128i __V)
|
__m128i __V) {
|
||||||
{
|
|
||||||
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
|
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1135,9 +1125,8 @@ _mm_testc_si128(__m128i __M, __m128i __V)
|
|||||||
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
||||||
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
|
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
|
||||||
/// FALSE otherwise.
|
/// FALSE otherwise.
|
||||||
static __inline__ int __DEFAULT_FN_ATTRS
|
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
|
||||||
_mm_testnzc_si128(__m128i __M, __m128i __V)
|
__m128i __V) {
|
||||||
{
|
|
||||||
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
|
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1193,7 +1182,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 /// \param V
 ///    A 128-bit integer vector selecting which bits to test in operand \a M.
 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
 
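A hedged usage sketch for the PTEST helpers (illustrative only, not part of this commit; the helper name is invented, and _mm_set_epi64x comes from the SSE2 header that smmintrin.h already includes):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

int low_qword_is_zero(__m128i v) {
  /* Mask selecting only bits [63:0]; _mm_testz_si128 returns 1 (ZF set)
     when (v & mask) is all zeros. _mm_test_all_zeros(v, mask) is equivalent. */
  __m128i mask = _mm_set_epi64x(0, -1);
  return _mm_testz_si128(v, mask);
}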
 /* SSE4 64-bit Packed Integer Comparisons. */
 /// Compares each of the corresponding 64-bit values of the 128-bit
@ -1208,9 +1197,8 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit integer vector.
|
/// A 128-bit integer vector.
|
||||||
/// \returns A 128-bit integer vector containing the comparison results.
|
/// \returns A 128-bit integer vector containing the comparison results.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
|
||||||
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
|
||||||
return (__m128i)((__v2di)__V1 == (__v2di)__V2);
|
return (__m128i)((__v2di)__V1 == (__v2di)__V2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1225,15 +1213,16 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
-///    extended to 16-bit values.
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+///    sign-extended to 16-bit values.
 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi8_epi16(__m128i __V)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+  return (__m128i) __builtin_convertvector(
+      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
+                              7),
+      __v8hi);
 }
 
 /// Sign-extends each of the lower four 8-bit integer elements of a
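A small sketch of the sign-extension intrinsic documented above (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

__m128i widen_low_eight_bytes(__m128i bytes) {
  /* Sign-extend the low eight 8-bit lanes to eight 16-bit lanes (PMOVSXBW);
     e.g. the byte 0x80 (-128 as signed char) becomes 0xFF80. */
  return _mm_cvtepi8_epi16(bytes);
}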
@ -1249,12 +1238,11 @@ _mm_cvtepi8_epi16(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
||||||
/// sign-extended to 32-bit values.
|
/// sign-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
|
||||||
_mm_cvtepi8_epi32(__m128i __V)
|
|
||||||
{
|
|
||||||
/* This function always performs a signed extension, but __v16qi is a char
|
/* This function always performs a signed extension, but __v16qi is a char
|
||||||
which may be signed or unsigned, so use __v16qs. */
|
which may be signed or unsigned, so use __v16qs. */
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
|
return (__m128i) __builtin_convertvector(
|
||||||
|
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower two 8-bit integer elements of a
|
/// Sign-extends each of the lower two 8-bit integer elements of a
|
||||||
@ -1270,12 +1258,11 @@ _mm_cvtepi8_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
||||||
/// sign-extended to 64-bit values.
|
/// sign-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
|
||||||
_mm_cvtepi8_epi64(__m128i __V)
|
|
||||||
{
|
|
||||||
/* This function always performs a signed extension, but __v16qi is a char
|
/* This function always performs a signed extension, but __v16qi is a char
|
||||||
which may be signed or unsigned, so use __v16qs. */
|
which may be signed or unsigned, so use __v16qs. */
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
|
return (__m128i) __builtin_convertvector(
|
||||||
|
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower four 16-bit integer elements of a
|
/// Sign-extends each of the lower four 16-bit integer elements of a
|
||||||
@ -1291,10 +1278,9 @@ _mm_cvtepi8_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
||||||
/// sign-extended to 32-bit values.
|
/// sign-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
|
||||||
_mm_cvtepi16_epi32(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower two 16-bit integer elements of a
|
/// Sign-extends each of the lower two 16-bit integer elements of a
|
||||||
@ -1310,10 +1296,9 @@ _mm_cvtepi16_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
||||||
/// sign-extended to 64-bit values.
|
/// sign-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
|
||||||
_mm_cvtepi16_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower two 32-bit integer elements of a
|
/// Sign-extends each of the lower two 32-bit integer elements of a
|
||||||
@ -1329,10 +1314,9 @@ _mm_cvtepi16_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
||||||
/// sign-extended to 64-bit values.
|
/// sign-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
|
||||||
_mm_cvtepi32_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Packed Integer Zero-Extension. */
|
/* SSE4 Packed Integer Zero-Extension. */
|
||||||
@ -1349,10 +1333,11 @@ _mm_cvtepi32_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
|
||||||
/// zero-extended to 16-bit values.
|
/// zero-extended to 16-bit values.
|
||||||
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
|
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
|
||||||
_mm_cvtepu8_epi16(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
|
7),
|
||||||
|
__v8hi);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower four 8-bit integer elements of a
|
/// Zero-extends each of the lower four 8-bit integer elements of a
|
||||||
@ -1368,10 +1353,9 @@ _mm_cvtepu8_epi16(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
||||||
/// zero-extended to 32-bit values.
|
/// zero-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
|
||||||
_mm_cvtepu8_epi32(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower two 8-bit integer elements of a
|
/// Zero-extends each of the lower two 8-bit integer elements of a
|
||||||
@ -1387,10 +1371,9 @@ _mm_cvtepu8_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
||||||
/// zero-extended to 64-bit values.
|
/// zero-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
|
||||||
_mm_cvtepu8_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower four 16-bit integer elements of a
|
/// Zero-extends each of the lower four 16-bit integer elements of a
|
||||||
@ -1406,10 +1389,9 @@ _mm_cvtepu8_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
||||||
/// zero-extended to 32-bit values.
|
/// zero-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
|
||||||
_mm_cvtepu16_epi32(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower two 16-bit integer elements of a
|
/// Zero-extends each of the lower two 16-bit integer elements of a
|
||||||
@ -1425,10 +1407,9 @@ _mm_cvtepu16_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
||||||
/// zero-extended to 64-bit values.
|
/// zero-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
|
||||||
_mm_cvtepu16_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower two 32-bit integer elements of a
|
/// Zero-extends each of the lower two 32-bit integer elements of a
|
||||||
@ -1444,10 +1425,9 @@ _mm_cvtepu16_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
||||||
/// zero-extended to 64-bit values.
|
/// zero-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
|
||||||
_mm_cvtepu32_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Pack with Unsigned Saturation. */
|
/* SSE4 Pack with Unsigned Saturation. */
|
||||||
@@ -1473,10 +1453,9 @@ _mm_cvtepu32_epi64(__m128i __V)
 ///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
 ///    are written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_packus_epi32(__m128i __V1, __m128i __V2)
-{
-  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
+                                                              __m128i __V2) {
+  return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
 }
 
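A sketch of the pack-with-unsigned-saturation intrinsic (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

__m128i pack_u16_saturated(__m128i lo4, __m128i hi4) {
  /* PACKUSDW: each signed 32-bit value is clamped to [0, 0xFFFF]; lo4 fills
     the lower four 16-bit lanes of the result, hi4 the upper four. */
  return _mm_packus_epi32(lo4, hi4);
}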
/* SSE4 Multiple Packed Sums of Absolute Difference. */
|
/* SSE4 Multiple Packed Sums of Absolute Difference. */
|
||||||
@ -1516,7 +1495,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
|
|||||||
/// \returns A 128-bit integer vector containing the sums of the sets of
|
/// \returns A 128-bit integer vector containing the sums of the sets of
|
||||||
/// absolute differences between both operands.
|
/// absolute differences between both operands.
|
||||||
#define _mm_mpsadbw_epu8(X, Y, M) \
|
#define _mm_mpsadbw_epu8(X, Y, M) \
|
||||||
((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
|
((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
|
||||||
(__v16qi)(__m128i)(Y), (M)))
|
(__v16qi)(__m128i)(Y), (M)))
|
||||||
|
|
||||||
/// Finds the minimum unsigned 16-bit element in the input 128-bit
|
/// Finds the minimum unsigned 16-bit element in the input 128-bit
|
||||||
@@ -1532,10 +1511,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 /// \returns A 128-bit value where bits [15:0] contain the minimum value found
 ///    in parameter \a __V, bits [18:16] contain the index of the minimum value
 ///    and the remaining bits are set to 0.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_minpos_epu16(__m128i __V)
-{
-  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
+  return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
 }
 
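A sketch showing how the packed result of _mm_minpos_epu16 is typically unpacked (illustrative only, not part of this commit; the helper name is invented, and _mm_extract_epi16 is the SSE2 extractor):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

void min_u16_and_index(__m128i v, unsigned *min_val, unsigned *min_idx) {
  __m128i r = _mm_minpos_epu16(v);                    /* PHMINPOSUW */
  *min_val = (unsigned)_mm_extract_epi16(r, 0);       /* bits [15:0]: minimum */
  *min_idx = (unsigned)_mm_extract_epi16(r, 1) & 0x7; /* bits [18:16]: index */
}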
/* Handle the sse4.2 definitions here. */
|
/* Handle the sse4.2 definitions here. */
|
||||||
@@ -1544,7 +1521,8 @@ _mm_minpos_epu16(__m128i __V)
    so we'll do the same. */
 
 #undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
 
 /* These specify the type of data that we're comparing. */
 #define _SIDD_UBYTE_OPS 0x00
@@ -2336,9 +2314,8 @@ _mm_minpos_epu16(__m128i __V)
 /// \param __V2
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
+                                                             __m128i __V2) {
   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
 }
 
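A sketch using the 64-bit greater-than comparison as a select mask (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. _mm_cmpgt_epi64 needs SSE4.2,
   the byte blend is SSE4.1. */
#include <nmmintrin.h>

__m128i max_epi64_sketch(__m128i a, __m128i b) {
  __m128i a_gt_b = _mm_cmpgt_epi64(a, b); /* all-ones in lanes where a > b */
  return _mm_blendv_epi8(b, a, a_gt_b);   /* pick a there, b elsewhere */
}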
lib/include/stdatomic.h (vendored): 7 lines changed
@@ -17,7 +17,8 @@
  * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
  * to the clang resource header until that is fully supported.
  */
-#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>) && !defined(_MSC_VER)
+#if __STDC_HOSTED__ && \
+    __has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
 # include_next <stdatomic.h>
 #else
 
@@ -158,10 +159,6 @@ typedef _Atomic(uintmax_t) atomic_uintmax_t;
 typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
 
 #define ATOMIC_FLAG_INIT { 0 }
-#if __cplusplus >= 202002L && !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
-/* ATOMIC_FLAG_INIT was deprecated in C++20 but is not deprecated in C. */
-#pragma clang deprecated(ATOMIC_FLAG_INIT)
-#endif
 
 /* These should be provided by the libc implementation. */
 #ifdef __cplusplus
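For context (not part of this commit): in C, ATOMIC_FLAG_INIT remains the portable initializer for atomic_flag, as in this minimal C11 spinlock sketch:

/* Illustrative only; not part of this commit. Standard C11. */
#include <stdatomic.h>

static atomic_flag lk = ATOMIC_FLAG_INIT;

void spin_lock(void) {
  while (atomic_flag_test_and_set_explicit(&lk, memory_order_acquire))
    ; /* spin */
}

void spin_unlock(void) {
  atomic_flag_clear_explicit(&lk, memory_order_release);
}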
lib/include/stdbool.h (vendored): 11 lines changed
@@ -10,8 +10,13 @@
 #ifndef __STDBOOL_H
 #define __STDBOOL_H
 
-/* Don't define bool, true, and false in C++, except as a GNU extension. */
-#ifndef __cplusplus
+#define __bool_true_false_are_defined 1
+
+#if __STDC_VERSION__ > 201710L
+/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
+ * to system headers which include this header file unconditionally.
+ */
+#elif !defined(__cplusplus)
 #define bool _Bool
 #define true 1
 #define false 0
@@ -26,6 +31,4 @@
 #endif
 #endif
 
-#define __bool_true_false_are_defined 1
-
 #endif /* __STDBOOL_H */
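For context (not part of this commit), a sketch that compiles unchanged whether bool, true, and false come from this header (C17 and earlier) or are language keywords (C2x), which is what the new __STDC_VERSION__ check anticipates:

/* Illustrative only; not part of this commit. */
#include <stdbool.h>

bool is_even(int x) { return x % 2 == 0; }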
lib/include/stddef.h (vendored): 2 lines changed
@@ -62,7 +62,7 @@ typedef __SIZE_TYPE__ rsize_t;
 #endif /* defined(__need_STDDEF_H_misc) */
 
 #if defined(__need_wchar_t)
-#ifndef __cplusplus
+#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
 /* Always define wchar_t when modules are available. */
 #if !defined(_WCHAR_T) || __has_feature(modules)
 #if !__has_feature(modules)
lib/include/stdnoreturn.h (vendored): 13 lines changed
@@ -13,4 +13,17 @@
 #define noreturn _Noreturn
 #define __noreturn_is_defined 1
 
+#if __STDC_VERSION__ > 201710L && \
+    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
+/* The noreturn macro is deprecated in C2x. We do not mark it as such because
+   including the header file in C2x is also deprecated and we do not want to
+   issue a confusing diagnostic for code which includes <stdnoreturn.h>
+   followed by code that writes [[noreturn]]. The issue with such code is not
+   with the attribute, or the use of 'noreturn', but the inclusion of the
+   header. */
+/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
+ * to system headers which include this header file unconditionally.
+ */
+#endif
+
 #endif /* __STDNORETURN_H */
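For context (not part of this commit), the C11 spelling this header still supports; the comment added above explains why no deprecation warning is issued yet. Helper name invented:

/* Illustrative only; not part of this commit. In C2x, [[noreturn]] is the
   preferred spelling and including this header is itself deprecated. */
#include <stdio.h>
#include <stdlib.h>
#include <stdnoreturn.h>

noreturn void die(const char *msg) {
  fputs(msg, stderr);
  abort();
}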
lib/include/uintrintrin.h (vendored): 16 lines changed
@@ -39,9 +39,9 @@ struct __uintr_frame
 ///
 /// This intrinsic corresponds to the <c> CLUI </c> instruction.
 ///
-/// \operation
+/// \code{.operation}
 /// UIF := 0
-/// \endoperation
+/// \endcode
 static __inline__ void __DEFAULT_FN_ATTRS
 _clui (void)
 {
@@ -60,9 +60,9 @@ _clui (void)
 ///
 /// This intrinsic corresponds to the <c> STUI </c> instruction.
 ///
-/// \operation
+/// \code{.operation}
 /// UIF := 1
-/// \endoperation
+/// \endcode
 static __inline__ void __DEFAULT_FN_ATTRS
 _stui (void)
 {
@@ -81,7 +81,7 @@ _stui (void)
 ///
 /// \returns The current value of the user interrupt flag (UIF).
 ///
-/// \operation
+/// \code{.operation}
 /// CF := UIF
 /// ZF := 0
 /// AF := 0
@@ -89,7 +89,7 @@ _stui (void)
 /// PF := 0
 /// SF := 0
 /// dst := CF
-/// \endoperation
+/// \endcode
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _testui (void)
 {
@@ -110,7 +110,7 @@ _testui (void)
 ///    Index of user-interrupt target table entry in user-interrupt target
 ///    table.
 ///
-/// \operation
+/// \code{.operation}
 /// IF __a > UITTSZ
 ///    GP (0)
 /// FI
@@ -143,7 +143,7 @@ _testui (void)
 ///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
 ///    FI
 /// FI
-/// \endoperation
+/// \endcode
 static __inline__ void __DEFAULT_FN_ATTRS
 _senduipi (unsigned long long __a)
 {
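A hedged sketch of the UIF intrinsics whose documentation markup changes above (illustrative only, not part of this commit; assumes a UINTR-capable CPU, compiling with -muintr, OS support for user interrupts, and that uintrintrin.h is reachable through x86gprintrin.h):

/* Illustrative only; not part of this commit. All names below are declared
   in the diffed header; the availability assumptions are noted above. */
#include <x86gprintrin.h>

unsigned char block_user_interrupts_briefly(void) {
  _clui();          /* CLUI: clear UIF, user interrupts blocked */
  /* ... short critical section ... */
  _stui();          /* STUI: set UIF again */
  return _testui(); /* TESTUI: returns the current UIF value */
}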
lib/include/unwind.h (vendored): 13 lines changed
@@ -62,7 +62,8 @@ typedef intptr_t _sleb128_t;
 typedef uintptr_t _uleb128_t;
 
 struct _Unwind_Context;
-#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
+                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
 struct _Unwind_Control_Block;
 typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
 #else
@@ -72,7 +73,7 @@ typedef struct _Unwind_Exception _Unwind_Exception;
 typedef enum {
   _URC_NO_REASON = 0,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
-    !defined(__ARM_DWARF_EH__)
+    !defined(__ARM_DWARF_EH__) && !defined(__SEH__)
   _URC_OK = 0, /* used by ARM EHABI */
 #endif
   _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
@@ -86,7 +87,7 @@ typedef enum {
   _URC_INSTALL_CONTEXT = 7,
   _URC_CONTINUE_UNWIND = 8,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
-    !defined(__ARM_DWARF_EH__)
+    !defined(__ARM_DWARF_EH__) && !defined(__SEH__)
   _URC_FAILURE = 9 /* used by ARM EHABI */
 #endif
 } _Unwind_Reason_Code;
@@ -103,7 +104,8 @@ typedef enum {
 typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
                                              _Unwind_Exception *);
 
-#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
+                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
 typedef struct _Unwind_Control_Block _Unwind_Control_Block;
 typedef uint32_t _Unwind_EHT_Header;
 
@@ -167,7 +169,8 @@ typedef _Unwind_Personality_Fn __personality_routine;
 typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
                                                 void *);
 
-#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
+                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
 typedef enum {
   _UVRSC_CORE = 0, /* integer register */
   _UVRSC_VFP = 1,  /* vfp */
lib/include/velintrin.h (vendored, new file): 71 lines
@@ -0,0 +1,71 @@
/*===---- velintrin.h - VEL intrinsics for VE ------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __VEL_INTRIN_H__
#define __VEL_INTRIN_H__

// Vector registers
typedef double __vr __attribute__((__vector_size__(2048)));

// Vector mask registers
#if __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
typedef _Bool __vm512 __attribute__((ext_vector_type(512)));
#else
#ifdef __cplusplus
// For C++
typedef bool __vm __attribute__((ext_vector_type(256)));
typedef bool __vm256 __attribute__((ext_vector_type(256)));
typedef bool __vm512 __attribute__((ext_vector_type(512)));
#else
#error need C++ or C99 to use vector intrinsics for VE
#endif
#endif

enum VShuffleCodes {
  VE_VSHUFFLE_YUYU = 0,
  VE_VSHUFFLE_YUYL = 1,
  VE_VSHUFFLE_YUZU = 2,
  VE_VSHUFFLE_YUZL = 3,
  VE_VSHUFFLE_YLYU = 4,
  VE_VSHUFFLE_YLYL = 5,
  VE_VSHUFFLE_YLZU = 6,
  VE_VSHUFFLE_YLZL = 7,
  VE_VSHUFFLE_ZUYU = 8,
  VE_VSHUFFLE_ZUYL = 9,
  VE_VSHUFFLE_ZUZU = 10,
  VE_VSHUFFLE_ZUZL = 11,
  VE_VSHUFFLE_ZLYU = 12,
  VE_VSHUFFLE_ZLYL = 13,
  VE_VSHUFFLE_ZLZU = 14,
  VE_VSHUFFLE_ZLZL = 15,
};

// Use generated intrinsic name definitions
#include <velintrin_gen.h>

// Use helper functions
#include <velintrin_approx.h>

// pack

#define _vel_pack_f32p __builtin_ve_vl_pack_f32p
#define _vel_pack_f32a __builtin_ve_vl_pack_f32a

static inline unsigned long int _vel_pack_i32(unsigned int a, unsigned int b) {
  return (((unsigned long int)a) << 32) | b;
}

#define _vel_extract_vm512u(vm) __builtin_ve_vl_extract_vm512u(vm)
#define _vel_extract_vm512l(vm) __builtin_ve_vl_extract_vm512l(vm)
#define _vel_insert_vm512u(vm512, vm) __builtin_ve_vl_insert_vm512u(vm512, vm)
#define _vel_insert_vm512l(vm512, vm) __builtin_ve_vl_insert_vm512l(vm512, vm)

#endif
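A sketch using the _vel_pack_i32 helper defined in this new header (illustrative only, not part of this commit; meaningful only when compiling for the NEC SX-Aurora VE target):

/* Illustrative only; not part of this commit. _vel_pack_i32(a, b) is defined
   above as ((unsigned long)a << 32) | b. */
#include <velintrin.h>

unsigned long int pack_example(void) {
  return _vel_pack_i32(0xDEADBEEFu, 0x01234567u); /* 0xDEADBEEF01234567 */
}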
lib/include/velintrin_approx.h (vendored, new file): 120 lines
@@ -0,0 +1,120 @@
|
|||||||
|
/*===---- velintrin_approx.h - VEL intrinsics helper for VE ----------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __VEL_INTRIN_APPROX_H__
|
||||||
|
#define __VEL_INTRIN_APPROX_H__
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivs_vvvl(__vr v0, __vr v1, int l) {
|
||||||
|
float s0;
|
||||||
|
__vr v2, v3, v4, v5;
|
||||||
|
v5 = _vel_vrcps_vvl(v1, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
v4 = _vel_vfnmsbs_vsvvl(s0, v1, v5, l);
|
||||||
|
v3 = _vel_vfmads_vvvvl(v5, v5, v4, l);
|
||||||
|
v2 = _vel_vfmuls_vvvl(v0, v3, l);
|
||||||
|
v4 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
|
||||||
|
v2 = _vel_vfmads_vvvvl(v2, v5, v4, l);
|
||||||
|
v0 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
|
||||||
|
v0 = _vel_vfmads_vvvvl(v2, v3, v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_pvfdiv_vvvl(__vr v0, __vr v1, int l) {
|
||||||
|
float s0;
|
||||||
|
__vr v2, v3, v4, v5;
|
||||||
|
v5 = _vel_pvrcp_vvl(v1, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
v4 = _vel_pvfnmsb_vsvvl(s0, v1, v5, l);
|
||||||
|
v3 = _vel_pvfmad_vvvvl(v5, v5, v4, l);
|
||||||
|
v2 = _vel_pvfmul_vvvl(v0, v3, l);
|
||||||
|
v4 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
|
||||||
|
v2 = _vel_pvfmad_vvvvl(v2, v5, v4, l);
|
||||||
|
v0 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
|
||||||
|
v0 = _vel_pvfmad_vvvvl(v2, v3, v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivs_vsvl(float s0, __vr v0, int l) {
|
||||||
|
float s1;
|
||||||
|
__vr v1, v2, v3, v4;
|
||||||
|
v4 = _vel_vrcps_vvl(v0, l);
|
||||||
|
s1 = 1.0;
|
||||||
|
v2 = _vel_vfnmsbs_vsvvl(s1, v0, v4, l);
|
||||||
|
v2 = _vel_vfmads_vvvvl(v4, v4, v2, l);
|
||||||
|
v1 = _vel_vfmuls_vsvl(s0, v2, l);
|
||||||
|
v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
|
||||||
|
v1 = _vel_vfmads_vvvvl(v1, v4, v3, l);
|
||||||
|
v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
|
||||||
|
v0 = _vel_vfmads_vvvvl(v1, v2, v3, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivs_vvsl(__vr v0, float s0, int l) {
|
||||||
|
float s1;
|
||||||
|
__vr v1, v2;
|
||||||
|
s1 = 1.0f / s0;
|
||||||
|
v1 = _vel_vfmuls_vsvl(s1, v0, l);
|
||||||
|
v2 = _vel_vfnmsbs_vvsvl(v0, s0, v1, l);
|
||||||
|
v0 = _vel_vfmads_vvsvl(v1, s1, v2, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivd_vsvl(double s0, __vr v0, int l) {
|
||||||
|
__vr v1, v2, v3;
|
||||||
|
v2 = _vel_vrcpd_vvl(v0, l);
|
||||||
|
double s1 = 1.0;
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
|
||||||
|
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
|
||||||
|
v1 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
|
||||||
|
v1 = _vel_vfmadd_vvvvl(v2, v2, v1, l);
|
||||||
|
v1 = _vel_vaddul_vsvl(1, v1, l);
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s1, v0, v1, l);
|
||||||
|
v3 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
|
||||||
|
v1 = _vel_vfmuld_vsvl(s0, v3, l);
|
||||||
|
v0 = _vel_vfnmsbd_vsvvl(s0, v1, v0, l);
|
||||||
|
v0 = _vel_vfmadd_vvvvl(v1, v3, v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfsqrtd_vvl(__vr v0, int l) {
|
||||||
|
double s0, s1;
|
||||||
|
__vr v1, v2, v3;
|
||||||
|
v2 = _vel_vrsqrtdnex_vvl(v0, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
s1 = 0.5;
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfsqrts_vvl(__vr v0, int l) {
|
||||||
|
float s0, s1;
|
||||||
|
__vr v1, v2, v3;
|
||||||
|
v0 = _vel_vcvtds_vvl(v0, l);
|
||||||
|
v2 = _vel_vrsqrtdnex_vvl(v0, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
s1 = 0.5;
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
|
||||||
|
v0 = _vel_vcvtsd_vvl(v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
lib/include/velintrin_gen.h (vendored, new file): 1257 lines
(File diff suppressed because it is too large.)
lib/include/wasm_simd128.h (vendored): 4 lines changed
@@ -1405,12 +1405,12 @@ wasm_f64x2_convert_low_u32x4(v128_t __a) {
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_trunc_sat_f64x2_zero(v128_t __a) {
-  return (v128_t)__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4((__f64x2)__a);
+  return (v128_t)__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4((__f64x2)__a);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u32x4_trunc_sat_f64x2_zero(v128_t __a) {
-  return (v128_t)__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4((__f64x2)__a);
+  return (v128_t)__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4((__f64x2)__a);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
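A sketch of the public wrapper whose underlying builtin is renamed above; the rename is an internal detail and user code is unaffected (illustrative only, not part of this commit; assumes a WebAssembly target with -msimd128 and that wasm_f64x2_make is available in this header):

/* Illustrative only; not part of this commit. */
#include <wasm_simd128.h>

v128_t doubles_to_saturated_i32(double a, double b) {
  v128_t v = wasm_f64x2_make(a, b);
  /* Lanes 0-1 hold the saturated conversions; lanes 2-3 are zero. */
  return wasm_i32x4_trunc_sat_f64x2_zero(v);
}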
lib/include/x86gprintrin.h (vendored): 26 lines changed
@@ -25,11 +25,29 @@
 #include <crc32intrin.h>
 #endif
 
-#define __SSC_MARK(Tag) \
-  __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
+#if defined(__i386__)
+#define __FULLBX "ebx"
+#define __TMPGPR "eax"
+#else
+// When in 64-bit target, the 32-bit operands generate a 32-bit result,
+// zero-extended to a 64-bit result in the destination general-purpose,
+// It means "mov x %ebx" will clobber the higher 32 bits of rbx, so we
+// should preserve the 64-bit register rbx.
+#define __FULLBX "rbx"
+#define __TMPGPR "rax"
+#endif
+
+#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
+
+#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
+#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
+
+#define __SSC_MARK(__Tag) \
+  __asm__ __volatile__( __SAVE_GPRBX \
                        "mov {%0, %%ebx|ebx, %0}; " \
                        ".byte 0x64, 0x67, 0x90; " \
-                       "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
-                       : "%eax");
+                       __RESTORE_GPRBX \
+                       ::"i"(__Tag) \
+                       : __TMPGPR );
 
 #endif /* __X86GPRINTRIN_H */
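A sketch of the __SSC_MARK macro reworked above, which emits the "ebx mark" NOP sequence that binary-instrumentation tools recognize as region markers (illustrative only, not part of this commit; the 0x111/0x222 start/stop tag values follow a convention used by Intel SDE examples and are an assumption here, any compile-time constant works):

/* Illustrative only; not part of this commit. Tag values are assumptions. */
#include <x86gprintrin.h>

void region_of_interest(void) {
  __SSC_MARK(0x111); /* begin marked region */
  /* ... code under analysis ... */
  __SSC_MARK(0x222); /* end marked region */
}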
lib/include/x86intrin.h (vendored): 4 lines changed
@@ -59,5 +59,9 @@
 #include <clzerointrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__RDPRU__)
+#include <rdpruintrin.h>
+#endif
+
 #endif /* __X86INTRIN_H */
lib/include/xmmintrin.h (vendored): 12 lines changed
@@ -2086,7 +2086,7 @@ _mm_storer_ps(float *__p, __m128 __a)
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// void _mm_prefetch(const void * a, const int sel);
+/// void _mm_prefetch(const void *a, const int sel);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
@@ -2360,7 +2360,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    00: assigned from bits [15:0] of \a a. \n
 ///    01: assigned from bits [31:16] of \a a. \n
 ///    10: assigned from bits [47:32] of \a a. \n
-///    11: assigned from bits [63:48] of \a a.
+///    11: assigned from bits [63:48] of \a a. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) \
   ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
@@ -2602,7 +2605,10 @@ void _mm_setcsr(unsigned int __i);
 ///    00: Bits [31:0] copied from the specified operand. \n
 ///    01: Bits [63:32] copied from the specified operand. \n
 ///    10: Bits [95:64] copied from the specified operand. \n
-///    11: Bits [127:96] copied from the specified operand.
+///    11: Bits [127:96] copied from the specified operand. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) \
   ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
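A sketch of the _MM_SHUFFLE usage that the added documentation note describes (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. */
#include <xmmintrin.h>

__m128 reverse_lanes(__m128 a) {
  /* _MM_SHUFFLE(0, 1, 2, 3) selects source lanes 3, 2, 1, 0 into result
     lanes 0, 1, 2, 3, i.e. it reverses the vector. */
  return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3));
}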