update C language headers to clang release/14.x

upstream commit 91632c8ac97fa3daffe4ff8f1391735b5d6805e6
Andrew Kelley 2022-02-03 14:18:29 -07:00
parent 60954598e9
commit 397e055ddd
72 changed files with 113784 additions and 133285 deletions


@ -16,7 +16,7 @@
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp declare target
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
#else
@ -26,7 +26,7 @@
// To make the algorithms available for C and C++ in CUDA and OpenMP we select
// different but equivalent function versions. TODO: For OpenMP we currently
// select the native builtins as the overload support for templates is lacking.
#if !defined(__OPENMP_NVPTX__)
#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
#define _ISNANd std::isnan
#define _ISNANf std::isnan
#define _ISINFd std::isinf
@ -276,7 +276,7 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
#undef _fmaxd
#undef _fmaxf
#ifdef __OPENMP_NVPTX__
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp end declare target
#endif
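For orientation, an illustrative sketch (not part of the upstream header; the function name is invented): clang lowers `_Complex float` division to the compiler-rt helper __divsc3 shown in the hunk above, which is why this header has to provide a device-side definition of it.

// Sketch only: the '/' below is lowered by clang to a call to
// __divsc3 with the real and imaginary parts of __a and __b.
__device__ float _Complex __demo_cdiv(float _Complex __a, float _Complex __b) {
  return __a / __b;
}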


@ -483,4 +483,36 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline uint32_t __nvvm_get_smem_pointer(void *__ptr) {
return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
#endif // CUDA_VERSION >= 11000
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)


@ -16,6 +16,7 @@ extern "C" {
#if defined(__OPENMP_NVPTX__)
#define __DEVICE__
#pragma omp begin assumes ext_spmd_amenable no_openmp
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
@ -456,6 +457,11 @@ __DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);
#if defined(__OPENMP_NVPTX__)
#pragma omp end assumes ext_spmd_amenable no_openmp
#endif
#if defined(__cplusplus)
} // extern "C"
#endif


@ -345,4 +345,4 @@ __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
#pragma pop_macro("__DEVICE_VOID__")
#pragma pop_macro("__FAST_OR_SLOW")
#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#endif // __CLANG_CUDA_MATH_H__


@ -41,6 +41,7 @@
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__
// Preserve common macros that will be changed below by us or by CUDA
@ -64,9 +65,9 @@
#endif
// Make largest subset of device functions available during host
// compilation -- SM_35 for the time being.
// compilation.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#define __CUDA_ARCH__ 9999
#endif
#include "__clang_cuda_builtin_vars.h"
@ -205,11 +206,6 @@ inline __host__ double __signbitd(double x) {
#endif
#if CUDA_VERSION >= 9000
// CUDA-9.2 needs host-side memcpy for some host functions in
// device_functions.hpp
#if CUDA_VERSION >= 9020
#include <string.h>
#endif
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
@ -275,7 +271,38 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#undef __CUDABE__
#endif
#include "sm_20_atomic_functions.hpp"
// Predicate functions used in `__builtin_assume` need to have no side effects.
// However, sm_20_intrinsics.hpp defines them with neither the pure nor the
// const attribute. Rename the definitions from sm_20_intrinsics.hpp and
// re-define them as const ones.
#pragma push_macro("__isGlobal")
#pragma push_macro("__isShared")
#pragma push_macro("__isConstant")
#pragma push_macro("__isLocal")
#define __isGlobal __ignored_cuda___isGlobal
#define __isShared __ignored_cuda___isShared
#define __isConstant __ignored_cuda___isConstant
#define __isLocal __ignored_cuda___isLocal
#include "sm_20_intrinsics.hpp"
#pragma pop_macro("__isGlobal")
#pragma pop_macro("__isShared")
#pragma pop_macro("__isConstant")
#pragma pop_macro("__isLocal")
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__ __attribute__((const))
__DEVICE__ unsigned int __isGlobal(const void *p) {
return __nvvm_isspacep_global(p);
}
__DEVICE__ unsigned int __isShared(const void *p) {
return __nvvm_isspacep_shared(p);
}
__DEVICE__ unsigned int __isConstant(const void *p) {
return __nvvm_isspacep_const(p);
}
__DEVICE__ unsigned int __isLocal(const void *p) {
return __nvvm_isspacep_local(p);
}
#pragma pop_macro("__DEVICE__")
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
@ -330,6 +357,34 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#pragma pop_macro("__host__")
// __clang_cuda_texture_intrinsics.h must be included first in order to provide
// an implementation of __nv_tex_surf_handler that CUDA's headers depend on.
// The implementation requires C++11 and only works with CUDA-9 or newer.
#if __cplusplus >= 201103L && CUDA_VERSION >= 9000
// clang-format off
#include <__clang_cuda_texture_intrinsics.h>
// clang-format on
#else
#if CUDA_VERSION >= 9000
// Provide a hint that texture support needs C++11.
template <typename T> struct __nv_tex_needs_cxx11 {
const static bool value = false;
};
template <class T>
__host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr,
cudaTextureObject_t obj,
float x) {
_Static_assert(__nv_tex_needs_cxx11<T>::value,
"Texture support requires C++11");
}
#else
// Textures in CUDA-8 and older are not supported by clang. There's no
// convenient way to intercept texture use in these versions, so we can't
// produce a meaningful error. The source code that attempts to use textures
// will continue to fail as it does now.
#endif // CUDA_VERSION
#endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000
#include "texture_fetch_functions.h"
#include "texture_indirect_functions.h"
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.


@ -0,0 +1,740 @@
/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*
* This header provides in-header implementations for NVCC's built-in
* __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
* built-in is unusual as it's actually a set of function overloads that use the
* first string literal argument as one of the overload parameters.
*/
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// __nv_tex_surf_handler() provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...) \
::__cuda_tex::__tex_fetch< \
::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
__VA_ARGS__)
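To show how the macro is meant to be used, a hedged sketch (not part of the header; it assumes CUDA's cudaTextureObject_t and float4 types are in scope): CUDA's texture wrappers pass the operation name as the first argument, and the hash of that string selects the __tex_fetch specialization.

// Sketch of a caller, roughly what CUDA's texture wrappers do:
__device__ float4 __demo_tex1dfetch(cudaTextureObject_t __obj, int __x) {
  float4 __val;
  __nv_tex_surf_handler("__itex1Dfetch", &__val, __obj, __x);
  return __val;
}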
#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")
// Put all functions into an anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from files compiled with
// -fgpu-rdc. E.g. a library and an app using it may be built with a different
// version of this header file.
namespace {
// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {
// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We can not use string literals for that as that would
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the hash
// function has a collision for any of the texture operations, the compilation
// will fail due to an attempt to redefine a tag with the same value. If the
// header compiles, then the hash function is good enough for the job.
constexpr int __tex_len(const char *s) {
return (s[0] == 0) ? 0
: (s[1] == 0) ? 1
: (s[2] == 0) ? 2
: (s[3] == 0) ? 3
: (s[4] == 0) ? 4
: (s[5] == 0) ? 5
: (s[6] == 0) ? 6
: (s[7] == 0) ? 7
: (s[8] == 0) ? 8
: (s[9] == 0) ? 9
: (s[10] == 0) ? 10
: (s[11] == 0) ? 11
: (s[12] == 0) ? 12
: (s[13] == 0) ? 13
: (s[14] == 0) ? 14
: (s[15] == 0) ? 15
: (s[16] == 0) ? 16
: (s[17] == 0) ? 17
: (s[18] == 0) ? 18
: (s[19] == 0) ? 19
: (s[20] == 0) ? 20
: (s[21] == 0) ? 21
: (s[22] == 0) ? 22
: (s[23] == 0) ? 23
: (s[24] == 0) ? 24
: (s[25] == 0) ? 25
: (s[26] == 0) ? 26
: (s[27] == 0) ? 27
: (s[28] == 0) ? 28
: (s[29] == 0) ? 29
: (s[30] == 0) ? 30
: (s[31] == 0) ? 31
: 32;
}
constexpr int __tex_hash_map(int c) {
return (c == 49) ? 10
: (c == 50) ? 0
: (c == 51) ? 100
: (c == 52) ? 30
: (c == 67) ? 10
: (c == 68) ? 0
: (c == 69) ? 25
: (c == 72) ? 70
: (c == 77) ? 0
: (c == 96) ? 44
: (c == 99) ? 10
: (c == 100) ? 5
: (c == 101) ? 60
: (c == 102) ? 40
: (c == 103) ? 70
: (c == 104) ? 25
: (c == 112) ? 0
: (c == 114) ? 45
: (c == 117) ? 5
: (c == 118) ? 85
: (c == 120) ? 20
: 225;
}
constexpr int __tex_op_hash(const char *str) {
return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
__tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
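To make the tag selection concrete, a worked example (an illustrative addition, not upstream) tracing the constexpr helpers above for one operation name:

// Illustrative: for "__tex2D_v2",
//   __tex_len("__tex2D_v2")            == 10
//   __tex_hash_map('_' + 1)  /* 96 */  == 44   (str[7] is '_')
//   __tex_hash_map('D')      /* 68 */  ==  0   (str[6])
//   __tex_hash_map('2')      /* 50 */  ==  0   (str[5])
//   __tex_hash_map('2')      /* 50 */  ==  0   (last character)
// so the operation is dispatched through the type __Tag<54>.
static_assert(__tex_op_hash("__tex2D_v2") == 54, "illustrative hash check");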
// Tag type to identify particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant) \
__Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
using __base_t = float;
using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
using __base_t = char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
using __base_t = signed char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
using __base_t = unsigned char;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
using __base_t = short;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
using __base_t = unsigned short;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
using __base_t = int;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
using __base_t = unsigned int;
using __fetch_t = uint4;
};
// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
using __base_t = decltype(__T::x);
using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;
// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
// Construct inline assembly output args.
// Results are stored in a temp var __r.
// isResident bool is pointed to by __ir
// Asm args for return values. It's a 4-element vector
#define __ASM_OUT(__t) \
("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
template <> \
__device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
__rt __r; \
asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
return __r; \
}
// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
__asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to above, but returns a boolean 'isPresent' value in addition
// to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
__asm_args)
// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, uint4, __args, \
__asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
__asm_args)
// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Same, but for sparse ops. Only available on sm_60+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif
// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
__IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
// Convenience macros which convert a string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
__asm_args)
// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit it, instead of refining a copy.
#define __IMPL_ALIASI(__op, __opn) \
template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
"s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
(float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
(float __x, int __layer), "tex.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
(float __x, int __layer, float __dPdx, float __dPdy),
"tex.grad.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
(float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
"tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");
__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
" selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y)));
__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
"tex.grad.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
__IMPL_S3S("__itex2DGrad_sparse",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
(float __x, float __y, int __layer), "tex.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
__IMPL_S3S("__itex2DLayered_sparse",
(float __x, float __y, int __layer, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy),
"tex.grad.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
__IMPL_S3S(
"__itex2DLayeredGrad_sparse",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
(float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
__IMPL_S3S("__itex2DLayeredLod_sparse",
(float __x, float __y, int __layer, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
(float __x, float __y, float __level), "tex.level.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
__IMPL_S3S("__itex2DLod_sparse",
(float __x, float __y, float __level, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__level)));
// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr) \
__IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
__IDV("__tex2Dgather_rmnf_v2", variant), \
(float __x, float __y, int __comp), instr, "f32", \
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
__IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
__IDV("__tex2Dgather_v2", variant)); \
__IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
(float __x, float __y, unsigned char *__ir, int __comp), \
"{.reg .pred %%p0;\n\t" instr, "f32", \
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
"selp.u16 %4, 1, 0, %%p0; }", \
("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
// Umbrella dispatcher -- calls into specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
template <class __T>
__device__ static float4 __run(cudaTextureObject_t __obj, float __x,
float __y, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
unsigned char *__ir, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 1:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 2:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 3:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
}
}
};
#endif
// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
"tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");
__IMPL_S3S("__itex3D_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.3d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
__IMPL_S3S("__itex3DGrad_sparse",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.3d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
__IMPL_S3S("__itex3DLod_sparse",
(float __x, float __y, float __z, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
(float __x, float __y, float __z), "tex.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
__IMPL_S3S("__itexCubemap_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.cube.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
(float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
(float __x, float __y, float __z, int __layer, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
"f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
"f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
(float __x, float __y, float __z, int __layer, float __level),
"tex.level.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.cube.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
// Helper class for extracting slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
template <int __NElements = sizeof(__DestT) /
sizeof(typename __TypeInfoT<__DestT>::__base_t)>
__device__ static __DestT __run(__SrcT __v);
template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
template <> __device__ static __DestT __run<2>(__SrcT __v) {
return {__v.x, __v.y};
}
template <> __device__ static __DestT __run<3>(__SrcT __v) {
return {__v.x, __v.y, __v.z};
}
template <> __device__ static __DestT __run<4>(__SrcT __v) {
return {__v.x, __v.y, __v.z, __v.w};
}
};
// These are the top-level function overloads the __nv_tex_surf_handler expands
// to. Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::run` variant should be used to
// fetch texture data and which `__convert::run` is needed to convert it into
// appropriate return type.
// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
// and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
cudaTextureObject_t __obj;
asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
return __obj;
}
// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, float4>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeElementType> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__


@ -50,6 +50,9 @@ extern "C" {
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#else
typedef __SIZE_TYPE__ size_t;
// Define macros which are needed to declare HIP device API's without standard
@ -74,25 +77,35 @@ typedef __SIZE_TYPE__ __hip_size_t;
extern "C" {
#endif //__cplusplus
#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return (void *) __ockl_dm_alloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__ockl_dm_dealloc((unsigned long long)__ptr);
}
#else // HIP version check
#if __HIP_ENABLE_DEVICE_MALLOC__
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return __hip_malloc(__size);
}
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
return __hip_free(__ptr);
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__hip_free(__ptr);
}
#else
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
__builtin_trap();
return (void *)0;
}
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__builtin_trap();
return (void *)0;
}
#endif
#endif // HIP version check
#ifdef __cplusplus
} // extern "C"


@ -133,7 +133,7 @@ _mm_aesimc_si128(__m128i __V)
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
(__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
#undef __DEFAULT_FN_ATTRS
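The added outer parentheses make the macro behave like a function call in any expression context; a small illustrative example (an assumption-laden sketch, compiled with -maes):

#include <stddef.h>
#include <wmmintrin.h>

// Sketch: without the outer parentheses this would expand to
//   sizeof (__m128i)__builtin_ia32_aeskeygenassist128(...)
// which parses as sizeof(type-name) followed by a stray expression and fails
// to compile; with them the whole cast expression is the sizeof operand.
static size_t __demo_keygen_size(__m128i __k) {
  return sizeof _mm_aeskeygenassist_si128(__k, 0x01); // 16
}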

lib/include/altivec.h (vendored)

@ -19,6 +19,10 @@
#define __CR6_EQ_REV 1
#define __CR6_LT 2
#define __CR6_LT_REV 3
#define __CR6_GT 4
#define __CR6_GT_REV 5
#define __CR6_SO 6
#define __CR6_SO_REV 7
/* Constants for vec_test_data_class */
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
@ -1810,6 +1814,11 @@ vec_cmpeq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
return (vector bool __int128)__builtin_altivec_vcmpequq(
(vector bool __int128)__a, (vector bool __int128)__b);
}
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpeq(vector bool __int128 __a, vector bool __int128 __b) {
return (vector bool __int128)__builtin_altivec_vcmpequq(__a, __b);
}
#endif
#ifdef __POWER9_VECTOR__
@ -1887,6 +1896,11 @@ vec_cmpne(vector signed __int128 __a, vector signed __int128 __b) {
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(
(vector bool __int128)__a, (vector bool __int128)__b));
}
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpne(vector bool __int128 __a, vector bool __int128 __b) {
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(__a, __b));
}
#endif
/* vec_cmpnez */
@ -2472,7 +2486,7 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
#ifdef __POWER8_VECTOR__
/* vec_popcnt */
static __inline__ vector signed char __ATTRS_o_ai
static __inline__ vector unsigned char __ATTRS_o_ai
vec_popcnt(vector signed char __a) {
return __builtin_altivec_vpopcntb(__a);
}
@ -2480,7 +2494,7 @@ static __inline__ vector unsigned char __ATTRS_o_ai
vec_popcnt(vector unsigned char __a) {
return __builtin_altivec_vpopcntb(__a);
}
static __inline__ vector signed short __ATTRS_o_ai
static __inline__ vector unsigned short __ATTRS_o_ai
vec_popcnt(vector signed short __a) {
return __builtin_altivec_vpopcnth(__a);
}
@ -2488,7 +2502,7 @@ static __inline__ vector unsigned short __ATTRS_o_ai
vec_popcnt(vector unsigned short __a) {
return __builtin_altivec_vpopcnth(__a);
}
static __inline__ vector signed int __ATTRS_o_ai
static __inline__ vector unsigned int __ATTRS_o_ai
vec_popcnt(vector signed int __a) {
return __builtin_altivec_vpopcntw(__a);
}
@ -2496,7 +2510,7 @@ static __inline__ vector unsigned int __ATTRS_o_ai
vec_popcnt(vector unsigned int __a) {
return __builtin_altivec_vpopcntw(__a);
}
static __inline__ vector signed long long __ATTRS_o_ai
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_popcnt(vector signed long long __a) {
return __builtin_altivec_vpopcntd(__a);
}
@ -3049,13 +3063,10 @@ static __inline__ vector unsigned char __ATTRS_o_ai
vec_xl_len_r(const unsigned char *__a, size_t __b) {
vector unsigned char __res =
(vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
#ifdef __LITTLE_ENDIAN__
vector unsigned char __mask =
(vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
__res = (vector unsigned char)__builtin_altivec_vperm_4si(
return (vector unsigned char)__builtin_altivec_vperm_4si(
(vector int)__res, (vector int)__res, __mask);
#endif
return __res;
}
// vec_xst_len
@ -3130,15 +3141,11 @@ static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
unsigned char *__b,
size_t __c) {
#ifdef __LITTLE_ENDIAN__
vector unsigned char __mask =
(vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
vector unsigned char __res =
__builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
#else
return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
#endif
}
#endif
#endif
@ -7106,6 +7113,11 @@ vec_orc(vector float __a, vector bool int __b) {
return (vector float)((vector unsigned int)__a | ~__b);
}
static __inline__ vector float __ATTRS_o_ai vec_orc(vector float __a,
vector float __b) {
return (vector float)((vector unsigned int)__a | ~(vector unsigned int)__b);
}
static __inline__ vector signed long long __ATTRS_o_ai
vec_orc(vector signed long long __a, vector signed long long __b) {
return __a | ~__b;
@ -7150,6 +7162,12 @@ static __inline__ vector double __ATTRS_o_ai
vec_orc(vector bool long long __a, vector double __b) {
return (vector double)(__a | ~(vector unsigned long long)__b);
}
static __inline__ vector double __ATTRS_o_ai vec_orc(vector double __a,
vector double __b) {
return (vector double)((vector bool long long)__a |
~(vector unsigned long long)__b);
}
#endif
/* vec_vor */
@ -8399,9 +8417,20 @@ static __inline__ vector float __ATTRS_o_ai vec_round(vector float __a) {
}
#ifdef __VSX__
#ifdef __XL_COMPAT_ALTIVEC__
static __inline__ vector double __ATTRS_o_ai vec_rint(vector double __a);
static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
double __fpscr = __builtin_readflm();
__builtin_setrnd(0);
vector double __rounded = vec_rint(__a);
__builtin_setflm(__fpscr);
return __rounded;
}
#else
static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
return __builtin_vsx_xvrdpi(__a);
}
#endif
/* vec_rint */
@ -8839,7 +8868,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sl(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sl((vector unsigned long long)__a, __b);
}
#else
#elif defined(__VSX__)
static __inline__ vector unsigned char __ATTRS_o_ai
vec_vspltb(vector unsigned char __a, unsigned char __b);
static __inline__ vector unsigned long long __ATTRS_o_ai
@ -8885,7 +8914,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sl(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sl((vector unsigned long long)__a, __b);
}
#endif
#endif /* __VSX__ */
/* vec_vslb */
@ -10350,7 +10379,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sr(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sr((vector unsigned long long)__a, __b);
}
#else
#elif defined(__VSX__)
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
__b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
@ -10394,7 +10423,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sr(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sr((vector unsigned long long)__a, __b);
}
#endif
#endif /* __VSX__ */
/* vec_vsrb */
@ -10480,7 +10509,7 @@ static __inline__ vector unsigned long long __ATTRS_o_ai
vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
return (vector unsigned long long)((vector signed long long)__a >> __b);
}
#else
#elif defined(__VSX__)
static __inline__ vector signed long long __ATTRS_o_ai
vec_sra(vector signed long long __a, vector unsigned long long __b) {
__b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
@ -10492,7 +10521,7 @@ vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
__b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
return (vector unsigned long long)((vector signed long long)__a >> __b);
}
#endif
#endif /* __VSX__ */
/* vec_vsrab */
@ -13441,74 +13470,74 @@ vec_vxor(vector bool long long __a, vector bool long long __b) {
/* vec_extract */
static __inline__ signed char __ATTRS_o_ai vec_extract(vector signed char __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0xf];
}
static __inline__ unsigned char __ATTRS_o_ai
vec_extract(vector unsigned char __a, unsigned int __b) {
vec_extract(vector unsigned char __a, signed int __b) {
return __a[__b & 0xf];
}
static __inline__ unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0xf];
}
static __inline__ signed short __ATTRS_o_ai vec_extract(vector signed short __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x7];
}
static __inline__ unsigned short __ATTRS_o_ai
vec_extract(vector unsigned short __a, unsigned int __b) {
vec_extract(vector unsigned short __a, signed int __b) {
return __a[__b & 0x7];
}
static __inline__ unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x7];
}
static __inline__ signed int __ATTRS_o_ai vec_extract(vector signed int __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector bool int __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
#ifdef __VSX__
static __inline__ signed long long __ATTRS_o_ai
vec_extract(vector signed long long __a, unsigned int __b) {
vec_extract(vector signed long long __a, signed int __b) {
return __a[__b & 0x1];
}
static __inline__ unsigned long long __ATTRS_o_ai
vec_extract(vector unsigned long long __a, unsigned int __b) {
vec_extract(vector unsigned long long __a, signed int __b) {
return __a[__b & 0x1];
}
static __inline__ unsigned long long __ATTRS_o_ai
vec_extract(vector bool long long __a, unsigned int __b) {
vec_extract(vector bool long long __a, signed int __b) {
return __a[__b & 0x1];
}
static __inline__ double __ATTRS_o_ai vec_extract(vector double __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x1];
}
#endif
static __inline__ float __ATTRS_o_ai vec_extract(vector float __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
@ -13568,82 +13597,82 @@ vec_extract_fp32_from_shortl(vector unsigned short __a) {
static __inline__ vector signed char __ATTRS_o_ai
vec_insert(signed char __a, vector signed char __b, int __c) {
__b[__c] = __a;
__b[__c & 0xF] = __a;
return __b;
}
static __inline__ vector unsigned char __ATTRS_o_ai
vec_insert(unsigned char __a, vector unsigned char __b, int __c) {
__b[__c] = __a;
__b[__c & 0xF] = __a;
return __b;
}
static __inline__ vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
vector bool char __b,
int __c) {
__b[__c] = __a;
__b[__c & 0xF] = __a;
return __b;
}
static __inline__ vector signed short __ATTRS_o_ai
vec_insert(signed short __a, vector signed short __b, int __c) {
__b[__c] = __a;
__b[__c & 0x7] = __a;
return __b;
}
static __inline__ vector unsigned short __ATTRS_o_ai
vec_insert(unsigned short __a, vector unsigned short __b, int __c) {
__b[__c] = __a;
__b[__c & 0x7] = __a;
return __b;
}
static __inline__ vector bool short __ATTRS_o_ai
vec_insert(unsigned short __a, vector bool short __b, int __c) {
__b[__c] = __a;
__b[__c & 0x7] = __a;
return __b;
}
static __inline__ vector signed int __ATTRS_o_ai
vec_insert(signed int __a, vector signed int __b, int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
static __inline__ vector unsigned int __ATTRS_o_ai
vec_insert(unsigned int __a, vector unsigned int __b, int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
static __inline__ vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
vector bool int __b,
int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
#ifdef __VSX__
static __inline__ vector signed long long __ATTRS_o_ai
vec_insert(signed long long __a, vector signed long long __b, int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
static __inline__ vector bool long long __ATTRS_o_ai
vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
vector double __b,
int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
#endif
@ -13651,7 +13680,7 @@ static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
static __inline__ vector float __ATTRS_o_ai vec_insert(float __a,
vector float __b,
int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
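One visible effect of the new index masking, shown as a hedged example (the function name is invented): out-of-range element indices now wrap modulo the element count instead of writing past the end of the vector.

// Sketch: with "__b[__c & 0x3]" an index of 5 updates element 5 & 0x3 == 1.
static vector float __demo_insert(vector float __v, float __x) {
  return vec_insert(__x, __v, 5);
}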
@ -14812,42 +14841,43 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
#ifdef __VSX__
static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
vector signed long long __b) {
#ifdef __POWER8_VECTOR__
return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
#else
// No vcmpequd on Power7 so we xor the two vectors and compare against zero as
// 32-bit elements.
return vec_all_eq((vector signed int)vec_xor(__a, __b), (vector signed int)0);
#endif
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
vector long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
#endif
@ -14877,6 +14907,11 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
}
#endif
/* vec_all_ge */
@ -15822,6 +15857,11 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
}
#endif
/* vec_all_nge */
@ -16111,6 +16151,11 @@ static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
}
#endif
/* vec_any_ge */
@ -17020,43 +17065,43 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
#ifdef __VSX__
static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
vector signed long long __b) {
#ifdef __POWER8_VECTOR__
return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
#else
// Take advantage of the optimized sequence for vec_all_eq when vcmpequd is
// not available.
return !vec_all_eq(__a, __b);
#endif
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
(vector long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
(vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
vector signed long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
#endif
@ -17086,6 +17131,11 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
}
#endif
/* vec_any_nge */
@ -17203,6 +17253,7 @@ provided.
#define vec_ncipher_be __builtin_altivec_crypto_vncipher
#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
#ifdef __VSX__
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vsbox(vector unsigned long long __a) {
return __builtin_altivec_crypto_vsbox(__a);
@ -17231,6 +17282,7 @@ __builtin_crypto_vncipherlast(vector unsigned long long __a,
vector unsigned long long __b) {
return __builtin_altivec_crypto_vncipherlast(__a, __b);
}
#endif /* __VSX__ */
#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
@ -17346,12 +17398,22 @@ vec_vbpermq(vector unsigned char __a, vector unsigned char __b) {
}
#if defined(__powerpc64__) && defined(__SIZEOF_INT128__)
static __inline__ vector unsigned long long __attribute__((__always_inline__))
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
return __builtin_altivec_vbpermq((vector unsigned char)__a,
(vector unsigned char)__b);
}
#endif
static __inline__ vector unsigned char __ATTRS_o_ai
vec_bperm(vector unsigned char __a, vector unsigned char __b) {
return __builtin_altivec_vbpermq(__a, __b);
}
#endif // __POWER8_VECTOR__
#ifdef __POWER9_VECTOR__
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_bperm(vector unsigned long long __a, vector unsigned char __b) {
return __builtin_altivec_vbpermd(__a, __b);
}
#endif
@ -18198,13 +18260,13 @@ vec_expandm(vector unsigned __int128 __a) {
#define vec_cntm(__a, __mp) \
_Generic((__a), vector unsigned char \
: __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)), \
: __builtin_altivec_vcntmbb((__a), (unsigned char)(__mp)), \
vector unsigned short \
: __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)), \
: __builtin_altivec_vcntmbh((__a), (unsigned char)(__mp)), \
vector unsigned int \
: __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)), \
: __builtin_altivec_vcntmbw((__a), (unsigned char)(__mp)), \
vector unsigned long long \
: __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp)))
: __builtin_altivec_vcntmbd((__a), (unsigned char)(__mp)))
/* vec_gen[b|h|w|d|q]m */
@ -18319,10 +18381,10 @@ vec_cfuge(vector unsigned long long __a, vector unsigned long long __b) {
: __builtin_vsx_xxgenpcvdm((__a), (int)(__imm)))
#endif /* __VSX__ */
/* vec_clrl */
/* vec_clr_first */
static __inline__ vector signed char __ATTRS_o_ai
vec_clrl(vector signed char __a, unsigned int __n) {
vec_clr_first(vector signed char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrrb(__a, __n);
#else
@ -18331,7 +18393,7 @@ vec_clrl(vector signed char __a, unsigned int __n) {
}
static __inline__ vector unsigned char __ATTRS_o_ai
vec_clrl(vector unsigned char __a, unsigned int __n) {
vec_clr_first(vector unsigned char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrrb((vector signed char)__a, __n);
#else
@ -18339,10 +18401,10 @@ vec_clrl(vector unsigned char __a, unsigned int __n) {
#endif
}
/* vec_clrr */
/* vec_clr_last */
static __inline__ vector signed char __ATTRS_o_ai
vec_clrr(vector signed char __a, unsigned int __n) {
vec_clr_last(vector signed char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrlb(__a, __n);
#else
@ -18351,7 +18413,7 @@ vec_clrr(vector signed char __a, unsigned int __n) {
}
static __inline__ vector unsigned char __ATTRS_o_ai
vec_clrr(vector unsigned char __a, unsigned int __n) {
vec_clr_last(vector unsigned char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrlb((vector signed char)__a, __n);
#else
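
For callers tracking this rename, a minimal migration sketch (not part of the diff): only the spelling of the helpers changes, the arguments and the endian-dependent vclrrb/vclrlb dispatch shown above stay the same.

// Hedged sketch: clang-13 spelling on the left, clang-14 spelling on the right.
//   vec_clrl(v, n)  ->  vec_clr_first(v, n)
//   vec_clrr(v, n)  ->  vec_clr_last(v, n)
vector signed char clr_first_demo(vector signed char v, unsigned int n) {
  return vec_clr_first(v, n);   // previously spelled vec_clrl(v, n)
}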
@ -18733,36 +18795,39 @@ static __inline__ vector double __ATTRS_o_ai vec_splatid(const float __a) {
static __inline__ vector signed int __ATTRS_o_ai vec_splati_ins(
vector signed int __a, const unsigned int __b, const signed int __c) {
const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
__a[1 - __b] = __c;
__a[3 - __b] = __c;
__a[1 - __d] = __c;
__a[3 - __d] = __c;
#else
__a[__b] = __c;
__a[2 + __b] = __c;
__a[__d] = __c;
__a[2 + __d] = __c;
#endif
return __a;
}
static __inline__ vector unsigned int __ATTRS_o_ai vec_splati_ins(
vector unsigned int __a, const unsigned int __b, const unsigned int __c) {
const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
__a[1 - __b] = __c;
__a[3 - __b] = __c;
__a[1 - __d] = __c;
__a[3 - __d] = __c;
#else
__a[__b] = __c;
__a[2 + __b] = __c;
__a[__d] = __c;
__a[2 + __d] = __c;
#endif
return __a;
}
static __inline__ vector float __ATTRS_o_ai
vec_splati_ins(vector float __a, const unsigned int __b, const float __c) {
const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
__a[1 - __b] = __c;
__a[3 - __b] = __c;
__a[1 - __d] = __c;
__a[3 - __d] = __c;
#else
__a[__b] = __c;
__a[2 + __b] = __c;
__a[__d] = __c;
__a[2 + __d] = __c;
#endif
return __a;
}
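
The hunks above also fix the insert index: it is now masked with & 0x01 before indexing, so only the element pair selected by bit 0 of __b is written. A minimal sketch of the post-fix behaviour (not part of the diff), derived directly from the code above:

// Hedged sketch: with __d = __b & 0x01, vec_splati_ins writes elements
// (1 - __d) and (3 - __d) on little-endian targets, or (__d) and (2 + __d)
// on big-endian targets; any higher bits of __b are ignored after this change.
vector float splat_pair(vector float v) {
  return vec_splati_ins(v, 1, 42.0f);
}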
@ -18976,6 +19041,51 @@ vec_sra(vector signed __int128 __a, vector unsigned __int128 __b) {
#endif /* __SIZEOF_INT128__ */
#endif /* __POWER10_VECTOR__ */
#ifdef __POWER8_VECTOR__
#define __bcdadd(__a, __b, __ps) __builtin_ppc_bcdadd((__a), (__b), (__ps))
#define __bcdsub(__a, __b, __ps) __builtin_ppc_bcdsub((__a), (__b), (__ps))
static __inline__ long __bcdadd_ofl(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdadd_p(__CR6_SO, __a, __b);
}
static __inline__ long __bcdsub_ofl(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_SO, __a, __b);
}
static __inline__ long __bcd_invalid(vector unsigned char __a) {
return __builtin_ppc_bcdsub_p(__CR6_SO, __a, __a);
}
static __inline__ long __bcdcmpeq(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_EQ, __a, __b);
}
static __inline__ long __bcdcmplt(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_LT, __a, __b);
}
static __inline__ long __bcdcmpgt(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_GT, __a, __b);
}
static __inline__ long __bcdcmple(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_GT_REV, __a, __b);
}
static __inline__ long __bcdcmpge(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_LT_REV, __a, __b);
}
#endif // __POWER8_VECTOR__
#undef __ATTRS_o_ai
#endif /* __ALTIVEC_H */
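
The new BCD helpers above wrap the POWER8 bcdadd./bcdsub. instructions. A minimal usage sketch (not part of the diff), assuming the operands already hold valid signed packed-decimal data in vector unsigned char registers:

// Hedged sketch: validity check, decimal add with overflow test, and an
// equality compare. The third argument to __bcdadd selects the preferred
// sign encoding (0 here).
int bcd_demo(vector unsigned char a, vector unsigned char b,
             vector unsigned char *sum) {
  if (__bcd_invalid(a) || __bcd_invalid(b))
    return -1;                   // operands are not valid packed decimal
  *sum = __bcdadd(a, b, 0);
  if (__bcdadd_ofl(a, b))
    return 1;                    // decimal addition overflowed
  return (int)__bcdcmpeq(a, b);  // 1 if a and b compare equal as decimals
}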

View File

@ -10,6 +10,10 @@
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */

View File

@ -314,7 +314,7 @@ typedef struct __tile1024i_str {
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_loadd(__tile1024i *dst, const void *base,
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
@ -335,7 +335,7 @@ static void __tile_loadd(__tile1024i *dst, const void *base,
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stream_loadd(__tile1024i *dst, const void *base,
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
@ -357,7 +357,7 @@ static void __tile_stream_loadd(__tile1024i *dst, const void *base,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -380,7 +380,7 @@ static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -403,7 +403,7 @@ static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -426,7 +426,7 @@ static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -446,7 +446,8 @@ static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
__tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
@ -459,7 +460,7 @@ static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
/// \param dst
/// The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static void __tile_zero(__tile1024i *dst) {
static __inline__ void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
@ -479,7 +480,7 @@ static void __tile_zero(__tile1024i *dst) {
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
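
Taken together, the __tile1024i wrappers above (now marked __inline__ as well as static) give a C-level AMX workflow. A minimal sketch (not part of the diff), assuming tile configuration has already been set up via the usual ldtilecfg path and building with -mamx-tile -mamx-int8; the sizes and buffers are illustrative placeholders:

#include <immintrin.h>
#include <stddef.h>

// Hedged sketch: C += A * B on int8 tiles using the wrappers from this header.
void amx_matmul(const void *bufA, const void *bufB, void *bufC, size_t stride) {
  __tile1024i a = {16, 64};   // 16 rows x 64 bytes per row
  __tile1024i b = {16, 64};
  __tile1024i c = {16, 64};
  __tile_loadd(&a, bufA, stride);
  __tile_loadd(&b, bufB, stride);
  __tile_zero(&c);
  __tile_dpbssd(&c, a, b);          // signed-byte dot products accumulated into c
  __tile_stored(bufC, stride, c);
}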

View File

@ -730,6 +730,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
#endif
/* Memory Operations Intrinsics */
#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
__builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
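
A minimal sketch of the new MOPS + MTE helper (not part of the diff), assuming a target where both __ARM_FEATURE_MOPS and __ARM_FEATURE_MEMORY_TAGGING are defined and the buffer lives in a tag-enabled memory region; __arm_mte_create_random_tag is the existing MTE intrinsic from this header:

#include <arm_acle.h>
#include <stddef.h>

// Hedged sketch: pick a random tag for buf, then memset the region while
// storing that tag into the corresponding allocation tags.
void *zero_and_tag(void *buf, size_t size) {
  void *tagged = __arm_mte_create_random_tag(buf, 0);
  __arm_mops_memset_tag(tagged, 0, size);
  return tagged;
}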
/* Transactional Memory Extension (TME) Intrinsics */
#if __ARM_FEATURE_TME

9420
lib/include/arm_neon.h vendored

File diff suppressed because it is too large

View File

@ -20,25 +20,25 @@
/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
(__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M))
((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)
{
return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi16(__m256i __a)
{
return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi32(__m256i __a)
{
return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
return (__m256i)__builtin_elementwise_abs((__v8si)__a);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -114,8 +114,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
}
#define _mm256_alignr_epi8(a, b, n) \
(__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n))
((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a, __m256i __b)
@ -149,8 +149,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
}
#define _mm256_blend_epi16(V1, V2, M) \
(__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@ -253,73 +253,73 @@ _mm256_madd_epi16(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
}
static __inline__ int __DEFAULT_FN_ATTRS256
@ -467,13 +467,13 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
}
#define _mm256_shuffle_epi32(a, imm) \
(__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
#define _mm256_shufflehi_epi16(a, imm) \
(__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
#define _mm256_shufflelo_epi16(a, imm) \
(__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi8(__m256i __a, __m256i __b)
@ -494,10 +494,10 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
}
#define _mm256_slli_si256(a, imm) \
(__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
#define _mm256_bslli_epi128(a, imm) \
(__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi16(__m256i __a, int __count)
@ -560,10 +560,10 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
}
#define _mm256_srli_si256(a, imm) \
(__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
#define _mm256_bsrli_epi128(a, imm) \
(__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi16(__m256i __a, int __count)
@ -743,12 +743,12 @@ _mm256_broadcastsi128_si256(__m128i __X)
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
#define _mm_blend_epi32(V1, V2, M) \
(__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M))
((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))
#define _mm256_blend_epi32(V1, V2, M) \
(__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastb_epi8(__m128i __X)
@ -806,7 +806,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
}
#define _mm256_permute4x64_pd(V, M) \
(__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@ -815,17 +815,17 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
}
#define _mm256_permute4x64_epi64(V, M) \
(__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
#define _mm256_permute2x128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
#define _mm256_extracti128_si256(V, M) \
(__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
#define _mm256_inserti128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M))
((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi32(int const *__X, __m256i __M)
@ -936,211 +936,211 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
}
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
(__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s))
(__v2df)(__m128d)(mask), (s)))
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)(__m256d)(mask), (s))
(__v4df)(__m256d)(mask), (s)))
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
(__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s))
(__v2df)(__m128d)(mask), (s)))
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)(__m256d)(mask), (s))
(__v4df)(__m256d)(mask), (s)))
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s))
(__v4sf)(__m128)(mask), (s)))
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)(__m256)(mask), (s))
(__v8sf)(__m256)(mask), (s)))
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s))
(__v4sf)(__m128)(mask), (s)))
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)(__m128)(mask), (s))
(__v4sf)(__m128)(mask), (s)))
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4si)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s))
(__v4si)(__m128i)(mask), (s)))
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
(int const *)(m), \
(__v8si)(__m256i)(i), \
(__v8si)(__m256i)(mask), (s))
(__v8si)(__m256i)(mask), (s)))
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v2di)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s))
(__v4si)(__m128i)(mask), (s)))
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4di)(__m256i)(i), \
(__v4si)(__m128i)(mask), (s))
(__v4si)(__m128i)(mask), (s)))
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s))
(__v2di)(__m128i)(mask), (s)))
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)(__m256i)(mask), (s))
(__v4di)(__m256i)(mask), (s)))
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s))
(__v2di)(__m128i)(mask), (s)))
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)(__m256i)(mask), (s))
(__v4di)(__m256i)(mask), (s)))
#define _mm_i32gather_pd(m, i, s) \
(__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s))
(s)))
#define _mm256_i32gather_pd(m, i, s) \
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s))
(s)))
#define _mm_i64gather_pd(m, i, s) \
(__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s))
(s)))
#define _mm256_i64gather_pd(m, i, s) \
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s))
(s)))
#define _mm_i32gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
(s)))
#define _mm256_i32gather_ps(m, i, s) \
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
_mm256_setzero_ps(), \
_CMP_EQ_OQ), \
(s))
(s)))
#define _mm_i64gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
(s)))
#define _mm256_i64gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
(s)))
#define _mm_i32gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4si)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm256_i32gather_epi32(m, i, s) \
(__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
(int const *)(m), (__v8si)(__m256i)(i), \
(__v8si)_mm256_set1_epi32(-1), (s))
(__v8si)_mm256_set1_epi32(-1), (s)))
#define _mm_i64gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v2di)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm256_i64gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4di)(__m256i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm_i32gather_epi64(m, i, s) \
(__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s))
(__v2di)_mm_set1_epi64x(-1), (s)))
#define _mm256_i32gather_epi64(m, i, s) \
(__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s))
(__v4di)_mm256_set1_epi64x(-1), (s)))
#define _mm_i64gather_epi64(m, i, s) \
(__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s))
(__v2di)_mm_set1_epi64x(-1), (s)))
#define _mm256_i64gather_epi64(m, i, s) \
(__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s))
(__v4di)_mm256_set1_epi64x(-1), (s)))
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
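
The mechanical change running through this file is wrapping every macro expansion in one extra pair of parentheses so the result behaves like a single value of the cast-to type. A hedged illustration (not part of the diff) of why that matters, building with -mavx2:

#include <immintrin.h>

// With the clang-14 expansion, the subscript below applies to the whole
// ((__m256i)(...)) result, extracting a 64-bit lane via the vector-subscript
// extension. With the old expansion the [0] bound to the builtin's raw return
// value before the cast, giving the expression a different, unintended type.
long long blend_low_lane(__m256i a, __m256i b) {
  return _mm256_blend_epi32(a, b, 0x0F)[0];
}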

View File

@ -232,7 +232,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from convertion of __A
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
@ -247,7 +247,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// bit is not set.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from convertion of __A
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
@ -265,7 +265,7 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// A 16-bit mask.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from convertion of __A
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(

View File

@ -178,16 +178,16 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B)
}
#define _kshiftli_mask32(A, I) \
(__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))
((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)))
#define _kshiftri_mask32(A, I) \
(__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))
((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)))
#define _kshiftli_mask64(A, I) \
(__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))
((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)))
#define _kshiftri_mask64(A, I) \
(__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))
((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask32_u32(__mmask32 __A) {
@ -232,44 +232,44 @@ _store_mask64(__mmask64 *__A, __mmask64 __B) {
/* Integer compare */
#define _mm512_cmp_epi8_mask(a, b, p) \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1)
(__mmask64)-1))
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m))
(__mmask64)(m)))
#define _mm512_cmp_epu8_mask(a, b, p) \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1)
(__mmask64)-1))
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m))
(__mmask64)(m)))
#define _mm512_cmp_epi16_mask(a, b, p) \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm512_cmp_epu16_mask(a, b, p) \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm512_cmpeq_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
@ -485,7 +485,7 @@ _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi8 (__m512i __A)
{
return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A);
return (__m512i)__builtin_elementwise_abs((__v64qs)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -507,7 +507,7 @@ _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi16 (__m512i __A)
{
return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A);
return (__m512i)__builtin_elementwise_abs((__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -751,7 +751,7 @@ _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B);
return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -773,7 +773,7 @@ _mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B);
return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -796,7 +796,7 @@ _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B);
return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -818,7 +818,7 @@ _mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B);
return (__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -840,7 +840,7 @@ _mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B);
return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -862,7 +862,7 @@ _mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B);
return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -884,7 +884,7 @@ _mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B);
return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -906,7 +906,7 @@ _mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B);
return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1428,36 +1428,36 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
#define _mm512_shufflehi_epi16(A, imm) \
(__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))
((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)))
#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflehi_epi16((A), \
(imm)), \
(__v32hi)(__m512i)(W))
(__v32hi)(__m512i)(W)))
#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflehi_epi16((A), \
(imm)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shufflelo_epi16(A, imm) \
(__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))
((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)))
#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflelo_epi16((A), \
(imm)), \
(__v32hi)(__m512i)(W))
(__v32hi)(__m512i)(W)))
#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflelo_epi16((A), \
(imm)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi16(__m512i __A, __m512i __B)
@ -1527,7 +1527,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
}
#define _mm512_bslli_epi128(a, imm) \
(__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi16(__m512i __A, __m512i __B)
@ -1664,7 +1664,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
}
#define _mm512_bsrli_epi128(a, imm) \
(__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
@ -1984,32 +1984,32 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
}
#define _mm512_alignr_epi8(A, B, N) \
(__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(N))
((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(N)))
#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
(__v64qi)(__m512i)(W))
(__v64qi)(__m512i)(W)))
#define _mm512_maskz_alignr_epi8(U, A, B, N) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
(__v64qi)(__m512i)_mm512_setzero_si512())
(__v64qi)(__m512i)_mm512_setzero_si512()))
#define _mm512_dbsad_epu8(A, B, imm) \
(__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(imm))
((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(imm)))
#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
(__v32hi)(__m512i)(W))
(__v32hi)(__m512i)(W)))
#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sad_epu8 (__m512i __A, __m512i __B)

View File

@ -121,10 +121,10 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B)
}
#define _kshiftli_mask8(A, I) \
(__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))
((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)))
#define _kshiftri_mask8(A, I) \
(__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))
((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask8_u32(__mmask8 __A) {
@ -342,19 +342,19 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvt_roundpd_epi64(A, R) \
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epu64 (__m512d __A) {
@ -381,19 +381,19 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvt_roundpd_epu64(A, R) \
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epi64 (__m256 __A) {
@ -420,19 +420,19 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvt_roundps_epi64(A, R) \
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epu64 (__m256 __A) {
@ -459,19 +459,19 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvt_roundps_epu64(A, R) \
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512d __DEFAULT_FN_ATTRS512
@ -494,19 +494,19 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepi64_pd(A, R) \
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ps (__m512i __A) {
@ -533,19 +533,19 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepi64_ps(A, R) \
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)(__m256)(W), (__mmask8)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -573,19 +573,19 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvtt_roundpd_epi64(A, R) \
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epu64 (__m512d __A) {
@ -612,19 +612,19 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvtt_roundpd_epu64(A, R) \
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epi64 (__m256 __A) {
@ -651,19 +651,19 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvtt_roundps_epi64(A, R) \
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epu64 (__m256 __A) {
@ -690,19 +690,19 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvtt_roundps_epu64(A, R) \
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
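
All of the *_round_* conversion macros in this block now carry the same outer parentheses; their last argument is still the usual rounding/SAE control. A small usage sketch (not part of the diff), building with -mavx512dq:

#include <immintrin.h>

// Hedged sketch: convert eight doubles to signed 64-bit integers with
// round-to-nearest-even and floating-point exceptions suppressed.
__m512i pd_to_epi64_rne(__m512d v) {
  return _mm512_cvt_roundpd_epi64(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}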
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_pd (__m512i __A) {
@ -724,20 +724,20 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepu64_pd(A, R) \
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m256 __DEFAULT_FN_ATTRS512
@ -765,290 +765,290 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepu64_ps(A, R) \
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)(__m256)(W), (__mmask8)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_range_pd(A, B, C) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_range_pd(W, U, A, B, C) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)(__m512d)(W), (__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_range_pd(U, A, B, C) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_range_round_pd(A, B, C, R) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)(__m512d)(W), (__mmask8)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_range_ps(A, B, C) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_range_ps(W, U, A, B, C) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_range_ps(U, A, B, C) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_range_round_ps(A, B, C, R) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), (int)(R))
(__mmask16)(U), (int)(R)))
#define _mm_range_round_ss(A, B, C, R) \
(__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8) -1, (int)(C),\
(int)(R))
(int)(R)))
#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
(__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W),\
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_range_round_ss(U, A, B, C, R) \
(__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_range_round_sd(A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8) -1, (int)(C),\
(int)(R))
(int)(R)))
#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W),\
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_range_round_sd(U, A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm512_reduce_pd(A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_reduce_pd(W, U, A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_reduce_pd(U, A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_reduce_ps(A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_reduce_ps(W, U, A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)(__m512)(W), \
(__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_reduce_ps(U, A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_reduce_round_pd(A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_reduce_round_ps(A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)(__m512)(W), \
(__mmask16)(U), (int)(R))
(__mmask16)(U), (int)(R)))
#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), (int)(R))
(__mmask16)(U), (int)(R)))
#define _mm_reduce_ss(A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
(int)(C), _MM_FROUND_CUR_DIRECTION)
(int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_mask_reduce_ss(W, U, A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W), (__mmask8)(U), \
(int)(C), _MM_FROUND_CUR_DIRECTION)
(int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_maskz_reduce_ss(U, A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm_reduce_round_ss(A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
(int)(C), (int)(R))
(int)(C), (int)(R)))
#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W), (__mmask8)(U), \
(int)(C), (int)(R))
(int)(C), (int)(R)))
#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C), (int)(R))
(__mmask8)(U), (int)(C), (int)(R)))
#define _mm_reduce_sd(A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(C), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm_mask_reduce_sd(W, U, A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W), (__mmask8)(U), \
(int)(C), _MM_FROUND_CUR_DIRECTION)
(int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_maskz_reduce_sd(U, A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm_reduce_round_sd(A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(C), (int)(R))
(__mmask8)-1, (int)(C), (int)(R)))
#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W), (__mmask8)(U), \
(int)(C), (int)(R))
(int)(C), (int)(R)))
#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C), (int)(R))
(__mmask8)(U), (int)(C), (int)(R)))
static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_movepi32_mask (__m512i __A)
@@ -1218,158 +1218,158 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
}
#define _mm512_extractf32x8_ps(A, imm) \
(__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
(__v8sf)_mm256_undefined_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
(__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
(__v8sf)(__m256)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
(__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_extractf64x2_pd(A, imm) \
(__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
(__v2df)_mm_undefined_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_extracti32x8_epi32(A, imm) \
(__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
(__v8si)_mm256_undefined_si256(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
(__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
(__v8si)(__m256i)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
(__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
(__v8si)_mm256_setzero_si256(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_extracti64x2_epi64(A, imm) \
(__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
(__v2di)_mm_undefined_si128(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
(__v2di)(__m128i)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
(__v2di)_mm_setzero_si128(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_insertf32x8(A, B, imm) \
(__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
(__v8sf)(__m256)(B), (int)(imm))
((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
(__v8sf)(__m256)(B), (int)(imm)))
#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
(__v16sf)(__m512)(W))
(__v16sf)(__m512)(W)))
#define _mm512_maskz_insertf32x8(U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
(__v16sf)_mm512_setzero_ps())
(__v16sf)_mm512_setzero_ps()))
#define _mm512_insertf64x2(A, B, imm) \
(__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
(__v2df)(__m128d)(B), (int)(imm))
((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
(__v2df)(__m128d)(B), (int)(imm)))
#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \
(__v8df)(__m512d)(W))
(__v8df)(__m512d)(W)))
#define _mm512_maskz_insertf64x2(U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \
(__v8df)_mm512_setzero_pd())
(__v8df)_mm512_setzero_pd()))
#define _mm512_inserti32x8(A, B, imm) \
(__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
(__v8si)(__m256i)(B), (int)(imm))
((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
(__v8si)(__m256i)(B), (int)(imm)))
#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \
(__v16si)(__m512i)(W))
(__v16si)(__m512i)(W)))
#define _mm512_maskz_inserti32x8(U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \
(__v16si)_mm512_setzero_si512())
(__v16si)_mm512_setzero_si512()))
#define _mm512_inserti64x2(A, B, imm) \
(__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
(__v2di)(__m128i)(B), (int)(imm))
((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
(__v2di)(__m128i)(B), (int)(imm)))
#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \
(__v8di)(__m512i)(W))
(__v8di)(__m512i)(W)))
#define _mm512_maskz_inserti64x2(U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \
(__v8di)_mm512_setzero_si512())
(__v8di)_mm512_setzero_si512()))
#define _mm512_mask_fpclass_ps_mask(U, A, imm) \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)(U))
((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)(U)))
#define _mm512_fpclass_ps_mask(A, imm) \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)-1)
((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)-1))
#define _mm512_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm512_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_fpclass_sd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_mask_fpclass_sd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm_fpclass_ss_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_mask_fpclass_ss_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U)))
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS

View File

@@ -15,19 +15,19 @@
/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -39,19 +39,19 @@
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R))
(__mmask16)(M), (int)(R)))
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -64,19 +64,19 @@
/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -88,19 +88,19 @@
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R))
(__mmask16)(M), (int)(R)))
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -112,22 +112,22 @@
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -139,22 +139,22 @@
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -167,19 +167,19 @@
/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -191,19 +191,19 @@
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R))
(__mmask16)(M), (int)(R)))
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -215,22 +215,22 @@
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -242,22 +242,22 @@
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

File diff suppressed because it is too large

3349 lib/include/avx512fp16intrin.h vendored Normal file

File diff suppressed because it is too large

View File

@@ -129,88 +129,88 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
}
#define _mm512_shldi_epi64(A, B, I) \
(__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S))
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512())
(__v8di)_mm512_setzero_si512()))
#define _mm512_shldi_epi32(A, B, I) \
(__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S))
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512())
(__v16si)_mm512_setzero_si512()))
#define _mm512_shldi_epi16(A, B, I) \
(__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S))
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shrdi_epi64(A, B, I) \
(__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S))
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512())
(__v8di)_mm512_setzero_si512()))
#define _mm512_shrdi_epi32(A, B, I) \
(__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S))
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512())
(__v16si)_mm512_setzero_si512()))
#define _mm512_shrdi_epi16(A, B, I) \
(__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S))
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)

View File

@@ -420,18 +420,46 @@ static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
return __R[0];
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
return _mm_castsi128_ps(
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
}
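The body above spells out the widening trick: each bfloat16 element is moved into the high half of a 32-bit lane (the extension bits are discarded by the shift), which is exactly where a bfloat16's sign, exponent, and top 7 mantissa bits sit inside an IEEE-754 float. A small usage sketch, assuming a toolchain that ships these intrinsics with AVX512-BF16 and AVX512-VL enabled (e.g. -mavx512bf16 -mavx512vl); _mm_cvtneps_pbh is the pre-existing float-to-bf16 intrinsic:

#include <immintrin.h>
#include <stdio.h>

/* Round-trip floats through bfloat16 and back. These inputs round-trip
   exactly because they fit in bf16's 8-bit exponent and 7-bit mantissa. */
int main(void) {
  float in[4] = {1.0f, -2.5f, 0.15625f, 1024.0f};
  __m128bh bh = _mm_cvtneps_pbh(_mm_loadu_ps(in)); /* float -> bf16 */
  __m128 out = _mm_cvtpbh_ps(bh);                  /* bf16  -> float */
  float res[4];
  _mm_storeu_ps(res, out);
  for (int i = 0; i < 4; ++i)
    printf("%g -> %g\n", in[i], res[i]);
  return 0;
}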
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from convertion of __A
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
@@ -441,13 +469,33 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from convertion of __A
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
16));
}
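A short sketch contrasting the zeroing (_mm_maskz_cvtpbh_ps) and merging (_mm_mask_cvtpbh_ps) forms added above, under the same assumed flags (-mavx512bf16 -mavx512vl):

#include <immintrin.h>
#include <stdio.h>

/* Mask bit set:   lane converted from the bf16 input.
   Mask bit clear: zeroed by the maskz form, taken from the source vector
   by the mask form (it merges via _mm_mask_slli_epi32, as shown above). */
int main(void) {
  float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  __m128bh bh = _mm_cvtneps_pbh(_mm_loadu_ps(in));
  __m128 src = _mm_set1_ps(-1.0f);
  __mmask8 k = 0x5; /* select lanes 0 and 2 */
  float z[4], m[4];
  _mm_storeu_ps(z, _mm_maskz_cvtpbh_ps(k, bh));
  _mm_storeu_ps(m, _mm_mask_cvtpbh_ps(src, k, bh));
  for (int i = 0; i < 4; ++i)
    printf("lane %d: maskz=%g mask=%g\n", i, z[i], m[i]);
  /* expected: maskz = 1 0 3 0 and mask = 1 -1 3 -1 */
  return 0;
}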
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
@@ -460,7 +508,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from convertion of __A
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(

View File

@@ -21,84 +21,84 @@
/* Integer compare */
#define _mm_cmp_epi8_mask(a, b, p) \
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm_cmp_epu8_mask(a, b, p) \
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm256_cmp_epi8_mask(a, b, p) \
(__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm256_cmp_epu8_mask(a, b, p) \
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm_cmp_epi16_mask(a, b, p) \
(__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)(m))
(__mmask8)(m)))
#define _mm_cmp_epu16_mask(a, b, p) \
(__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)(m))
(__mmask8)(m)))
#define _mm256_cmp_epi16_mask(a, b, p) \
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm256_cmp_epu16_mask(a, b, p) \
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm_cmpeq_epi8_mask(A, B) \
_mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
@@ -1821,46 +1821,46 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflehi_epi16((A), (imm)), \
(__v8hi)(__m128i)(W))
(__v8hi)(__m128i)(W)))
#define _mm_maskz_shufflehi_epi16(U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflehi_epi16((A), (imm)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
(__v16hi)(__m256i)(W))
(__v16hi)(__m256i)(W)))
#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflelo_epi16((A), (imm)), \
(__v8hi)(__m128i)(W))
(__v8hi)(__m128i)(W)))
#define _mm_maskz_shufflelo_epi16(U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflelo_epi16((A), (imm)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflelo_epi16((A), \
(imm)), \
(__v16hi)(__m256i)(W))
(__v16hi)(__m256i)(W)))
#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflelo_epi16((A), \
(imm)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sllv_epi16(__m256i __A, __m256i __B)
@@ -2756,52 +2756,52 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
}
#define _mm_mask_alignr_epi8(W, U, A, B, N) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
(__v16qi)(__m128i)(W))
(__v16qi)(__m128i)(W)))
#define _mm_maskz_alignr_epi8(U, A, B, N) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
(__v16qi)_mm_setzero_si128())
(__v16qi)_mm_setzero_si128()))
#define _mm256_mask_alignr_epi8(W, U, A, B, N) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
(__v32qi)(__m256i)(W))
(__v32qi)(__m256i)(W)))
#define _mm256_maskz_alignr_epi8(U, A, B, N) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
(__v32qi)_mm256_setzero_si256())
(__v32qi)_mm256_setzero_si256()))
#define _mm_dbsad_epu8(A, B, imm) \
(__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm))
((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm)))
#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
(__v8hi)(__m128i)(W))
(__v8hi)(__m128i)(W)))
#define _mm_maskz_dbsad_epu8(U, A, B, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_dbsad_epu8(A, B, imm) \
(__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm))
((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm)))
#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)(__m256i)(W))
(__v16hi)(__m256i)(W)))
#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

View File

@@ -773,134 +773,134 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
}
#define _mm_range_pd(A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_range_pd(W, U, A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_maskz_range_pd(U, A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_range_pd(A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_range_pd(W, U, A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \
(__v4df)(__m256d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_range_pd(U, A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_range_ps(A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_range_ps(W, U, A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \
(__v4sf)(__m128)(W), (__mmask8)(U))
(__v4sf)(__m128)(W), (__mmask8)(U)))
#define _mm_maskz_range_ps(U, A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_range_ps(A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_range_ps(W, U, A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \
(__v8sf)(__m256)(W), (__mmask8)(U))
(__v8sf)(__m256)(W), (__mmask8)(U)))
#define _mm256_maskz_range_ps(U, A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_reduce_pd(A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_reduce_pd(W, U, A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_maskz_reduce_pd(U, A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_reduce_pd(A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_reduce_pd(W, U, A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)(__m256d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_reduce_pd(U, A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_reduce_ps(A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_reduce_ps(W, U, A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)(__m128)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_maskz_reduce_ps(U, A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_reduce_ps(A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_reduce_ps(W, U, A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)(__m256)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_reduce_ps(U, A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
_mm_movepi32_mask (__m128i __A)
@@ -1066,100 +1066,100 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm256_extractf64x2_pd(A, imm) \
(__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
(__v2df)_mm_undefined_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_extracti64x2_epi64(A, imm) \
(__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
(__v2di)_mm_undefined_si128(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
(__v2di)(__m128i)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
(__v2di)_mm_setzero_si128(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_insertf64x2(A, B, imm) \
(__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
(__v2df)(__m128d)(B), (int)(imm))
((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
(__v2df)(__m128d)(B), (int)(imm)))
#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \
(__v4df)(__m256d)(W))
(__v4df)(__m256d)(W)))
#define _mm256_maskz_insertf64x2(U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \
(__v4df)_mm256_setzero_pd())
(__v4df)_mm256_setzero_pd()))
#define _mm256_inserti64x2(A, B, imm) \
(__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
(__v2di)(__m128i)(B), (int)(imm))
((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
(__v2di)(__m128i)(B), (int)(imm)))
#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \
(__v4di)(__m256i)(W))
(__v4di)(__m256i)(W)))
#define _mm256_maskz_inserti64x2(U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \
(__v4di)_mm256_setzero_si256())
(__v4di)_mm256_setzero_si256()))
#define _mm_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm256_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_mask_fpclass_ps_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm_fpclass_ps_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1))
#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm256_fpclass_ps_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)-1))
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

2068 lib/include/avx512vlfp16intrin.h vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -239,172 +239,172 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
}
#define _mm256_shldi_epi64(A, B, I) \
(__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S))
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256())
(__v4di)_mm256_setzero_si256()))
#define _mm_shldi_epi64(A, B, I) \
(__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S))
(__v2di)(__m128i)(S)))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128())
(__v2di)_mm_setzero_si128()))
#define _mm256_shldi_epi32(A, B, I) \
(__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S))
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256())
(__v8si)_mm256_setzero_si256()))
#define _mm_shldi_epi32(A, B, I) \
(__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S))
(__v4si)(__m128i)(S)))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128())
(__v4si)_mm_setzero_si128()))
#define _mm256_shldi_epi16(A, B, I) \
(__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S))
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#define _mm_shldi_epi16(A, B, I) \
(__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S))
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_shrdi_epi64(A, B, I) \
(__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S))
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256())
(__v4di)_mm256_setzero_si256()))
#define _mm_shrdi_epi64(A, B, I) \
(__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S))
(__v2di)(__m128i)(S)))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128())
(__v2di)_mm_setzero_si128()))
#define _mm256_shrdi_epi32(A, B, I) \
(__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S))
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256())
(__v8si)_mm256_setzero_si256()))
#define _mm_shrdi_epi32(A, B, I) \
(__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S))
(__v4si)(__m128i)(S)))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128())
(__v4si)_mm_setzero_si128()))
#define _mm256_shrdi_epi16(A, B, I) \
(__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S))
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#define _mm_shrdi_epi16(A, B, I) \
(__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S))
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
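A quick usage sketch for the funnel-shift macros above (not part of the upstream diff): with both inputs equal, the concatenate-and-shift-left form acts as a per-lane rotate. This assumes an x86-64 target built with -mavx512vl -mavx512vbmi2 and a constant shift count.

#include <immintrin.h>

/* Hypothetical helper: each 32-bit lane becomes rotl32(lane, 5), because the
   high half of ((x:x) << 5) is a left rotate by 5. */
static inline __m128i rotl32_by_5(__m128i x) {
  return _mm_shldi_epi32(x, x, 5);
}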
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)

View File

@ -36,7 +36,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@ -56,7 +56,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -74,7 +74,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -92,7 +92,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@ -112,7 +112,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@ -132,7 +132,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -150,7 +150,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -168,7 +168,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
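A usage sketch for the dot-product macros above (not part of the diff); these appear to be the AVX512-VNNI/VL forms, so the example assumes a target built with -mavx512vnni -mavx512vl.

#include <immintrin.h>

/* Hypothetical helper: each 32-bit lane of acc gains the sum of four
   unsigned-8-bit x signed-8-bit products taken from u8s and s8s. */
static inline __m256i dot_accumulate(__m256i acc, __m256i u8s, __m256i s8s) {
  return _mm256_dpbusd_epi32(acc, u8s, s8s);
}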
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)

View File

@ -400,7 +400,7 @@ _mm256_rcp_ps(__m256 __a)
/// 11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
/// Rounds the values stored in a 256-bit vector of [8 x float] as
/// specified by the byte operand. The source values are rounded to integer
@ -432,7 +432,7 @@ _mm256_rcp_ps(__m256 __a)
/// 11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
/// source values are rounded up to integer values and returned as 64-bit
@ -989,7 +989,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
(__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
/// Copies the values in a 256-bit vector of [4 x double] as specified by
/// the immediate integer operand.
@ -1029,7 +1029,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
(__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
/// Copies the values in a 128-bit vector of [4 x float] as specified by
/// the immediate integer operand.
@ -1085,7 +1085,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
(__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
/// Copies the values in a 256-bit vector of [8 x float] as specified by
/// the immediate integer operand.
@ -1177,7 +1177,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
(__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [4 x double], as specified by the immediate integer operand.
@ -1217,8 +1217,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M))
((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M)))
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [8 x float], as specified by the immediate integer operand.
@ -1258,8 +1258,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M))
((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M)))
/// Permutes 128-bit data values stored in two 256-bit integer vectors,
/// as specified by the immediate integer operand.
@ -1298,8 +1298,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))
/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
@ -1327,8 +1327,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
(__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M))
((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M)))
/// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate
@ -1355,8 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
(__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M))
((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M)))
/// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@ -1453,8 +1453,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M))
((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M)))
/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
@ -1507,8 +1507,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
(__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (int)(mask))
((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (int)(mask)))
/// Selects four double-precision values from the 256-bit operands of
/// [4 x double], as specified by the immediate value operand.
@ -1553,8 +1553,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
(__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (int)(mask))
((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (int)(mask)))
/* Compare */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
@ -1647,8 +1647,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c))
((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)))
/// Compares each of the corresponding values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer
@ -1707,8 +1707,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c))
((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)))
/// Compares each of the corresponding double-precision values of two
/// 256-bit vectors of [4 x double], using the operation specified by the
@ -1767,8 +1767,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c))
((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c)))
/// Compares each of the corresponding values of two 256-bit vectors of
/// [8 x float], using the operation specified by the immediate integer
@ -1827,8 +1827,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c))
((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c)))
/// Compares each of the corresponding scalar double-precision values of
/// two 128-bit vectors of [2 x double], using the operation specified by the
@ -1886,8 +1886,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c))
((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)))
/// Compares each of the corresponding scalar values of two 128-bit
/// vectors of [4 x float], using the operation specified by the immediate
@ -1945,8 +1945,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c))
((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)))
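A usage sketch for the comparison macros (not part of the diff), assuming an x86 target built with -mavx; _CMP_LT_OQ is one of the predicate constants this header defines.

#include <immintrin.h>

/* Hypothetical helper: returns an 8-bit mask with bit i set where a[i] < b[i]. */
static inline int lanes_less_than(__m256 a, __m256 b) {
  __m256 lt = _mm256_cmp_ps(a, b, _CMP_LT_OQ); /* all-ones where a < b */
  return _mm256_movemask_ps(lt);
}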
/// Takes a [8 x i32] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -1964,7 +1964,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 32 bits of extended
/// packed data.
#define _mm256_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
/// Takes a [16 x i16] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -1982,8 +1982,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
/// packed data.
#define _mm256_extract_epi16(X, N) \
(int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
(int)(N))
((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
(int)(N)))
/// Takes a [32 x i8] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -2001,8 +2001,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
/// packed data.
#define _mm256_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
(int)(N))
((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
(int)(N)))
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
@ -2021,7 +2021,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 64-bit integer containing the extracted 64 bits of extended
/// packed data.
#define _mm256_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
/// Takes a [8 x i32] vector and replaces the vector element value
@ -2043,8 +2043,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi32(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N)))
/// Takes a [16 x i16] vector and replaces the vector element value
@ -2066,8 +2066,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi16(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N)))
/// Takes a [32 x i8] vector and replaces the vector element value
/// indexed by the immediate constant operand with a new value. Returns the
@ -2088,8 +2088,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi8(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N)))
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
@ -2111,8 +2111,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi64(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N)))
#endif
/* Conversion */
@ -4592,8 +4592,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then
@ -4630,8 +4630,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing
@ -4668,8 +4668,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
(__v4si)(__m128i)(V2), (int)(M))
((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))
/*
Vector extract.
@ -4698,7 +4698,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
(__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
/// of [4 x double], as determined by the immediate integer parameter, and
@ -4722,7 +4722,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
(__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
/// Extracts either the upper or the lower 128 bits from a 256-bit
/// integer vector, as determined by the immediate integer parameter, and
@ -4746,177 +4746,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
(__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
/* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [8 x float] containing the
/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
}
/// Loads two 128-bit floating-point vectors of [2 x double] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [4 x double] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [4 x double] containing the
/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
}
/// Loads two 128-bit integer vectors from unaligned memory locations and
/// constructs a 256-bit integer vector by concatenating the two 128-bit
/// vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[255:128] of the result. The
/// address of the memory location does not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[127:0] of the result. The
/// address of the memory location does not have to be aligned.
/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
{
__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
}
/* SIMD store ops (unaligned) */
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [8 x float] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [8 x float].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
__m128 __v128;
__v128 = _mm256_castps256_ps128(__a);
_mm_storeu_ps(__addr_lo, __v128);
__v128 = _mm256_extractf128_ps(__a, 1);
_mm_storeu_ps(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [4 x double] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [4 x double].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
__m128d __v128;
__v128 = _mm256_castpd256_pd128(__a);
_mm_storeu_pd(__addr_lo, __v128);
__v128 = _mm256_extractf128_pd(__a, 1);
_mm_storeu_pd(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit integer vector into
/// two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
{
__m128i __v128;
__v128 = _mm256_castsi256_si128(__a);
_mm_storeu_si128(__addr_lo, __v128);
__v128 = _mm256_extractf128_si256(__a, 1);
_mm_storeu_si128(__addr_hi, __v128);
}
((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
/// Constructs a 256-bit floating-point vector of [8 x float] by
/// concatenating two 128-bit floating-point vectors of [4 x float].
@ -5047,6 +4877,173 @@ _mm256_setr_m128i (__m128i __lo, __m128i __hi)
return (__m256i)_mm256_set_m128i(__hi, __lo);
}
/* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [8 x float] containing the
/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
}
/// Loads two 128-bit floating-point vectors of [2 x double] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [4 x double] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [4 x double] containing the
/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
}
/// Loads two 128-bit integer vectors from unaligned memory locations and
/// constructs a 256-bit integer vector by concatenating the two 128-bit
/// vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[255:128] of the result. The
/// address of the memory location does not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[127:0] of the result. The
/// address of the memory location does not have to be aligned.
/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
{
return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
}
/* SIMD store ops (unaligned) */
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [8 x float] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [8 x float].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
__m128 __v128;
__v128 = _mm256_castps256_ps128(__a);
_mm_storeu_ps(__addr_lo, __v128);
__v128 = _mm256_extractf128_ps(__a, 1);
_mm_storeu_ps(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [4 x double] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [4 x double].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
__m128d __v128;
__v128 = _mm256_castpd256_pd128(__a);
_mm_storeu_pd(__addr_lo, __v128);
__v128 = _mm256_extractf128_pd(__a, 1);
_mm_storeu_pd(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit integer vector into
/// two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
{
__m128i __v128;
__v128 = _mm256_castsi256_si128(__a);
_mm_storeu_si128(__addr_lo, __v128);
__v128 = _mm256_extractf128_si256(__a, 1);
_mm_storeu_si128(__addr_hi, __v128);
}
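A usage sketch for the relocated unaligned load/store helpers (not part of the diff), assuming an x86 target built with -mavx.

#include <immintrin.h>

/* Hypothetical helper: load two unaligned 128-bit blocks, scale them as one
   256-bit vector, and store the halves back to their original locations. */
static void scale_two_blocks(float *hi, float *lo, float s) {
  __m256 v = _mm256_loadu2_m128(hi, lo);
  v = _mm256_mul_ps(v, _mm256_set1_ps(s));
  _mm256_storeu2_m128(hi, lo, v);
}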
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128

View File

@ -42,10 +42,20 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
unsigned int t;
return __builtin_ia32_rdsspd(t);
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
return __builtin_ia32_rdsspq(__a);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
unsigned long long t;
return __builtin_ia32_rdsspq(t);
}
#endif /* __x86_64__ */
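A usage sketch for the new zero-argument shadow-stack readers (not part of the diff), assuming an x86-64 target built with -mshstk. When CET shadow stacks are not active, RDSSP is a no-op and the result is unspecified, which appears to be why the sources above deliberately pass an uninitialized value.

#include <immintrin.h>

/* Hypothetical helper: read the current shadow-stack pointer. */
static unsigned long long current_ssp(void) {
  return _rdsspq_i64();
}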
#ifdef __x86_64__

3
lib/include/cpuid.h vendored
View File

@ -195,11 +195,12 @@
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_AVXVNNI 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_HRESET 0x00400000
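A usage sketch for the corrected bit_AVXVNNI value (not part of the diff): AVX-VNNI is reported in EAX of CPUID leaf 7, sub-leaf 1, bit 4 (0x10). A hypothetical feature check:

#include <cpuid.h>

static int cpu_has_avxvnni(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (eax & bit_AVXVNNI) != 0;
}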

100
lib/include/crc32intrin.h vendored Normal file
View File

@ -0,0 +1,100 @@
/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CRC32INTRIN_H
#define __CRC32INTRIN_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("crc32")))
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __CRC32INTRIN_H */
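A usage sketch for the new header (not part of it), assuming an x86 target built with -mcrc32 or -msse4.2.

#include <stddef.h>
#include <immintrin.h>

/* Hypothetical helper: fold a byte buffer into a running CRC-32C value. */
static unsigned int crc32c_bytes(unsigned int crc, const unsigned char *p,
                                 size_t n) {
  for (size_t i = 0; i < n; ++i)
    crc = _mm_crc32_u8(crc, p[i]);
  return crc;
}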

View File

@ -10,6 +10,10 @@
#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <xmmintrin.h>
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
@ -2371,7 +2375,7 @@ _mm_madd_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
}
/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
@ -2391,7 +2395,7 @@ _mm_max_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
}
/// Compares corresponding elements of two 128-bit signed [8 x i16]
@ -2411,7 +2415,7 @@ _mm_max_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
}
/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
@ -2431,7 +2435,7 @@ _mm_min_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
}
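A usage sketch (not part of the diff): switching to the elementwise builtins keeps the observable behavior, so existing idioms such as a per-byte clamp still work. Assumes an x86 target built with -msse2.

#include <emmintrin.h>

/* Hypothetical helper: saturate each unsigned byte of v into [lo, hi]. */
static inline __m128i clamp_u8(__m128i v, __m128i lo, __m128i hi) {
  return _mm_min_epu8(_mm_max_epu8(v, lo), hi);
}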
/// Multiplies the corresponding elements of two signed [8 x i16]
@ -2818,10 +2822,10 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) \
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#define _mm_bslli_si128(a, imm) \
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
/// Left-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
@ -3035,10 +3039,10 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#define _mm_bsrli_si128(a, imm) \
(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
/// Right-shifts each of 16-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
@ -4356,8 +4360,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
(int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm))
((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm)))
/// Constructs a 128-bit integer vector by first making a copy of the
/// 128-bit integer vector parameter, and then inserting the lower 16 bits
@ -4380,8 +4384,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// lower 16 bits of \a __b are written.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
(__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm))
((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm)))
/// Copies the values of the most significant bits from each 8-bit
/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
@ -4430,7 +4434,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4460,7 +4464,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
(__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4490,7 +4494,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
(__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@ -4844,8 +4848,8 @@ _mm_movemask_pd(__m128d __a)
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
(__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
(int)(i))
((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
(int)(i)))
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
/// floating-point vector of [4 x float].

View File

@ -66,8 +66,8 @@ _cvtsh_ss(unsigned short __a)
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0])
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]))
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
@ -93,7 +93,7 @@ _cvtsh_ss(unsigned short __a)
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
@ -136,7 +136,7 @@ _mm_cvtph_ps(__m128i __a)
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
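A usage sketch for the F16C conversion macros (not part of the diff), assuming an x86 target built with -mf16c.

#include <immintrin.h>

/* Hypothetical helper: round-trip four floats through half precision. */
static inline __m128 roundtrip_f16(__m128 v) {
  __m128i h = _mm_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return _mm_cvtph_ps(h);
}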
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].

21
lib/include/float.h vendored
View File

@ -14,10 +14,11 @@
* additional definitions provided for Windows.
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
*
* Also fall back on Darwin to allow additional definitions and
* Also fall back on Darwin and AIX to allow additional definitions and
* implementation-defined values.
*/
#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
#if (defined(__APPLE__) || defined(__MINGW32__) || defined(_MSC_VER) || \
defined(_AIX)) && \
__STDC_HOSTED__ && __has_include_next(<float.h>)
/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
@ -37,7 +38,9 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201103L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@ -64,7 +67,9 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201703L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@ -87,7 +92,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201103L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@ -123,7 +130,9 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201703L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__

View File

@ -28,14 +28,14 @@
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I))
(char)(I)))
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I))
(char)(I)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
@ -46,14 +46,14 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
#ifdef __AVXINTRIN_H
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I))
(char)(I)))
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I))
(char)(I)))
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
@ -65,31 +65,31 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
#ifdef __AVX512BWINTRIN_H
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I))
(char)(I)))
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S))
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I))
(char)(I)))
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S))
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
@ -117,39 +117,38 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
#ifdef __AVX512VLBWINTRIN_H
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S))
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I)
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S))
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S))
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I)
_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S))
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128

View File

@ -8003,17 +8003,6 @@
#define Q6_P_vtrunohb_PP __builtin_HEXAGON_S6_vtrunohb_ppp
#endif /* __HEXAGON_ARCH___ >= 62 */
#if __HEXAGON_ARCH__ >= 62
/* ==========================================================================
Assembly Syntax: Vd32=vmem(Rt32):nt
C Intrinsic Prototype: HVX_Vector Q6_V_vmem_R_nt(Word32 Rt)
Instruction Type: MAPPING
Execution Slots: SLOT0123
========================================================================== */
#define Q6_V_vmem_R_nt __builtin_HEXAGON_V6_ldntnt0
#endif /* __HEXAGON_ARCH___ >= 62 */
#if __HEXAGON_ARCH__ >= 65
/* ==========================================================================
Assembly Syntax: Pd4=!any8(vcmpb.eq(Rss32,Rtt32))

View File

@ -1177,37 +1177,6 @@ private:
#endif /* __cplusplus */
// V65 Silver types
#if __Q6S_ARCH__ >= 65
// Silver vector types are 128 bytes, and pairs are 256. The vector predicate
// types are 16 bytes and 32 bytes for pairs.
typedef long HEXAGON_VecPred128 __attribute__((__vector_size__(16)))
__attribute__((aligned(128)));
typedef long HEXAGON_VecPred256 __attribute__((__vector_size__(32)))
__attribute__((aligned(128)));
typedef long HEXAGON_Vect1024 __attribute__((__vector_size__(128)))
__attribute__((aligned(128)));
typedef long HEXAGON_Vect2048 __attribute__((__vector_size__(256)))
__attribute__((aligned(256)));
typedef long HEXAGON_UVect1024 __attribute__((__vector_size__(128)))
__attribute__((aligned(4)));
typedef long HEXAGON_UVect2048 __attribute__((__vector_size__(256)))
__attribute__((aligned(4)));
#define Q6S_VectorPredPair HEXAGON_VecPred256
#define Q6S_VectorPred HEXAGON_VecPred128
#define Q6S_Vector HEXAGON_Vect1024
#define Q6S_VectorPair HEXAGON_Vect2048
#define Q6S_UVector HEXAGON_UVect1024
#define Q6S_UVectorPair HEXAGON_UVect2048
#else /* __Q6S_ARCH__ >= 65 */
// V65 Vector types
#if __HVX_ARCH__ >= 65
#if defined __HVX__ && (__HVX_LENGTH__ == 128)
@ -1256,7 +1225,6 @@ private:
#endif /* defined __HVX__ && (__HVX_LENGTH__ == 64) */
#endif /* defined __HVX__ && (__HVX_LENGTH__ == 128) */
#endif /* __HVX_ARCH__ >= 65 */
#endif /* __Q6S_ARCH__ >= 65 */
/* Predicates */

File diff suppressed because it is too large

View File

@ -16,7 +16,7 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
@ -282,7 +282,7 @@ _castu64_f64(unsigned long long __A) {
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32b(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
@ -303,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D)
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32w(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
@ -324,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D)
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32d(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
@ -346,7 +346,7 @@ __crc32d(unsigned int __C, unsigned int __D)
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
__crc32q(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
@ -435,7 +435,7 @@ __rorq(unsigned long long __X, int __C) {
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CAST
#undef __DEFAULT_FN_ATTRS_SSE42
#undef __DEFAULT_FN_ATTRS_CRC32
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#endif /* __IA32INTRIN_H */
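A minimal usage sketch for the intrinsics retargeted above (function and buffer names are illustrative; assumes clang 14 built with either -mcrc32, matching the new attribute, or -msse4.2, which implies it):

#include <x86intrin.h>
#include <stddef.h>

static unsigned int crc32c_bytes(const unsigned char *buf, size_t len) {
  unsigned int crc = 0xFFFFFFFFu;        /* customary initial value */
  for (size_t i = 0; i < len; ++i)
    crc = __crc32b(crc, buf[i]);         /* accumulate CRC-32C byte by byte */
  return crc ^ 0xFFFFFFFFu;
}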

View File

@ -10,6 +10,10 @@
#ifndef __IMMINTRIN_H
#define __IMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <x86gprintrin.h>
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
@ -210,6 +214,20 @@
#include <avx512pfintrin.h>
#endif
/*
* FIXME: The _Float16 type is legal only when the hardware supports float16
* operations. We use __AVX512FP16__ to detect whether float16 is supported;
* when it is not, the related header is not included.
*
*/
#if defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif
#if defined(__AVX512FP16__) && defined(__AVX512VL__)
#include <avx512vlfp16intrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512BF16__)
#include <avx512bf16intrin.h>
@ -525,13 +543,13 @@ extern "C" {
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
@ -539,13 +557,13 @@ _InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
@ -557,7 +575,7 @@ _InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
@ -565,7 +583,7 @@ _InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
@ -575,7 +593,7 @@ _InterlockedCompareExchange_HLERelease(long volatile *_Destination,
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
@ -583,7 +601,7 @@ _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
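A minimal sketch of the {att|intel} dialect-alternative syntax the hunks above switch to: the text before the '|' is emitted for AT&T output, the text after for -masm=intel, so the same header assembles under both dialects (the helper name is illustrative):

static inline long swap_long(long volatile *p, long v) {
  __asm__ __volatile__("xchg {%0, %1|%1, %0}"   /* AT&T form | Intel form */
                       : "+r"(v), "+m"(*p)
                       :
                       : "memory");
  return v;
}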

lib/include/intrin.h vendored
View File

@ -97,8 +97,9 @@ unsigned long __readcr8(void);
unsigned int __readdr(unsigned int);
#ifdef __i386__
unsigned char __readfsbyte(unsigned long);
unsigned __int64 __readfsqword(unsigned long);
unsigned short __readfsword(unsigned long);
unsigned long __readfsdword(unsigned long);
unsigned __int64 __readfsqword(unsigned long);
#endif
unsigned __int64 __readmsr(unsigned long);
unsigned __int64 __readpmc(unsigned long);
@ -149,10 +150,8 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
void __attribute__((__deprecated__(
"use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void);
void __attribute__((__deprecated__(
"use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void);
void _ReadBarrier(void);
void _ReadWriteBarrier(void);
unsigned int _rorx_u32(unsigned int, const unsigned int);
int _sarx_i32(int, unsigned int);
#if __STDC_HOSTED__
@ -163,8 +162,7 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
void _Store_HLERelease(long volatile *, long);
void _Store64_HLERelease(__int64 volatile *, __int64);
void _StorePointer_HLERelease(void *volatile *, void *);
void __attribute__((__deprecated__(
"use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void);
void _WriteBarrier(void);
unsigned __int32 xbegin(void);
void _xend(void);
@ -457,7 +455,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
"rep movsb\n"
"xchg {%%esi, %1|%1, esi}"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
@ -467,12 +467,14 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
unsigned long const *__src,
size_t __n) {
#if defined(__x86_64__)
__asm__ __volatile__("rep movsl"
__asm__ __volatile__("rep movs{l|d}"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
"rep movs{l|d}\n"
"xchg {%%esi, %1|%1, esi}"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
@ -487,7 +489,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
"rep movsw\n"
"xchg {%%esi, %1|%1, esi}"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
@ -496,7 +500,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
unsigned long __x,
size_t __n) {
__asm__ __volatile__("rep stosl"
__asm__ __volatile__("rep stos{l|d}"
: "+D"(__dst), "+c"(__n)
: "a"(__x)
: "memory");
@ -538,9 +542,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("xchgq %%rbx,%q1\n" \
__asm("xchg{q} {%%rbx, %q1|%q1, rbx}\n" \
"cpuid\n" \
"xchgq %%rbx,%q1" \
"xchg{q} {%%rbx, %q1|%q1, rbx}" \
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
@ -600,13 +604,17 @@ __readmsr(unsigned long __register) {
static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
unsigned __LPTRINT_TYPE__ __cr3_val;
__asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
__asm__ __volatile__(
"mov {%%cr3, %0|%0, cr3}"
: "=r"(__cr3_val)
:
: "memory");
return __cr3_val;
}
static __inline__ void __DEFAULT_FN_ATTRS
__writecr3(unsigned __INTPTR_TYPE__ __cr3_val) {
__asm__ ("mov %0, %%cr3" : : "r"(__cr3_val) : "memory");
__asm__ ("mov {%0, %%cr3|cr3, %0}" : : "r"(__cr3_val) : "memory");
}
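A hedged usage sketch for the MS-compatible string intrinsics rewritten above, each of which expands to a single rep movs/rep stos now emitted correctly for both asm dialects (assumes an MSVC-compatible target where <intrin.h> provides them; the wrapper name is illustrative):

#include <intrin.h>
#include <stddef.h>

static void fill_and_copy(unsigned long *dst, unsigned long *tmp,
                          unsigned long value, size_t count) {
  __stosd(dst, value, count);   /* rep stosd: dst[0..count) = value  */
  __movsd(tmp, dst, count);     /* rep movsd: copy count dwords      */
}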
#ifdef __cplusplus

View File

@ -99,7 +99,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
}
/// Wrap a 128-bit AES key from __key into a key handle and output in
/// ((__m128i*)__h) to ((__m128i*)__h) + 5 and a 32-bit value as return.
/// ((__m128i*)__h) to ((__m128i*)__h) + 2 and a 32-bit value as return.
/// The explicit source operand __htype specifies handle restrictions.
///
/// \headerfile <x86intrin.h>
@ -120,9 +120,6 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
/// MEM[__h+127:__h] := Handle[127:0] // AAD
/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
/// MEM[__h+511:__h+384] := 0 // Reserved for future usage
/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
/// OF := 0
/// SF := 0
/// ZF := 0
@ -136,7 +133,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
}
/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and
/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 3 and
/// a 32-bit value as return.
/// The explicit source operand __htype specifies handle restrictions.
///
@ -160,9 +157,6 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
/// MEM[__h+895:__h+768] := 0 // Reserved for future usage
/// OF := 0
/// SF := 0
/// ZF := 0
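A hedged sketch matching the corrected documentation above: the wrapped handle for a 128-bit key occupies three 128-bit slots, so the output buffer needs 48 bytes (assumes -mkl and Key Locker hardware; names are illustrative):

#include <immintrin.h>

static unsigned int wrap_aes128_key(__m128i key, __m128i handle_out[3]) {
  return _mm_encodekey128_u32(/* no handle restrictions */ 0u, key, handle_out);
}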

lib/include/limits.h vendored
View File

@ -62,6 +62,26 @@
#define CHAR_BIT __CHAR_BIT__
/* C2x 5.2.4.2.1 */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define BOOL_WIDTH __BOOL_WIDTH__
#define CHAR_WIDTH CHAR_BIT
#define SCHAR_WIDTH CHAR_BIT
#define UCHAR_WIDTH CHAR_BIT
#define USHRT_WIDTH __SHRT_WIDTH__
#define SHRT_WIDTH __SHRT_WIDTH__
#define UINT_WIDTH __INT_WIDTH__
#define INT_WIDTH __INT_WIDTH__
#define ULONG_WIDTH __LONG_WIDTH__
#define LONG_WIDTH __LONG_WIDTH__
#define ULLONG_WIDTH __LLONG_WIDTH__
#define LLONG_WIDTH __LLONG_WIDTH__
#define BITINT_MAXWIDTH __BITINT_MAXWIDTH__
#endif
#ifdef __CHAR_UNSIGNED__ /* -funsigned-char */
#define CHAR_MIN 0
#define CHAR_MAX UCHAR_MAX
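A small sketch of the new width macros added above; they are only exposed in C2x mode (e.g. clang -std=c2x), so portable code should guard for them:

#include <limits.h>
#include <stdio.h>

int main(void) {
#ifdef INT_WIDTH
  printf("int: %d bits, long long: %d bits\n", INT_WIDTH, LLONG_WIDTH);
#else
  puts("*_WIDTH macros require C2x");
#endif
  return 0;
}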

View File

@ -10,6 +10,10 @@
#ifndef __MMINTRIN_H
#define __MMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));

View File

@ -10,6 +10,10 @@
#ifndef __NMMINTRIN_H
#define __NMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
just include it now then. */
#include <smmintrin.h>

View File

@ -12,8 +12,8 @@
// Define extension macros
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// For SPIR all extensions are supported.
#if defined(__SPIR__)
// For SPIR and SPIR-V all extensions are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define cl_khr_subgroup_extended_types 1
#define cl_khr_subgroup_non_uniform_vote 1
#define cl_khr_subgroup_ballot 1
@ -25,12 +25,31 @@
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
#define cl_ext_float_atomics 1
#ifdef cl_khr_fp16
#define __opencl_c_ext_fp16_global_atomic_load_store 1
#define __opencl_c_ext_fp16_local_atomic_load_store 1
#define __opencl_c_ext_fp16_global_atomic_add 1
#define __opencl_c_ext_fp16_local_atomic_add 1
#define __opencl_c_ext_fp16_global_atomic_min_max 1
#define __opencl_c_ext_fp16_local_atomic_min_max 1
#endif
#ifdef cl_khr_fp64
#define __opencl_c_ext_fp64_global_atomic_add 1
#define __opencl_c_ext_fp64_local_atomic_add 1
#define __opencl_c_ext_fp64_global_atomic_min_max 1
#define __opencl_c_ext_fp64_local_atomic_min_max 1
#endif
#define __opencl_c_ext_fp32_global_atomic_add 1
#define __opencl_c_ext_fp32_local_atomic_add 1
#define __opencl_c_ext_fp32_global_atomic_min_max 1
#define __opencl_c_ext_fp32_local_atomic_min_max 1
#endif // defined(__SPIR__)
#endif // defined(__SPIR__) || defined(__SPIRV__)
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// Define feature macros for OpenCL C 2.0
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
#if (__OPENCL_CPP_VERSION__ == 100 || __OPENCL_C_VERSION__ == 200)
#define __opencl_c_pipes 1
#define __opencl_c_generic_address_space 1
#define __opencl_c_work_group_collective_functions 1
@ -45,12 +64,19 @@
#endif
// Define header-only feature macros for OpenCL C 3.0.
#if (__OPENCL_C_VERSION__ == 300)
// For the SPIR target all features are supported.
#if defined(__SPIR__)
#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
// For the SPIR and SPIR-V target all features are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__)
#endif // (__OPENCL_C_VERSION__ == 300)
#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
#if !defined(__opencl_c_generic_address_space)
// Internal feature macro to provide named (global, local, private) address
// space overloads for builtin functions that take a pointer argument.
#define __opencl_c_named_address_space_builtins 1
#endif // !defined(__opencl_c_generic_address_space)
// built-in scalar data types:
@ -329,11 +355,17 @@ typedef enum memory_scope {
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
#if defined(__opencl_c_atomic_scope_all_devices)
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
memory_scope_all_devices = memory_scope_all_svm_devices,
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
#endif // defined(__opencl_c_atomic_scope_all_devices)
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
/**
* Subgroups have different requirements on forward progress, so just test
* all the relevant macros.
* CL 3.0 sub-groups: "they are not guaranteed to make independent forward progress".
* KHR subgroups: "Subgroups within a workgroup are independent, make forward progress with respect to each other".
*/
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;
@ -473,12 +505,14 @@ typedef int clk_profiling_info;
#define MAX_WORK_DIM 3
#ifdef __opencl_c_device_enqueue
typedef struct {
unsigned int workDimension;
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
#endif // __opencl_c_device_enqueue
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@ -572,6 +606,28 @@ typedef struct {
#define as_intptr_t(x) __builtin_astype((x), intptr_t)
#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
// C++ for OpenCL - __remove_address_space
#if defined(__OPENCL_CPP_VERSION__)
template <typename _Tp> struct __remove_address_space { using type = _Tp; };
#if defined(__opencl_c_generic_address_space)
template <typename _Tp> struct __remove_address_space<__generic _Tp> {
using type = _Tp;
};
#endif
template <typename _Tp> struct __remove_address_space<__global _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__private _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__local _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__constant _Tp> {
using type = _Tp;
};
#endif
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
#define __kernel_exec(X, typen) __kernel \

lib/include/opencl-c.h vendored

File diff suppressed because it is too large

View File

@ -17,9 +17,18 @@
// We require std::math functions in the complex builtins below.
#include <cmath>
#ifdef __NVPTX__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
#endif // __NVPTX__
#ifdef __AMDGCN__
#define __OPENMP_AMDGCN__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_AMDGCN__
#endif // __AMDGCN__
#endif
// Grab the host header too.
@ -36,11 +45,11 @@
#ifndef _LIBCPP_STD_VER
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
device = {arch(amdgcn, nvptx, nvptx64)}, \
implementation = {extension(match_any, allow_templates)})
#include <complex_cmath.h>
#pragma omp end declare variant
#endif
#endif // _LIBCPP_STD_VER

View File

@ -17,10 +17,19 @@
// We require math functions in the complex builtins below.
#include <math.h>
#ifdef __NVPTX__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
#endif
#ifdef __AMDGCN__
#define __OPENMP_AMDGCN__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_AMDGCN__
#endif
#endif
// Grab the host header too.
#include_next <complex.h>
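A hedged sketch of what the two wrapper changes above enable: with the AMDGCN branch added, complex arithmetic inside a target region is lowered through the CUDA-style helpers (e.g. __muldc3) when offloading to an AMD GPU. Assumes an offload-capable clang, e.g. -fopenmp --offload-arch=gfx90a; the function name is illustrative:

#include <complex.h>

double _Complex square_on_device(double _Complex z) {
  double _Complex w;
#pragma omp target map(to : z) map(from : w)
  w = z * z;   /* complex multiply, lowered to __muldc3 on the device */
  return w;
}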

View File

@ -10,6 +10,10 @@
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <emmintrin.h>
/* Define the default attributes for the functions in this file. */

View File

@ -35,7 +35,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
@ -2319,6 +2319,7 @@ _mm_castsi128_pd(__m128i __A)
#else
#include_next <emmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* EMMINTRIN_H_ */

View File

@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <stdlib.h>

View File

@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
@ -1445,6 +1445,7 @@ extern __inline __m64
#else
#include_next <mmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _MMINTRIN_H_INCLUDED */

View File

@ -38,7 +38,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>
@ -145,6 +145,7 @@ _mm_lddqu_si128 (__m128i const *__P)
#else
#include_next <pmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* PMMINTRIN_H_ */

View File

@ -29,10 +29,10 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
#include <emmintrin.h>
#include <tmmintrin.h>
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -104,6 +104,7 @@ extern __inline __m128i
#else
#include_next <smmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _SMMINTRIN_H_ */

View File

@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
@ -490,6 +490,7 @@ _mm_mulhrs_pi16 (__m64 __A, __m64 __B)
#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* TMMINTRIN_H_ */

View File

@ -34,7 +34,7 @@
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
@ -1838,6 +1838,7 @@ do { \
#else
#include_next <xmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _XMMINTRIN_H_INCLUDED */

View File

@ -47,9 +47,12 @@ _m_prefetch(void *__P)
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(void *__P)
_m_prefetchw(volatile const void *__P)
{
__builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}
#endif /* __PRFCHWINTRIN_H */
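A hedged sketch of why the signature change above helps: with the new volatile const void * parameter, prefetch-for-write hints can be issued on read-only data without a cast (assumes a target with the PRFCHW feature enabled; names are illustrative):

#include <x86intrin.h>

static void prewarm(const double *table, int n) {
  for (int i = 0; i < n; i += 8)
    _m_prefetchw(&table[i]);   /* hint: these cache lines will be written soon */
}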

lib/include/riscv_vector.h vendored

File diff suppressed because it is too large

View File

@ -10,6 +10,10 @@
#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
@ -231,7 +235,7 @@
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
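A minimal sketch: the extra outer parentheses added above only matter when the macro result is embedded in a larger expression; the rounding behaviour itself is unchanged (assumes SSE4.1 is enabled; the helper name is illustrative):

#include <smmintrin.h>

static __m128 floor_ps(__m128 v) {
  return _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}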
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
@ -272,8 +276,8 @@
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_round_ss(X, Y, M) \
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M))
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
/// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second
@ -306,7 +310,7 @@
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
@ -347,8 +351,8 @@
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_round_sd(X, Y, M) \
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M))
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
@ -376,8 +380,8 @@
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
(__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
@ -404,8 +408,8 @@
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
(__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
@ -513,8 +517,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
(__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M))
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M)))
/* SSE4 Dword Multiply Instructions. */
/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
@ -590,8 +594,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M))
((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
/// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of
@ -625,8 +629,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) \
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M))
((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
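A short usage sketch for the dot-product macro above: mask 0xF1 multiplies all four lanes and writes the sum only into lane 0, giving a horizontal dot product (assumes SSE4.1; the helper name is illustrative):

#include <smmintrin.h>

static float dot4(__m128 a, __m128 b) {
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}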
/* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a
@ -664,7 +668,7 @@ _mm_stream_load_si128 (__m128i const *__V)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -683,7 +687,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -702,7 +706,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -721,7 +725,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -740,7 +744,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -759,7 +763,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -778,7 +782,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -797,7 +801,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
}
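A hedged sketch: the generic builtin the min/max intrinsics above now lower to also works directly on any vector type (assumes clang >= 14; the typedef and function are illustrative):

typedef int v4si __attribute__((vector_size(16)));

static v4si clamp_to_zero(v4si x) {
  return __builtin_elementwise_max(x, (v4si){0, 0, 0, 0});
}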
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
@ -865,15 +869,13 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 10: Bits [95:64] of parameter \a X are returned. \n
/// 11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
__t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
__t.__i;}))
#define _mm_extract_ps(X, N) \
__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
{ (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
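A small sketch of why the do { ... } while (0) wrapper added above matters: the macro can now be used as a single statement, for example under an unbraced if/else (assumes SSE4.1; names are illustrative):

#include <smmintrin.h>

static float lane0_or_zero(__m128 v, int have_data) {
  float f;
  if (have_data)
    _MM_EXTRACT_FLOAT(f, v, 0);   /* expands to one statement */
  else
    f = 0.0f;
  return f;
}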
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
@ -925,8 +927,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N)))
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the 32-bit
@ -957,8 +959,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N)))
#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@ -988,8 +990,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N)))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
@ -1031,8 +1033,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 128-bit integer vector parameter and the remaining bits are assigned
/// zeros.
#define _mm_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N))
((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N)))
/// Extracts a 32-bit element from the 128-bit integer vector of
/// [4 x i32], using the immediate value parameter \a N as a selector.
@ -1057,7 +1059,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
@ -1080,7 +1082,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64 */
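A minimal round-trip sketch for the insert/extract macros above; the lane index must be an integer constant expression (assumes SSE4.1; the function name is illustrative):

#include <smmintrin.h>

static int roundtrip_lane2(__m128i v, int x) {
  v = _mm_insert_epi32(v, x, 2);
  return _mm_extract_epi32(v, 2);
}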
/* SSE4 128-bit Packed Integer Comparisons. */
@ -1514,8 +1516,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \returns A 128-bit integer vector containing the sums of the sets of
/// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M))
((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)))
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it and along with its index.
@ -1624,8 +1626,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpistrm(A, B, M) \
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1678,8 +1680,8 @@ _mm_minpos_epu16(__m128i __V)
/// 1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1738,9 +1740,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
(__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1797,9 +1799,9 @@ _mm_minpos_epu16(__m128i __V)
/// 1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Uses the immediate operand \a M to perform a comparison of string
@ -1849,8 +1851,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1898,8 +1900,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1946,8 +1948,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1996,8 +1998,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -2046,8 +2048,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2100,9 +2102,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2154,9 +2156,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2207,9 +2209,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2262,9 +2264,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2316,9 +2318,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
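A hedged sketch for the string-comparison macros above: this returns the index of the first byte of chunk that matches any byte in set, or 16 when nothing matches (assumes SSE4.2; names are illustrative):

#include <nmmintrin.h>

static int index_of_any(__m128i set, __m128i chunk) {
  return _mm_cmpistri(set, chunk,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                      _SIDD_LEAST_SIGNIFICANT);
}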
/* SSE4.2 Compare Packed Data -- Greater Than. */
/// Compares each of the corresponding 64-bit values of the 128-bit
@ -2340,91 +2342,10 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}
/* SSE4.2 Accumulate CRC32. */
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#include <popcntintrin.h>
#include <crc32intrin.h>
#endif /* __SMMINTRIN_H */

View File

@ -12,8 +12,12 @@
/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
* example, already has a Clang-compatible stdatomic.h header.
*
* Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
* explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
* to the clang resource header until that is fully supported.
*/
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>) && !defined(_MSC_VER)
# include_next <stdatomic.h>
#else
@ -40,6 +44,11 @@ extern "C" {
/* 7.17.2 Initialization */
#define ATOMIC_VAR_INIT(value) (value)
#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
#endif
#define atomic_init __c11_atomic_init
/* 7.17.3 Order and consistency */
@ -149,6 +158,10 @@ typedef _Atomic(uintmax_t) atomic_uintmax_t;
typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
#define ATOMIC_FLAG_INIT { 0 }
#if __cplusplus >= 202002L && !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_FLAG_INIT was deprecated in C++20 but is not deprecated in C. */
#pragma clang deprecated(ATOMIC_FLAG_INIT)
#endif
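A small sketch of the preferred replacements now that the deprecation warnings above are in place: plain initialization works for objects with static storage, and atomic_init covers objects with dynamic lifetime.

#include <stdatomic.h>

static atomic_int counter = 0;        /* static initialization needs no macro */

void reset_counter(atomic_int *c) {
  atomic_init(c, 0);                  /* for dynamically created atomics */
}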
/* These should be provided by the libc implementation. */
#ifdef __cplusplus

lib/include/stdint.h vendored
View File

@ -461,6 +461,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT64_MAX INT64_C( 9223372036854775807)
# define INT64_MIN (-INT64_C( 9223372036854775807)-1)
# define UINT64_MAX UINT64_C(18446744073709551615)
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT64_WIDTH 64
# define INT64_WIDTH UINT64_WIDTH
# define __UINT_LEAST64_WIDTH UINT64_WIDTH
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __STDC_VERSION__ */
# define __INT_LEAST64_MIN INT64_MIN
# define __INT_LEAST64_MAX INT64_MAX
# define __UINT_LEAST64_MAX UINT64_MAX
@ -482,6 +494,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST64_MIN __INT_LEAST64_MIN
# define INT_FAST64_MAX __INT_LEAST64_MAX
# define UINT_FAST64_MAX __UINT_LEAST64_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_FAST64_WIDTH UINT_FAST64_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST64_MIN */
@ -495,6 +516,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MIN INT56_MIN
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX
# define __INT_LEAST32_MIN INT56_MIN
# define __INT_LEAST32_MAX INT56_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
@ -504,6 +526,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT56_MIN
# define __INT_LEAST8_MAX INT56_MAX
# define __UINT_LEAST8_MAX UINT56_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */
@ -517,6 +553,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MIN INT48_MIN
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX
# define __INT_LEAST32_MIN INT48_MIN
# define __INT_LEAST32_MAX INT48_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
@ -526,6 +563,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT48_MIN
# define __INT_LEAST8_MAX INT48_MAX
# define __UINT_LEAST8_MAX UINT48_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */
@ -539,6 +590,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MIN INT40_MIN
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX
# define __INT_LEAST32_MIN INT40_MIN
# define __INT_LEAST32_MAX INT40_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
@ -548,6 +600,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT40_MIN
# define __INT_LEAST8_MAX INT40_MAX
# define __UINT_LEAST8_MAX UINT40_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */
@ -555,6 +621,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MAX INT32_C(2147483647)
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)
# define __INT_LEAST32_MIN INT32_MIN
# define __INT_LEAST32_MAX INT32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
@ -564,6 +631,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT32_MIN
# define __INT_LEAST8_MAX INT32_MAX
# define __UINT_LEAST8_MAX UINT32_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */
#ifdef __INT_LEAST32_MIN
@ -573,6 +650,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST32_MIN __INT_LEAST32_MIN
# define INT_FAST32_MAX __INT_LEAST32_MAX
# define UINT_FAST32_MAX __UINT_LEAST32_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_FAST32_WIDTH UINT_FAST32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST32_MIN */
@ -586,12 +672,26 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST24_MIN INT24_MIN
# define INT_FAST24_MAX INT24_MAX
# define UINT_FAST24_MAX UINT24_MAX
# define __INT_LEAST16_MIN INT24_MIN
# define __INT_LEAST16_MAX INT24_MAX
# define __UINT_LEAST16_MAX UINT24_MAX
# define __INT_LEAST8_MIN INT24_MIN
# define __INT_LEAST8_MAX INT24_MAX
# define __UINT_LEAST8_MAX UINT24_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT24_WIDTH 24
# define INT24_WIDTH UINT24_WIDTH
# define UINT_LEAST24_WIDTH UINT24_WIDTH
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
# define UINT_FAST24_WIDTH UINT24_WIDTH
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT24_TYPE__ */
@ -599,12 +699,22 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INT16_MAX INT16_C(32767)
#define INT16_MIN (-INT16_C(32767)-1)
#define UINT16_MAX UINT16_C(65535)
# define __INT_LEAST16_MIN INT16_MIN
# define __INT_LEAST16_MAX INT16_MAX
# define __UINT_LEAST16_MAX UINT16_MAX
# define __INT_LEAST8_MIN INT16_MIN
# define __INT_LEAST8_MAX INT16_MAX
# define __UINT_LEAST8_MAX UINT16_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT16_WIDTH 16
# define INT16_WIDTH UINT16_WIDTH
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT16_TYPE__ */
#ifdef __INT_LEAST16_MIN
@ -614,6 +724,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST16_MIN __INT_LEAST16_MIN
# define INT_FAST16_MAX __INT_LEAST16_MAX
# define UINT_FAST16_MAX __UINT_LEAST16_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_FAST16_WIDTH UINT_FAST16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST16_MIN */
@ -621,9 +740,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT8_MAX INT8_C(127)
# define INT8_MIN (-INT8_C(127)-1)
# define UINT8_MAX UINT8_C(255)
# define __INT_LEAST8_MIN INT8_MIN
# define __INT_LEAST8_MAX INT8_MAX
# define __UINT_LEAST8_MAX UINT8_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT8_WIDTH 8
# define INT8_WIDTH UINT8_WIDTH
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT8_TYPE__ */
#ifdef __INT_LEAST8_MIN
@ -633,6 +761,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST8_MIN __INT_LEAST8_MIN
# define INT_FAST8_MAX __INT_LEAST8_MAX
# define UINT_FAST8_MAX __UINT_LEAST8_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_FAST8_WIDTH UINT_FAST8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST8_MIN */
/* Some utility macros */
@ -652,6 +789,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define PTRDIFF_MAX __PTRDIFF_MAX__
#define SIZE_MAX __SIZE_MAX__
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTPTR_WIDTH __INTPTR_WIDTH__
#define UINTPTR_WIDTH __UINTPTR_WIDTH__
#endif
/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
* is enabled. */
#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
@ -663,6 +810,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INTMAX_MAX __INTMAX_MAX__
#define UINTMAX_MAX __UINTMAX_MAX__
/* C2x 7.20.2.5 Width of greatest-width integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTMAX_WIDTH __INTMAX_WIDTH__
#define UINTMAX_WIDTH __UINTMAX_WIDTH__
#endif
/* C99 7.18.3 Limits of other integer types. */
#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
@ -689,5 +846,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
/* C2x 7.20.3.x Width of other integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
#define SIZE_WIDTH __SIZE_WIDTH__
#define WCHAR_WIDTH __WCHAR_WIDTH__
#define WINT_WIDTH __WINT_WIDTH__
#endif
#endif /* __STDC_HOSTED__ */
#endif /* __CLANG_STDINT_H */
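With the new C2x guards in place, the width macros become visible only under -std=c2x (or whatever value __STDC_VERSION__ takes once the standard is published). A minimal sketch of a consumer, assuming a hosted toolchain; the file name and compiler invocation are illustrative, not part of the commit:

/* widths.c - e.g. clang -std=c2x widths.c (invocation is an assumption) */
#include <stdint.h>
#include <stdio.h>

int main(void) {
#if __STDC_VERSION__ >= 202000L
  /* Only reachable in C2x mode, mirroring the guards added above. */
  printf("SIZE_WIDTH=%d INTPTR_WIDTH=%d INTMAX_WIDTH=%d\n",
         (int)SIZE_WIDTH, (int)INTPTR_WIDTH, (int)INTMAX_WIDTH);
  printf("INT_LEAST8_WIDTH=%d INT_FAST32_WIDTH=%d\n",
         (int)INT_LEAST8_WIDTH, (int)INT_FAST32_WIDTH);
#else
  puts("width macros are not exposed before C2x");
#endif
  return 0;
}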

View File

@ -10,6 +10,10 @@
#ifndef __TMMINTRIN_H
#define __TMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
@ -49,7 +53,7 @@ _mm_abs_pi8(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
}
/// Computes the absolute value of each of the packed 16-bit signed
@ -85,7 +89,7 @@ _mm_abs_pi16(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
}
/// Computes the absolute value of each of the packed 32-bit signed
@ -121,7 +125,7 @@ _mm_abs_pi32(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
return (__m128i)__builtin_elementwise_abs((__v4si)__a);
}
/// Concatenates the two 128-bit integer vector operands, and
@ -145,8 +149,8 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) \
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n))
((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)))
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
@ -168,7 +172,7 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) \
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
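For reference, a minimal sketch of how the intrinsics touched above are typically exercised; the file name and the -mssse3 flag are assumptions, not part of the header:

/* ssse3_demo.c - e.g. clang -mssse3 ssse3_demo.c (invocation is an assumption) */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8,
                            -9, 10, -11, 12, -13, 14, -15, 16);
  __m128i abs8 = _mm_abs_epi8(v);             /* per the change above, lowered via __builtin_elementwise_abs */
  __m128i cat  = _mm_alignr_epi8(abs8, v, 4); /* concatenate abs8:v, shift right by 4 bytes */
  signed char out[16];
  _mm_storeu_si128((__m128i *)out, cat);
  printf("%d %d\n", out[0], out[15]);
  return 0;
}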
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].

View File

@ -172,7 +172,8 @@ typedef enum {
_UVRSC_CORE = 0, /* integer register */
_UVRSC_VFP = 1, /* vfp */
_UVRSC_WMMXD = 3, /* Intel WMMX data register */
_UVRSC_WMMXC = 4 /* Intel WMMX control register */
_UVRSC_WMMXC = 4, /* Intel WMMX control register */
_UVRSC_PSEUDO = 5 /* Special purpose pseudo register */
} _Unwind_VRS_RegClass;
typedef enum {

View File

@ -82,4 +82,4 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS_F
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#endif
#endif // __VAESINTRIN_H

View File

@ -15,15 +15,15 @@
#define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(char)(I))
(char)(I)))
#ifdef __AVX512FINTRIN_H
#define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I))
(char)(I)))
#endif // __AVX512FINTRIN_H
#endif /* __VPCLMULQDQINTRIN_H */
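The extra outer parentheses here (and in the similar hunks in the x86 headers below) look like a macro-hygiene fix: each macro now expands to a single parenthesized expression, so a use such as sizeof cannot split the leading cast off from the builtin call. A contrived sketch under made-up names (widget and MACRO_NEW are not from the header):

#include <stdio.h>

static int widget(int x) { return x + 1; }

#define MACRO_NEW(x) ((long)widget(x))  /* mirrors the new, fully parenthesized style */

int main(void) {
  /* With the old style, sizeof (long)widget(1) would parse as sizeof(long)
     followed by a stray call and fail to compile; the wrapped form is fine. */
  printf("%zu\n", sizeof MACRO_NEW(1));
  return 0;
}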

View File

@ -276,12 +276,28 @@ wasm_i8x16_make(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3, int8_t __c4,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u8x16_make(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3,
uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7,
uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11,
uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15) {
return (v128_t)(__u8x16){__c0, __c1, __c2, __c3, __c4, __c5,
__c6, __c7, __c8, __c9, __c10, __c11,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i16x8_make(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7) {
return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u16x8_make(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3,
uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7) {
return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0,
int32_t __c1,
int32_t __c2,
@ -289,11 +305,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0,
return (v128_t)(__i32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_make(uint32_t __c0,
uint32_t __c1,
uint32_t __c2,
uint32_t __c3) {
return (v128_t)(__u32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_make(int64_t __c0,
int64_t __c1) {
return (v128_t)(__i64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_make(uint64_t __c0,
uint64_t __c1) {
return (v128_t)(__u64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_make(float __c0,
float __c1,
float __c2,
@ -324,6 +352,24 @@ wasm_i8x16_const(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u8x16_const(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3,
uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7,
uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11,
uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
__REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4)
__REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6)
__REQUIRE_CONSTANT(__c7) __REQUIRE_CONSTANT(__c8)
__REQUIRE_CONSTANT(__c9) __REQUIRE_CONSTANT(__c10)
__REQUIRE_CONSTANT(__c11) __REQUIRE_CONSTANT(__c12)
__REQUIRE_CONSTANT(__c13) __REQUIRE_CONSTANT(__c14)
__REQUIRE_CONSTANT(__c15) {
return (v128_t)(__u8x16){__c0, __c1, __c2, __c3, __c4, __c5,
__c6, __c7, __c8, __c9, __c10, __c11,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7)
@ -334,6 +380,16 @@ wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u16x8_const(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3,
uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
__REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4)
__REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6)
__REQUIRE_CONSTANT(__c7) {
return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
@ -341,12 +397,25 @@ wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3)
return (v128_t)(__i32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u32x4_const(uint32_t __c0, uint32_t __c1, uint32_t __c2, uint32_t __c3)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
__REQUIRE_CONSTANT(__c3) {
return (v128_t)(__u32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const(int64_t __c0,
int64_t __c1)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) {
return (v128_t)(__i64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const(uint64_t __c0,
uint64_t __c1)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) {
return (v128_t)(__u64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_f32x4_const(float __c0, float __c1, float __c2, float __c3)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
@ -366,21 +435,42 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_const_splat(int8_t __c)
__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_const_splat(uint8_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u8x16){__c, __c, __c, __c, __c, __c, __c, __c,
__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_const_splat(int16_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__i16x8){__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_const_splat(uint16_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u16x8){__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_const_splat(int32_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__i32x4){__c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_const_splat(uint32_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u32x4){__c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const_splat(int64_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__i64x2){__c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const_splat(uint64_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u64x2){__c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_const_splat(float __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__f32x4){__c, __c, __c, __c};
@ -396,6 +486,11 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) {
__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_splat(uint8_t __a) {
return (v128_t)(__u8x16){__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ int8_t __DEFAULT_FN_ATTRS wasm_i8x16_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
@ -417,10 +512,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_replace_lane(v128_t __a,
int __i,
uint8_t __b)
__REQUIRE_CONSTANT(__i) {
__u8x16 __v = (__u8x16)__a;
__v[__i] = __b;
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) {
return (v128_t)(__i16x8){__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_splat(uint16_t __a) {
return (v128_t)(__u16x8){__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ int16_t __DEFAULT_FN_ATTRS wasm_i16x8_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
@ -441,16 +549,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_replace_lane(
v128_t __a, int __i, uint16_t __b) __REQUIRE_CONSTANT(__i) {
__u16x8 __v = (__u16x8)__a;
__v[__i] = __b;
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_splat(int32_t __a) {
return (v128_t)(__i32x4){__a, __a, __a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_splat(uint32_t __a) {
return (v128_t)(__u32x4){__a, __a, __a, __a};
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
return ((__i32x4)__a)[__i];
}
static __inline__ uint32_t __DEFAULT_FN_ATTRS
wasm_u32x4_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) {
return ((__u32x4)__a)[__i];
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a,
int __i,
int32_t __b)
@ -460,16 +584,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_replace_lane(
v128_t __a, int __i, uint32_t __b) __REQUIRE_CONSTANT(__i) {
__u32x4 __v = (__u32x4)__a;
__v[__i] = __b;
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_splat(int64_t __a) {
return (v128_t)(__i64x2){__a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_splat(uint64_t __a) {
return (v128_t)(__u64x2){__a, __a};
}
static __inline__ int64_t __DEFAULT_FN_ATTRS wasm_i64x2_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
return ((__i64x2)__a)[__i];
}
static __inline__ uint64_t __DEFAULT_FN_ATTRS
wasm_u64x2_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) {
return ((__u64x2)__a)[__i];
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a,
int __i,
int64_t __b)
@ -479,6 +619,13 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_replace_lane(
v128_t __a, int __i, uint64_t __b) __REQUIRE_CONSTANT(__i) {
__u64x2 __v = (__u64x2)__a;
__v[__i] = __b;
return (v128_t)__v;
}
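A short sketch of the newly added unsigned make/splat/lane-access helpers; the function name and the wasm32 target with -msimd128 are assumptions, not part of the header:

#include <stdint.h>
#include <wasm_simd128.h>

uint32_t u32_lane_demo(void) {
  v128_t v = wasm_u32x4_make(1u, 2u, 3u, 0xFFFFFFFFu);
  v = wasm_u32x4_replace_lane(v, 0, 42u);   /* lane index must be a compile-time constant */
  return wasm_u32x4_extract_lane(v, 0) + wasm_u32x4_extract_lane(v, 3);
}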
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_splat(float __a) {
return (v128_t)(__f32x4){__a, __a, __a, __a};
}
@ -804,7 +951,7 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i8x16_all_true(v128_t __a) {
return __builtin_wasm_all_true_i8x16((__i8x16)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i8x16((__i8x16)__a);
}
@ -813,17 +960,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) {
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i8x16)__a << __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i8x16)__a >> __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u8x16)__a >> __b);
}
@ -894,22 +1041,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i16x8_all_true(v128_t __a) {
return __builtin_wasm_all_true_i16x8((__i16x8)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i16x8((__i16x8)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i16x8)__a << __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i16x8)__a >> __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u16x8)__a >> __b);
}
@ -985,22 +1132,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i32x4_all_true(v128_t __a) {
return __builtin_wasm_all_true_i32x4((__i32x4)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i32x4((__i32x4)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i32x4)__a << __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i32x4)__a >> __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u32x4)__a >> __b);
}
@ -1056,22 +1203,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i64x2_all_true(v128_t __a) {
return __builtin_wasm_all_true_i64x2((__i64x2)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i64x2((__i64x2)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i64x2)__a << (int64_t)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i64x2)__a >> (int64_t)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u64x2)__a >> (int64_t)__b);
}
@ -1150,14 +1297,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_max(v128_t __a,
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a,
v128_t __b) {
__i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a);
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a,
v128_t __b) {
__i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b);
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) {
@ -1220,14 +1365,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_max(v128_t __a,
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a,
v128_t __b) {
__i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a);
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a,
v128_t __b) {
__i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b);
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b);
}
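For context, the removed mask-and-select sequences and the new __builtin_wasm_pmin/pmax calls compute the same lane-wise pseudo-minimum and pseudo-maximum (b < a ? b : a, and a < b ? b : a), which intentionally differ from IEEE min/max for NaN and signed zero; routing through the builtins presumably lets the backend emit the pmin/pmax instructions directly instead of pattern-matching the select. A scalar sketch of the per-lane rule (names are illustrative):

static inline float pmin_lane(float a, float b) { return b < a ? b : a; }
static inline float pmax_lane(float a, float b) { return a < b ? b : a; }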
static __inline__ v128_t __DEFAULT_FN_ATTRS

View File

@ -10,6 +10,10 @@
#ifndef __WMMINTRIN_H
#define __WMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <emmintrin.h>
#include <__wmmintrin_aes.h>

View File

@ -20,4 +20,16 @@
#include <uintrintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__CRC32__)
#include <crc32intrin.h>
#endif
#define __SSC_MARK(Tag) \
__asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
"mov {%0, %%ebx|ebx, %0}; " \
".byte 0x64, 0x67, 0x90; " \
"mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
: "%eax");
#endif /* __X86GPRINTRIN_H */
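The new __SSC_MARK macro emits the 0x64 0x67 0x90 marker sequence that simulation and profiling tools (Intel SDE, for instance) recognize as a region delimiter. A minimal sketch; the helper function is a placeholder, and the 0x111/0x222 tag values follow a common start/stop convention rather than anything mandated by the header:

#include <x86gprintrin.h>

void kernel_under_test(void);

void traced_region(void) {
  __SSC_MARK(0x111);   /* conventional "start" tag */
  kernel_under_test();
  __SSC_MARK(0x222);   /* conventional "stop" tag */
}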

View File

@ -10,6 +10,10 @@
#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <mmintrin.h>
typedef int __v4si __attribute__((__vector_size__(16)));
@ -2181,7 +2185,7 @@ void _mm_sfence(void);
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
(int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
@ -2212,7 +2216,7 @@ void _mm_sfence(void);
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) \
(__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
/// Compares each of the corresponding packed 16-bit integer values of
/// the 64-bit integer vectors, and writes the greater value to the
@ -2359,7 +2363,7 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
@ -2601,8 +2605,8 @@ void _mm_setcsr(unsigned int __i);
/// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
(int)(mask))
((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
(int)(mask)))
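A minimal sketch of the _mm_shuffle_ps selector described above, using the _MM_SHUFFLE helper from this header; the file name and invocation are illustrative:

/* shuffle_demo.c - e.g. clang shuffle_demo.c (SSE is assumed available) */
#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
  __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
  /* Lower two result lanes come from a (indices 1 then 0),
     upper two from b (indices 2 then 3). */
  __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 0, 1));
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 0 6 7 */
  return 0;
}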
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].

View File

@ -225,16 +225,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
}
#define _mm_roti_epi8(A, N) \
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)))
#define _mm_roti_epi16(A, N) \
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)))
#define _mm_roti_epi32(A, N) \
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)))
#define _mm_roti_epi64(A, N) \
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shl_epi8(__m128i __A, __m128i __B)
@ -285,36 +285,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
}
#define _mm_com_epu8(A, B, N) \
(__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)))
#define _mm_com_epu16(A, B, N) \
(__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)))
#define _mm_com_epu32(A, B, N) \
(__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)))
#define _mm_com_epu64(A, B, N) \
(__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)))
#define _mm_com_epi8(A, B, N) \
(__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)))
#define _mm_com_epi16(A, B, N) \
(__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)))
#define _mm_com_epi32(A, B, N) \
(__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)))
#define _mm_com_epi64(A, B, N) \
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)))
#define _MM_PCOMCTRL_LT 0
#define _MM_PCOMCTRL_LE 1
@ -710,23 +710,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
}
#define _mm_permute2_pd(X, Y, C, I) \
(__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), \
(__v2di)(__m128i)(C), (I))
(__v2di)(__m128i)(C), (I)))
#define _mm256_permute2_pd(X, Y, C, I) \
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
(__v4df)(__m256d)(Y), \
(__v4di)(__m256i)(C), (I))
(__v4di)(__m256i)(C), (I)))
#define _mm_permute2_ps(X, Y, C, I) \
(__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I))
((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I)))
#define _mm256_permute2_ps(X, Y, C, I) \
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
(__v8sf)(__m256)(Y), \
(__v8si)(__m256i)(C), (I))
(__v8si)(__m256i)(C), (I)))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_frcz_ss(__m128 __A)