update c_headers/* to LLVM 7.0.0rc1

2026-02-21 16:54:52 +00:00 · 2018-08-05 02:20:05 -04:00 · 2018-08-05 02:20:05 -04:00 · 6cf248ec08
commit 6cf248ec08
parent ee68f28bba
87 changed files with 19107 additions and 15970 deletions
--- a/c_headers/__clang_cuda_builtin_vars.h
+++ b/c_headers/__clang_cuda_builtin_vars.h
@ -54,7 +54,7 @@ struct dim3;
 #define __DELETE
 #endif

-// Make sure nobody can create instances of the special varible types.  nvcc
+// Make sure nobody can create instances of the special variable types.  nvcc
 // also disallows taking address of special variables, so we disable address-of
 // operator as well.
 #define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName)                            \
--- a/c_headers/__clang_cuda_device_functions.h
+++ b/c_headers/__clang_cuda_device_functions.h
--- a/c_headers/__clang_cuda_intrinsics.h
+++ b/c_headers/__clang_cuda_intrinsics.h
@ -277,6 +277,9 @@ inline __device__ long long __ldg(const long long *ptr) {
 inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
 }
+inline __device__ signed char __ldg(const signed char *ptr) {
+  return __nvvm_ldg_uc((const unsigned char *)ptr);
+}
 inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
 }
--- a/c_headers/__clang_cuda_libdevice_declares.h
+++ b/c_headers/__clang_cuda_libdevice_declares.h
@ -0,0 +1,466 @@
+/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
+
+extern "C" {
+
+__device__ int __nv_abs(int __a);
+__device__ double __nv_acos(double __a);
+__device__ float __nv_acosf(float __a);
+__device__ double __nv_acosh(double __a);
+__device__ float __nv_acoshf(float __a);
+__device__ double __nv_asin(double __a);
+__device__ float __nv_asinf(float __a);
+__device__ double __nv_asinh(double __a);
+__device__ float __nv_asinhf(float __a);
+__device__ double __nv_atan2(double __a, double __b);
+__device__ float __nv_atan2f(float __a, float __b);
+__device__ double __nv_atan(double __a);
+__device__ float __nv_atanf(float __a);
+__device__ double __nv_atanh(double __a);
+__device__ float __nv_atanhf(float __a);
+__device__ int __nv_brev(int __a);
+__device__ long long __nv_brevll(long long __a);
+__device__ int __nv_byte_perm(int __a, int __b, int __c);
+__device__ double __nv_cbrt(double __a);
+__device__ float __nv_cbrtf(float __a);
+__device__ double __nv_ceil(double __a);
+__device__ float __nv_ceilf(float __a);
+__device__ int __nv_clz(int __a);
+__device__ int __nv_clzll(long long __a);
+__device__ double __nv_copysign(double __a, double __b);
+__device__ float __nv_copysignf(float __a, float __b);
+__device__ double __nv_cos(double __a);
+__device__ float __nv_cosf(float __a);
+__device__ double __nv_cosh(double __a);
+__device__ float __nv_coshf(float __a);
+__device__ double __nv_cospi(double __a);
+__device__ float __nv_cospif(float __a);
+__device__ double __nv_cyl_bessel_i0(double __a);
+__device__ float __nv_cyl_bessel_i0f(float __a);
+__device__ double __nv_cyl_bessel_i1(double __a);
+__device__ float __nv_cyl_bessel_i1f(float __a);
+__device__ double __nv_dadd_rd(double __a, double __b);
+__device__ double __nv_dadd_rn(double __a, double __b);
+__device__ double __nv_dadd_ru(double __a, double __b);
+__device__ double __nv_dadd_rz(double __a, double __b);
+__device__ double __nv_ddiv_rd(double __a, double __b);
+__device__ double __nv_ddiv_rn(double __a, double __b);
+__device__ double __nv_ddiv_ru(double __a, double __b);
+__device__ double __nv_ddiv_rz(double __a, double __b);
+__device__ double __nv_dmul_rd(double __a, double __b);
+__device__ double __nv_dmul_rn(double __a, double __b);
+__device__ double __nv_dmul_ru(double __a, double __b);
+__device__ double __nv_dmul_rz(double __a, double __b);
+__device__ float __nv_double2float_rd(double __a);
+__device__ float __nv_double2float_rn(double __a);
+__device__ float __nv_double2float_ru(double __a);
+__device__ float __nv_double2float_rz(double __a);
+__device__ int __nv_double2hiint(double __a);
+__device__ int __nv_double2int_rd(double __a);
+__device__ int __nv_double2int_rn(double __a);
+__device__ int __nv_double2int_ru(double __a);
+__device__ int __nv_double2int_rz(double __a);
+__device__ long long __nv_double2ll_rd(double __a);
+__device__ long long __nv_double2ll_rn(double __a);
+__device__ long long __nv_double2ll_ru(double __a);
+__device__ long long __nv_double2ll_rz(double __a);
+__device__ int __nv_double2loint(double __a);
+__device__ unsigned int __nv_double2uint_rd(double __a);
+__device__ unsigned int __nv_double2uint_rn(double __a);
+__device__ unsigned int __nv_double2uint_ru(double __a);
+__device__ unsigned int __nv_double2uint_rz(double __a);
+__device__ unsigned long long __nv_double2ull_rd(double __a);
+__device__ unsigned long long __nv_double2ull_rn(double __a);
+__device__ unsigned long long __nv_double2ull_ru(double __a);
+__device__ unsigned long long __nv_double2ull_rz(double __a);
+__device__ unsigned long long __nv_double_as_longlong(double __a);
+__device__ double __nv_drcp_rd(double __a);
+__device__ double __nv_drcp_rn(double __a);
+__device__ double __nv_drcp_ru(double __a);
+__device__ double __nv_drcp_rz(double __a);
+__device__ double __nv_dsqrt_rd(double __a);
+__device__ double __nv_dsqrt_rn(double __a);
+__device__ double __nv_dsqrt_ru(double __a);
+__device__ double __nv_dsqrt_rz(double __a);
+__device__ double __nv_dsub_rd(double __a, double __b);
+__device__ double __nv_dsub_rn(double __a, double __b);
+__device__ double __nv_dsub_ru(double __a, double __b);
+__device__ double __nv_dsub_rz(double __a, double __b);
+__device__ double __nv_erfc(double __a);
+__device__ float __nv_erfcf(float __a);
+__device__ double __nv_erfcinv(double __a);
+__device__ float __nv_erfcinvf(float __a);
+__device__ double __nv_erfcx(double __a);
+__device__ float __nv_erfcxf(float __a);
+__device__ double __nv_erf(double __a);
+__device__ float __nv_erff(float __a);
+__device__ double __nv_erfinv(double __a);
+__device__ float __nv_erfinvf(float __a);
+__device__ double __nv_exp10(double __a);
+__device__ float __nv_exp10f(float __a);
+__device__ double __nv_exp2(double __a);
+__device__ float __nv_exp2f(float __a);
+__device__ double __nv_exp(double __a);
+__device__ float __nv_expf(float __a);
+__device__ double __nv_expm1(double __a);
+__device__ float __nv_expm1f(float __a);
+__device__ double __nv_fabs(double __a);
+__device__ float __nv_fabsf(float __a);
+__device__ float __nv_fadd_rd(float __a, float __b);
+__device__ float __nv_fadd_rn(float __a, float __b);
+__device__ float __nv_fadd_ru(float __a, float __b);
+__device__ float __nv_fadd_rz(float __a, float __b);
+__device__ float __nv_fast_cosf(float __a);
+__device__ float __nv_fast_exp10f(float __a);
+__device__ float __nv_fast_expf(float __a);
+__device__ float __nv_fast_fdividef(float __a, float __b);
+__device__ float __nv_fast_log10f(float __a);
+__device__ float __nv_fast_log2f(float __a);
+__device__ float __nv_fast_logf(float __a);
+__device__ float __nv_fast_powf(float __a, float __b);
+__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr);
+__device__ float __nv_fast_sinf(float __a);
+__device__ float __nv_fast_tanf(float __a);
+__device__ double __nv_fdim(double __a, double __b);
+__device__ float __nv_fdimf(float __a, float __b);
+__device__ float __nv_fdiv_rd(float __a, float __b);
+__device__ float __nv_fdiv_rn(float __a, float __b);
+__device__ float __nv_fdiv_ru(float __a, float __b);
+__device__ float __nv_fdiv_rz(float __a, float __b);
+__device__ int __nv_ffs(int __a);
+__device__ int __nv_ffsll(long long __a);
+__device__ int __nv_finitef(float __a);
+__device__ unsigned short __nv_float2half_rn(float __a);
+__device__ int __nv_float2int_rd(float __a);
+__device__ int __nv_float2int_rn(float __a);
+__device__ int __nv_float2int_ru(float __a);
+__device__ int __nv_float2int_rz(float __a);
+__device__ long long __nv_float2ll_rd(float __a);
+__device__ long long __nv_float2ll_rn(float __a);
+__device__ long long __nv_float2ll_ru(float __a);
+__device__ long long __nv_float2ll_rz(float __a);
+__device__ unsigned int __nv_float2uint_rd(float __a);
+__device__ unsigned int __nv_float2uint_rn(float __a);
+__device__ unsigned int __nv_float2uint_ru(float __a);
+__device__ unsigned int __nv_float2uint_rz(float __a);
+__device__ unsigned long long __nv_float2ull_rd(float __a);
+__device__ unsigned long long __nv_float2ull_rn(float __a);
+__device__ unsigned long long __nv_float2ull_ru(float __a);
+__device__ unsigned long long __nv_float2ull_rz(float __a);
+__device__ int __nv_float_as_int(float __a);
+__device__ unsigned int __nv_float_as_uint(float __a);
+__device__ double __nv_floor(double __a);
+__device__ float __nv_floorf(float __a);
+__device__ double __nv_fma(double __a, double __b, double __c);
+__device__ float __nv_fmaf(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rd(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rn(float __a, float __b, float __c);
+__device__ float __nv_fmaf_ru(float __a, float __b, float __c);
+__device__ float __nv_fmaf_rz(float __a, float __b, float __c);
+__device__ double __nv_fma_rd(double __a, double __b, double __c);
+__device__ double __nv_fma_rn(double __a, double __b, double __c);
+__device__ double __nv_fma_ru(double __a, double __b, double __c);
+__device__ double __nv_fma_rz(double __a, double __b, double __c);
+__device__ double __nv_fmax(double __a, double __b);
+__device__ float __nv_fmaxf(float __a, float __b);
+__device__ double __nv_fmin(double __a, double __b);
+__device__ float __nv_fminf(float __a, float __b);
+__device__ double __nv_fmod(double __a, double __b);
+__device__ float __nv_fmodf(float __a, float __b);
+__device__ float __nv_fmul_rd(float __a, float __b);
+__device__ float __nv_fmul_rn(float __a, float __b);
+__device__ float __nv_fmul_ru(float __a, float __b);
+__device__ float __nv_fmul_rz(float __a, float __b);
+__device__ float __nv_frcp_rd(float __a);
+__device__ float __nv_frcp_rn(float __a);
+__device__ float __nv_frcp_ru(float __a);
+__device__ float __nv_frcp_rz(float __a);
+__device__ double __nv_frexp(double __a, int *__b);
+__device__ float __nv_frexpf(float __a, int *__b);
+__device__ float __nv_frsqrt_rn(float __a);
+__device__ float __nv_fsqrt_rd(float __a);
+__device__ float __nv_fsqrt_rn(float __a);
+__device__ float __nv_fsqrt_ru(float __a);
+__device__ float __nv_fsqrt_rz(float __a);
+__device__ float __nv_fsub_rd(float __a, float __b);
+__device__ float __nv_fsub_rn(float __a, float __b);
+__device__ float __nv_fsub_ru(float __a, float __b);
+__device__ float __nv_fsub_rz(float __a, float __b);
+__device__ int __nv_hadd(int __a, int __b);
+__device__ float __nv_half2float(unsigned short __h);
+__device__ double __nv_hiloint2double(int __a, int __b);
+__device__ double __nv_hypot(double __a, double __b);
+__device__ float __nv_hypotf(float __a, float __b);
+__device__ int __nv_ilogb(double __a);
+__device__ int __nv_ilogbf(float __a);
+__device__ double __nv_int2double_rn(int __a);
+__device__ float __nv_int2float_rd(int __a);
+__device__ float __nv_int2float_rn(int __a);
+__device__ float __nv_int2float_ru(int __a);
+__device__ float __nv_int2float_rz(int __a);
+__device__ float __nv_int_as_float(int __a);
+__device__ int __nv_isfinited(double __a);
+__device__ int __nv_isinfd(double __a);
+__device__ int __nv_isinff(float __a);
+__device__ int __nv_isnand(double __a);
+__device__ int __nv_isnanf(float __a);
+__device__ double __nv_j0(double __a);
+__device__ float __nv_j0f(float __a);
+__device__ double __nv_j1(double __a);
+__device__ float __nv_j1f(float __a);
+__device__ float __nv_jnf(int __a, float __b);
+__device__ double __nv_jn(int __a, double __b);
+__device__ double __nv_ldexp(double __a, int __b);
+__device__ float __nv_ldexpf(float __a, int __b);
+__device__ double __nv_lgamma(double __a);
+__device__ float __nv_lgammaf(float __a);
+__device__ double __nv_ll2double_rd(long long __a);
+__device__ double __nv_ll2double_rn(long long __a);
+__device__ double __nv_ll2double_ru(long long __a);
+__device__ double __nv_ll2double_rz(long long __a);
+__device__ float __nv_ll2float_rd(long long __a);
+__device__ float __nv_ll2float_rn(long long __a);
+__device__ float __nv_ll2float_ru(long long __a);
+__device__ float __nv_ll2float_rz(long long __a);
+__device__ long long __nv_llabs(long long __a);
+__device__ long long __nv_llmax(long long __a, long long __b);
+__device__ long long __nv_llmin(long long __a, long long __b);
+__device__ long long __nv_llrint(double __a);
+__device__ long long __nv_llrintf(float __a);
+__device__ long long __nv_llround(double __a);
+__device__ long long __nv_llroundf(float __a);
+__device__ double __nv_log10(double __a);
+__device__ float __nv_log10f(float __a);
+__device__ double __nv_log1p(double __a);
+__device__ float __nv_log1pf(float __a);
+__device__ double __nv_log2(double __a);
+__device__ float __nv_log2f(float __a);
+__device__ double __nv_logb(double __a);
+__device__ float __nv_logbf(float __a);
+__device__ double __nv_log(double __a);
+__device__ float __nv_logf(float __a);
+__device__ double __nv_longlong_as_double(long long __a);
+__device__ int __nv_max(int __a, int __b);
+__device__ int __nv_min(int __a, int __b);
+__device__ double __nv_modf(double __a, double *__b);
+__device__ float __nv_modff(float __a, float *__b);
+__device__ int __nv_mul24(int __a, int __b);
+__device__ long long __nv_mul64hi(long long __a, long long __b);
+__device__ int __nv_mulhi(int __a, int __b);
+__device__ double __nv_nan(const signed char *__a);
+__device__ float __nv_nanf(const signed char *__a);
+__device__ double __nv_nearbyint(double __a);
+__device__ float __nv_nearbyintf(float __a);
+__device__ double __nv_nextafter(double __a, double __b);
+__device__ float __nv_nextafterf(float __a, float __b);
+__device__ double __nv_norm3d(double __a, double __b, double __c);
+__device__ float __nv_norm3df(float __a, float __b, float __c);
+__device__ double __nv_norm4d(double __a, double __b, double __c, double __d);
+__device__ float __nv_norm4df(float __a, float __b, float __c, float __d);
+__device__ double __nv_normcdf(double __a);
+__device__ float __nv_normcdff(float __a);
+__device__ double __nv_normcdfinv(double __a);
+__device__ float __nv_normcdfinvf(float __a);
+__device__ float __nv_normf(int __a, const float *__b);
+__device__ double __nv_norm(int __a, const double *__b);
+__device__ int __nv_popc(int __a);
+__device__ int __nv_popcll(long long __a);
+__device__ double __nv_pow(double __a, double __b);
+__device__ float __nv_powf(float __a, float __b);
+__device__ double __nv_powi(double __a, int __b);
+__device__ float __nv_powif(float __a, int __b);
+__device__ double __nv_rcbrt(double __a);
+__device__ float __nv_rcbrtf(float __a);
+__device__ double __nv_rcp64h(double __a);
+__device__ double __nv_remainder(double __a, double __b);
+__device__ float __nv_remainderf(float __a, float __b);
+__device__ double __nv_remquo(double __a, double __b, int *__c);
+__device__ float __nv_remquof(float __a, float __b, int *__c);
+__device__ int __nv_rhadd(int __a, int __b);
+__device__ double __nv_rhypot(double __a, double __b);
+__device__ float __nv_rhypotf(float __a, float __b);
+__device__ double __nv_rint(double __a);
+__device__ float __nv_rintf(float __a);
+__device__ double __nv_rnorm3d(double __a, double __b, double __c);
+__device__ float __nv_rnorm3df(float __a, float __b, float __c);
+__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
+__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
+__device__ float __nv_rnormf(int __a, const float *__b);
+__device__ double __nv_rnorm(int __a, const double *__b);
+__device__ double __nv_round(double __a);
+__device__ float __nv_roundf(float __a);
+__device__ double __nv_rsqrt(double __a);
+__device__ float __nv_rsqrtf(float __a);
+__device__ int __nv_sad(int __a, int __b, int __c);
+__device__ float __nv_saturatef(float __a);
+__device__ double __nv_scalbn(double __a, int __b);
+__device__ float __nv_scalbnf(float __a, int __b);
+__device__ int __nv_signbitd(double __a);
+__device__ int __nv_signbitf(float __a);
+__device__ void __nv_sincos(double __a, double *__b, double *__c);
+__device__ void __nv_sincosf(float __a, float *__b, float *__c);
+__device__ void __nv_sincospi(double __a, double *__b, double *__c);
+__device__ void __nv_sincospif(float __a, float *__b, float *__c);
+__device__ double __nv_sin(double __a);
+__device__ float __nv_sinf(float __a);
+__device__ double __nv_sinh(double __a);
+__device__ float __nv_sinhf(float __a);
+__device__ double __nv_sinpi(double __a);
+__device__ float __nv_sinpif(float __a);
+__device__ double __nv_sqrt(double __a);
+__device__ float __nv_sqrtf(float __a);
+__device__ double __nv_tan(double __a);
+__device__ float __nv_tanf(float __a);
+__device__ double __nv_tanh(double __a);
+__device__ float __nv_tanhf(float __a);
+__device__ double __nv_tgamma(double __a);
+__device__ float __nv_tgammaf(float __a);
+__device__ double __nv_trunc(double __a);
+__device__ float __nv_truncf(float __a);
+__device__ int __nv_uhadd(unsigned int __a, unsigned int __b);
+__device__ double __nv_uint2double_rn(unsigned int __i);
+__device__ float __nv_uint2float_rd(unsigned int __a);
+__device__ float __nv_uint2float_rn(unsigned int __a);
+__device__ float __nv_uint2float_ru(unsigned int __a);
+__device__ float __nv_uint2float_rz(unsigned int __a);
+__device__ float __nv_uint_as_float(unsigned int __a);
+__device__ double __nv_ull2double_rd(unsigned long long __a);
+__device__ double __nv_ull2double_rn(unsigned long long __a);
+__device__ double __nv_ull2double_ru(unsigned long long __a);
+__device__ double __nv_ull2double_rz(unsigned long long __a);
+__device__ float __nv_ull2float_rd(unsigned long long __a);
+__device__ float __nv_ull2float_rn(unsigned long long __a);
+__device__ float __nv_ull2float_ru(unsigned long long __a);
+__device__ float __nv_ull2float_rz(unsigned long long __a);
+__device__ unsigned long long __nv_ullmax(unsigned long long __a,
+                                          unsigned long long __b);
+__device__ unsigned long long __nv_ullmin(unsigned long long __a,
+                                          unsigned long long __b);
+__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
+__device__ unsigned long long __nv_umul64hi(unsigned long long __a,
+                                            unsigned long long __b);
+__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
+__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
+                                  unsigned int __c);
+#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
+__device__ int __nv_vabs2(int __a);
+__device__ int __nv_vabs4(int __a);
+__device__ int __nv_vabsdiffs2(int __a, int __b);
+__device__ int __nv_vabsdiffs4(int __a, int __b);
+__device__ int __nv_vabsdiffu2(int __a, int __b);
+__device__ int __nv_vabsdiffu4(int __a, int __b);
+__device__ int __nv_vabsss2(int __a);
+__device__ int __nv_vabsss4(int __a);
+__device__ int __nv_vadd2(int __a, int __b);
+__device__ int __nv_vadd4(int __a, int __b);
+__device__ int __nv_vaddss2(int __a, int __b);
+__device__ int __nv_vaddss4(int __a, int __b);
+__device__ int __nv_vaddus2(int __a, int __b);
+__device__ int __nv_vaddus4(int __a, int __b);
+__device__ int __nv_vavgs2(int __a, int __b);
+__device__ int __nv_vavgs4(int __a, int __b);
+__device__ int __nv_vavgu2(int __a, int __b);
+__device__ int __nv_vavgu4(int __a, int __b);
+__device__ int __nv_vcmpeq2(int __a, int __b);
+__device__ int __nv_vcmpeq4(int __a, int __b);
+__device__ int __nv_vcmpges2(int __a, int __b);
+__device__ int __nv_vcmpges4(int __a, int __b);
+__device__ int __nv_vcmpgeu2(int __a, int __b);
+__device__ int __nv_vcmpgeu4(int __a, int __b);
+__device__ int __nv_vcmpgts2(int __a, int __b);
+__device__ int __nv_vcmpgts4(int __a, int __b);
+__device__ int __nv_vcmpgtu2(int __a, int __b);
+__device__ int __nv_vcmpgtu4(int __a, int __b);
+__device__ int __nv_vcmples2(int __a, int __b);
+__device__ int __nv_vcmples4(int __a, int __b);
+__device__ int __nv_vcmpleu2(int __a, int __b);
+__device__ int __nv_vcmpleu4(int __a, int __b);
+__device__ int __nv_vcmplts2(int __a, int __b);
+__device__ int __nv_vcmplts4(int __a, int __b);
+__device__ int __nv_vcmpltu2(int __a, int __b);
+__device__ int __nv_vcmpltu4(int __a, int __b);
+__device__ int __nv_vcmpne2(int __a, int __b);
+__device__ int __nv_vcmpne4(int __a, int __b);
+__device__ int __nv_vhaddu2(int __a, int __b);
+__device__ int __nv_vhaddu4(int __a, int __b);
+__device__ int __nv_vmaxs2(int __a, int __b);
+__device__ int __nv_vmaxs4(int __a, int __b);
+__device__ int __nv_vmaxu2(int __a, int __b);
+__device__ int __nv_vmaxu4(int __a, int __b);
+__device__ int __nv_vmins2(int __a, int __b);
+__device__ int __nv_vmins4(int __a, int __b);
+__device__ int __nv_vminu2(int __a, int __b);
+__device__ int __nv_vminu4(int __a, int __b);
+__device__ int __nv_vneg2(int __a);
+__device__ int __nv_vneg4(int __a);
+__device__ int __nv_vnegss2(int __a);
+__device__ int __nv_vnegss4(int __a);
+__device__ int __nv_vsads2(int __a, int __b);
+__device__ int __nv_vsads4(int __a, int __b);
+__device__ int __nv_vsadu2(int __a, int __b);
+__device__ int __nv_vsadu4(int __a, int __b);
+__device__ int __nv_vseteq2(int __a, int __b);
+__device__ int __nv_vseteq4(int __a, int __b);
+__device__ int __nv_vsetges2(int __a, int __b);
+__device__ int __nv_vsetges4(int __a, int __b);
+__device__ int __nv_vsetgeu2(int __a, int __b);
+__device__ int __nv_vsetgeu4(int __a, int __b);
+__device__ int __nv_vsetgts2(int __a, int __b);
+__device__ int __nv_vsetgts4(int __a, int __b);
+__device__ int __nv_vsetgtu2(int __a, int __b);
+__device__ int __nv_vsetgtu4(int __a, int __b);
+__device__ int __nv_vsetles2(int __a, int __b);
+__device__ int __nv_vsetles4(int __a, int __b);
+__device__ int __nv_vsetleu2(int __a, int __b);
+__device__ int __nv_vsetleu4(int __a, int __b);
+__device__ int __nv_vsetlts2(int __a, int __b);
+__device__ int __nv_vsetlts4(int __a, int __b);
+__device__ int __nv_vsetltu2(int __a, int __b);
+__device__ int __nv_vsetltu4(int __a, int __b);
+__device__ int __nv_vsetne2(int __a, int __b);
+__device__ int __nv_vsetne4(int __a, int __b);
+__device__ int __nv_vsub2(int __a, int __b);
+__device__ int __nv_vsub4(int __a, int __b);
+__device__ int __nv_vsubss2(int __a, int __b);
+__device__ int __nv_vsubss4(int __a, int __b);
+__device__ int __nv_vsubus2(int __a, int __b);
+__device__ int __nv_vsubus4(int __a, int __b);
+#endif  // CUDA_VERSION
+__device__ double __nv_y0(double __a);
+__device__ float __nv_y0f(float __a);
+__device__ double __nv_y1(double __a);
+__device__ float __nv_y1f(float __a);
+__device__ float __nv_ynf(int __a, float __b);
+__device__ double __nv_yn(int __a, double __b);
+} // extern "C"
+#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__
--- a/c_headers/__clang_cuda_runtime_wrapper.h
+++ b/c_headers/__clang_cuda_runtime_wrapper.h
@ -62,7 +62,7 @@
 #include "cuda.h"
 #if !defined(CUDA_VERSION)
 #error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9020
 #error "Unsupported CUDA version!"
 #endif

@ -84,6 +84,9 @@
 #define __DEVICE_FUNCTIONS_H__
 #define __MATH_FUNCTIONS_H__
 #define __COMMON_FUNCTIONS_H__
+// device_functions_decls is replaced by __clang_cuda_device_functions.h
+// included below.
+#define __DEVICE_FUNCTIONS_DECLS_H__

 #undef __CUDACC__
 #if CUDA_VERSION < 9000
@ -97,11 +100,17 @@
 #include "host_config.h"
 #include "host_defines.h"

+// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
+// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
+// functional equivalent of what we need.
+#pragma push_macro("nv_weak")
+#define nv_weak weak
 #undef __CUDABE__
 #undef __CUDA_LIBDEVICE__
 #define __CUDACC__
 #include "cuda_runtime.h"

+#pragma pop_macro("nv_weak")
 #undef __CUDACC__
 #define __CUDABE__

@ -137,20 +146,22 @@ inline __host__ double __signbitd(double x) {
 }
 #endif

-// We need decls for functions in CUDA's libdevice with __device__
-// attribute only. Alas they come either as __host__ __device__ or
-// with no attributes at all. To work around that, define __CUDA_RTC__
-// which produces HD variant and undef __host__ which gives us desided
-// decls with __device__ attribute.
-#pragma push_macro("__host__")
-#define __host__
-#define __CUDACC_RTC__
-#include "device_functions_decls.h"
-#undef __CUDACC_RTC__
+// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
+// to provide our own.
+#include <__clang_cuda_libdevice_declares.h>

-// Temporarily poison __host__ macro to ensure it's not used by any of
-// the headers we're about to include.
-#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+// Wrappers for many device-side standard library functions became compiler
+// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now
+// provides its own implementation of the wrappers.
+#if CUDA_VERSION >= 9000
+#include <__clang_cuda_device_functions.h>
+#endif
+
+// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
+// counterpart does not do it, so we need to make it empty here to keep
+// following CUDA includes happy.
+#undef __THROW
+#define __THROW

 // CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
 // Previous versions used to check whether they are defined or not.
@ -167,24 +178,20 @@ inline __host__ double __signbitd(double x) {
 #endif
 #endif

+// Temporarily poison __host__ macro to ensure it's not used by any of
+// the headers we're about to include.
+#pragma push_macro("__host__")
+#define __host__ UNEXPECTED_HOST_ATTRIBUTE
+
 // device_functions.hpp and math_functions*.hpp use 'static
 // __forceinline__' (with no __device__) for definitions of device
 // functions. Temporarily redefine __forceinline__ to include
 // __device__.
 #pragma push_macro("__forceinline__")
 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
-
-#pragma push_macro("__float2half_rn")
-#if CUDA_VERSION >= 9000
-// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
-// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
-// device_functions.hpp out of the way.
-#define __float2half_rn  __float2half_rn_disabled
-#endif
-
+#if CUDA_VERSION < 9000
 #include "device_functions.hpp"
-#pragma pop_macro("__float2half_rn")
-
+#endif

 // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
@ -196,17 +203,32 @@ inline __host__ double __signbitd(double x) {
 #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
 #define __USE_FAST_MATH__ 1
 #endif
+
+#if CUDA_VERSION >= 9000
+// CUDA-9.2 needs host-side memcpy for some host functions in
+// device_functions.hpp
+#if CUDA_VERSION >= 9020
+#include <string.h>
+#endif
+#include "crt/math_functions.hpp"
+#else
 #include "math_functions.hpp"
+#endif
+
 #pragma pop_macro("__USE_FAST_MATH__")

+#if CUDA_VERSION < 9000
 #include "math_functions_dbl_ptx3.hpp"
+#endif
 #pragma pop_macro("__forceinline__")

 // Pull in host-only functions that are only available when neither
 // __CUDACC__ nor __CUDABE__ are defined.
 #undef __MATH_FUNCTIONS_HPP__
 #undef __CUDABE__
+#if CUDA_VERSION < 9000
 #include "math_functions.hpp"
+#endif
 // Alas, additional overloads for these functions are hard to get to.
 // Considering that we only need these overloads for a few functions,
 // we can provide them here.
@ -222,22 +244,36 @@ static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
 static inline float normcdf(float __a) { return normcdff(__a); }
 static inline float erfcx(float __a) { return erfcxf(__a); }

+#if CUDA_VERSION < 9000
 // For some reason single-argument variant is not always declared by
 // CUDA headers. Alas, device_functions.hpp included below needs it.
 static inline __device__ void __brkpt(int __c) { __brkpt(); }
+#endif

 // Now include *.hpp with definitions of various GPU functions.  Alas,
 // a lot of thins get declared/defined with __host__ attribute which
 // we don't want and we have to define it out. We also have to include
 // {device,math}_functions.hpp again in order to extract the other
 // branch of #if/else inside.
-
 #define __host__
 #undef __CUDABE__
 #define __CUDACC__
+#if CUDA_VERSION >= 9000
+// Some atomic functions became compiler builtins in CUDA-9 , so we need their
+// declarations.
+#include "device_atomic_functions.h"
+#endif
 #undef __DEVICE_FUNCTIONS_HPP__
 #include "device_atomic_functions.hpp"
+#if CUDA_VERSION >= 9000
+#include "crt/device_functions.hpp"
+#include "crt/device_double_functions.hpp"
+#else
 #include "device_functions.hpp"
+#define __CUDABE__
+#include "device_double_functions.h"
+#undef __CUDABE__
+#endif
 #include "sm_20_atomic_functions.hpp"
 #include "sm_20_intrinsics.hpp"
 #include "sm_32_atomic_functions.hpp"
@ -251,8 +287,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 // reason about our code.

 #if CUDA_VERSION >= 8000
+#pragma push_macro("__CUDA_ARCH__")
+#undef __CUDA_ARCH__
 #include "sm_60_atomic_functions.hpp"
 #include "sm_61_intrinsics.hpp"
+#pragma pop_macro("__CUDA_ARCH__")
 #endif

 #undef __MATH_FUNCTIONS_HPP__
@ -279,7 +318,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 #endif
 #endif

+#if CUDA_VERSION >= 9000
+#include "crt/math_functions.hpp"
+#else
 #include "math_functions.hpp"
+#endif
 #pragma pop_macro("_GLIBCXX_MATH_H")
 #pragma pop_macro("_LIBCPP_VERSION")
 #pragma pop_macro("__GNUC__")
--- a/c_headers/__wmmintrin_aes.h
+++ b/c_headers/__wmmintrin_aes.h
@ -20,15 +20,18 @@
 *
 *===-----------------------------------------------------------------------===
 */
-#ifndef _WMMINTRIN_AES_H
-#define _WMMINTRIN_AES_H

-#include <emmintrin.h>
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_AES_H
+#define __WMMINTRIN_AES_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))

-/// \brief Performs a single round of AES encryption using the Equivalent
+/// Performs a single round of AES encryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
@ -48,7 +51,7 @@ _mm_aesenc_si128(__m128i __V, __m128i __R)
  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
 }

-/// \brief Performs the final round of AES encryption using the Equivalent
+/// Performs the final round of AES encryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
@ -68,7 +71,7 @@ _mm_aesenclast_si128(__m128i __V, __m128i __R)
  return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
 }

-/// \brief Performs a single round of AES decryption using the Equivalent
+/// Performs a single round of AES decryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
@ -88,7 +91,7 @@ _mm_aesdec_si128(__m128i __V, __m128i __R)
  return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
 }

-/// \brief Performs the final round of AES decryption using the Equivalent
+/// Performs the final round of AES decryption using the Equivalent
 ///    Inverse Cipher, transforming the state value from the first source
 ///    operand using a 128-bit round key value contained in the second source
 ///    operand, and writes the result to the destination.
@ -108,7 +111,7 @@ _mm_aesdeclast_si128(__m128i __V, __m128i __R)
  return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
 }

-/// \brief Applies the AES InvMixColumns() transformation to an expanded key
+/// Applies the AES InvMixColumns() transformation to an expanded key
 ///    contained in the source operand, and writes the result to the
 ///    destination.
 ///
@ -125,7 +128,7 @@ _mm_aesimc_si128(__m128i __V)
  return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
 }

-/// \brief Generates a round key for AES encyption, operating on 128-bit data
+/// Generates a round key for AES encryption, operating on 128-bit data
 ///    specified in the first source operand and using an 8-bit round constant
 ///    specified by the second source operand, and writes the result to the
 ///    destination.
@ -148,4 +151,4 @@ _mm_aesimc_si128(__m128i __V)

 #undef __DEFAULT_FN_ATTRS

-#endif  /* _WMMINTRIN_AES_H */
+#endif  /* __WMMINTRIN_AES_H */
--- a/c_headers/__wmmintrin_pclmul.h
+++ b/c_headers/__wmmintrin_pclmul.h
@ -20,10 +20,15 @@
 *
 *===-----------------------------------------------------------------------===
 */
-#ifndef _WMMINTRIN_PCLMUL_H
-#define _WMMINTRIN_PCLMUL_H

-/// \brief Multiplies two 64-bit integer values, which are selected from source
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_PCLMUL_H
+#define __WMMINTRIN_PCLMUL_H
+
+/// Multiplies two 64-bit integer values, which are selected from source
 ///    operands using the immediate-value operand. The multiplication is a
 ///    carry-less multiplication, and the 128-bit integer product is stored in
 ///    the destination.
@ -50,8 +55,8 @@
 ///    Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
 /// \returns The 128-bit integer vector containing the result of the carry-less
 ///    multiplication of the selected 64-bit values.
-#define _mm_clmulepi64_si128(__X, __Y, __I) \
-  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
-                                        (__v2di)(__m128i)(__Y), (char)(__I)))
+#define _mm_clmulepi64_si128(X, Y, I) \
+  ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
+                                        (__v2di)(__m128i)(Y), (char)(I)))

-#endif /* _WMMINTRIN_PCLMUL_H */
+#endif /* __WMMINTRIN_PCLMUL_H */
--- a/c_headers/ammintrin.h
+++ b/c_headers/ammintrin.h
@ -27,9 +27,9 @@
 #include <pmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))

-/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+/// Extracts the specified bits from the lower 64 bits of the 128-bit
 ///    integer vector operand at the index \a idx and of the length \a len.
 ///
 /// \headerfile <x86intrin.h>
@ -57,7 +57,7 @@
  ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
                                  (char)(len), (char)(idx)))

-/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
+/// Extracts the specified bits from the lower 64 bits of the 128-bit
 ///    integer vector operand at the index and of the length specified by
 ///    \a __y.
 ///
@ -82,7 +82,7 @@ _mm_extract_si64(__m128i __x, __m128i __y)
  return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
 }

-/// \brief Inserts bits of a specified length from the source integer vector
+/// Inserts bits of a specified length from the source integer vector
 ///    \a y into the lower 64 bits of the destination integer vector \a x at
 ///    the index \a idx and of the length \a len.
 ///
@ -120,7 +120,7 @@ _mm_extract_si64(__m128i __x, __m128i __y)
                                    (__v2di)(__m128i)(y), \
                                    (char)(len), (char)(idx)))

-/// \brief Inserts bits of a specified length from the source integer vector
+/// Inserts bits of a specified length from the source integer vector
 ///    \a __y into the lower 64 bits of the destination integer vector \a __x
 ///    at the index and of the length specified by \a __y.
 ///
@ -152,7 +152,7 @@ _mm_insert_si64(__m128i __x, __m128i __y)
  return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
 }

-/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
+/// Stores a 64-bit double-precision value in a 64-bit memory location.
 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
@ -170,7 +170,7 @@ _mm_stream_sd(double *__p, __m128d __a)
  __builtin_ia32_movntsd(__p, (__v2df)__a);
 }

-/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
+/// Stores a 32-bit single-precision floating-point value in a 32-bit
 ///    memory location. To minimize caching, the data is flagged as
 ///    non-temporal (unlikely to be used again soon).
 ///
--- a/c_headers/arm_fp16.h
+++ b/c_headers/arm_fp16.h
--- a/c_headers/arm_neon.h
+++ b/c_headers/arm_neon.h
--- a/c_headers/avx2intrin.h
+++ b/c_headers/avx2intrin.h
--- a/c_headers/avx512bitalgintrin.h
+++ b/c_headers/avx512bitalgintrin.h
@ -29,7 +29,7 @@
 #define __AVX512BITALGINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_popcnt_epi16(__m512i __A)
@ -48,7 +48,7 @@ _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
 {
-  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
+  return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
              __U,
              __B);
 }
@ -70,7 +70,7 @@ _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
 {
-  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
+  return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
              __U,
              __B);
 }
--- a/c_headers/avx512bwintrin.h
+++ b/c_headers/avx512bwintrin.h
--- a/c_headers/avx512cdintrin.h
+++ b/c_headers/avx512cdintrin.h
@ -29,7 +29,7 @@
 #define __AVX512CDINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_conflict_epi64 (__m512i __A)
@ -82,49 +82,45 @@ _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_lzcnt_epi32 (__m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
-             (__v16si) _mm512_setzero_si512 (),
-             (__mmask16) -1);
+  return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
-                 (__v16si) __W,
-                 (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_lzcnt_epi32(__A),
+                                             (__v16si)__W);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
-             (__v16si) _mm512_setzero_si512 (),
-             (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+                                             (__v16si)_mm512_lzcnt_epi32(__A),
+                                             (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_lzcnt_epi64 (__m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
-             (__v8di) _mm512_setzero_si512 (),
-             (__mmask8) -1);
+  return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
-                 (__v8di) __W,
-                 (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_lzcnt_epi64(__A),
+                                             (__v8di)__W);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
 {
-  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
-             (__v8di) _mm512_setzero_si512 (),
-             (__mmask8) __U);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+                                             (__v8di)_mm512_lzcnt_epi64(__A),
+                                             (__v8di)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
--- a/c_headers/avx512dqintrin.h
+++ b/c_headers/avx512dqintrin.h
--- a/c_headers/avx512erintrin.h
+++ b/c_headers/avx512erintrin.h
@ -27,21 +27,21 @@
 #ifndef __AVX512ERINTRIN_H
 #define __AVX512ERINTRIN_H

-// exp2a23
-#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
+/* exp2a23 */
+#define _mm512_exp2a23_round_pd(A, R) \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
-                                      (__mmask8)-1, (int)(R)); })
+                                      (__mmask8)-1, (int)(R))

-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                      (int)(R)); })
+                                      (int)(R))

-#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
-                                      (__mmask8)(M), (int)(R)); })
+                                      (__mmask8)(M), (int)(R))

 #define _mm512_exp2a23_pd(A) \
  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -52,20 +52,20 @@
 #define _mm512_maskz_exp2a23_pd(M, A) \
  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

-#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
+#define _mm512_exp2a23_round_ps(A, R) \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
-                                     (__mmask16)-1, (int)(R)); })
+                                     (__mmask16)-1, (int)(R))

-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                     (int)(R)); })
+                                     (int)(R))

-#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
-                                     (__mmask16)(M), (int)(R)); })
+                                     (__mmask16)(M), (int)(R))

 #define _mm512_exp2a23_ps(A) \
  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -76,21 +76,21 @@
 #define _mm512_maskz_exp2a23_ps(M, A) \
  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

-// rsqrt28
-#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
+/* rsqrt28 */
+#define _mm512_rsqrt28_round_pd(A, R) \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)-1, (int)(R)); })
+                                         (__mmask8)-1, (int)(R))

-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                         (int)(R)); })
+                                         (int)(R))

-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
-                                         (__mmask8)(M), (int)(R)); })
+                                         (__mmask8)(M), (int)(R))

 #define _mm512_rsqrt28_pd(A) \
  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -101,20 +101,20 @@
 #define _mm512_maskz_rsqrt28_pd(M, A) \
  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

-#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
+#define _mm512_rsqrt28_round_ps(A, R) \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)-1, (int)(R)); })
+                                        (__mmask16)-1, (int)(R))

-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                        (int)(R)); })
+                                        (int)(R))

-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
-                                        (__mmask16)(M), (int)(R)); })
+                                        (__mmask16)(M), (int)(R))

 #define _mm512_rsqrt28_ps(A) \
  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -125,23 +125,23 @@
 #define _mm512_maskz_rsqrt28_ps(M, A) \
  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

-#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
+#define _mm_rsqrt28_round_ss(A, B, R) \
  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)-1, (int)(R)); })
+                                              (__mmask8)-1, (int)(R))

-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)(__m128)(S), \
-                                              (__mmask8)(M), (int)(R)); })
+                                              (__mmask8)(M), (int)(R))

-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
  (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
-                                              (__mmask8)(M), (int)(R)); })
+                                              (__mmask8)(M), (int)(R))

 #define _mm_rsqrt28_ss(A, B) \
  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -152,23 +152,23 @@
 #define _mm_maskz_rsqrt28_ss(M, A, B) \
  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

-#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
+#define _mm_rsqrt28_round_sd(A, B, R) \
  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)-1, (int)(R)); })
+                                               (__mmask8)-1, (int)(R))

-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)(__m128d)(S), \
-                                               (__mmask8)(M), (int)(R)); })
+                                               (__mmask8)(M), (int)(R))

-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
  (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
-                                               (__mmask8)(M), (int)(R)); })
+                                               (__mmask8)(M), (int)(R))

 #define _mm_rsqrt28_sd(A, B) \
  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -179,21 +179,21 @@
 #define _mm_maskz_rsqrt28_sd(M, A, B) \
  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

-// rcp28
-#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
+/* rcp28 */
+#define _mm512_rcp28_round_pd(A, R) \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)-1, (int)(R)); })
+                                       (__mmask8)-1, (int)(R))

-#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)(__m512d)(S), (__mmask8)(M), \
-                                       (int)(R)); })
+                                       (int)(R))

-#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rcp28_round_pd(M, A, R) \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
-                                       (__mmask8)(M), (int)(R)); })
+                                       (__mmask8)(M), (int)(R))

 #define _mm512_rcp28_pd(A) \
  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -204,20 +204,20 @@
 #define _mm512_maskz_rcp28_pd(M, A) \
  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

-#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
+#define _mm512_rcp28_round_ps(A, R) \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)-1, (int)(R)); })
+                                      (__mmask16)-1, (int)(R))

-#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)(__m512)(S), (__mmask16)(M), \
-                                      (int)(R)); })
+                                      (int)(R))

-#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
+#define _mm512_maskz_rcp28_round_ps(M, A, R) \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
-                                      (__mmask16)(M), (int)(R)); })
+                                      (__mmask16)(M), (int)(R))

 #define _mm512_rcp28_ps(A) \
  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -228,23 +228,23 @@
 #define _mm512_maskz_rcp28_ps(M, A) \
  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

-#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
+#define _mm_rcp28_round_ss(A, B, R) \
  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)-1, (int)(R)); })
+                                            (__mmask8)-1, (int)(R))

-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)(__m128)(S), \
-                                            (__mmask8)(M), (int)(R)); })
+                                            (__mmask8)(M), (int)(R))

-#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
  (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
-                                            (__mmask8)(M), (int)(R)); })
+                                            (__mmask8)(M), (int)(R))

 #define _mm_rcp28_ss(A, B) \
  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -255,23 +255,23 @@
 #define _mm_maskz_rcp28_ss(M, A, B) \
  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

-#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
+#define _mm_rcp28_round_sd(A, B, R) \
  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)-1, (int)(R)); })
+                                             (__mmask8)-1, (int)(R))

-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)(__m128d)(S), \
-                                             (__mmask8)(M), (int)(R)); })
+                                             (__mmask8)(M), (int)(R))

-#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
  (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
-                                             (__mmask8)(M), (int)(R)); })
+                                             (__mmask8)(M), (int)(R))

 #define _mm_rcp28_sd(A, B) \
  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -282,4 +282,4 @@
 #define _mm_maskz_rcp28_sd(M, A, B) \
  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

-#endif // __AVX512ERINTRIN_H
+#endif /* __AVX512ERINTRIN_H */
--- a/c_headers/avx512fintrin.h
+++ b/c_headers/avx512fintrin.h
--- a/c_headers/avx512ifmaintrin.h
+++ b/c_headers/avx512ifmaintrin.h
@ -29,62 +29,52 @@
 #define __IFMAINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
-                   (__v8di) __Y,
-                   (__v8di) __Z,
-                   (__mmask8) -1);
+  return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
+                                                (__v8di) __Z);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
-          __m512i __Y)
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
-                   (__v8di) __X,
-                   (__v8di) __Y,
-                   (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
+                                   (__v8di)__W);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
-              (__v8di) __Y,
-              (__v8di) __Z,
-              (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
+                                   (__v8di)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
-                   (__v8di) __Y,
-                   (__v8di) __Z,
-                   (__mmask8) -1);
+  return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
+                                                (__v8di) __Z);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
-          __m512i __Y)
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
 {
-  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
-                   (__v8di) __X,
-                   (__v8di) __Y,
-                   (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
+                                   (__v8di)__W);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
 {
-  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
-              (__v8di) __Y,
-              (__v8di) __Z,
-              (__mmask8) __M);
+  return (__m512i)__builtin_ia32_selectq_512(__M,
+                                   (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
+                                   (__v8di)_mm512_setzero_si512());
 }

 #undef __DEFAULT_FN_ATTRS
--- a/c_headers/avx512ifmavlintrin.h
+++ b/c_headers/avx512ifmavlintrin.h
@ -29,121 +29,105 @@
 #define __IFMAVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))



-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
-                   (__v2di) __Y,
-                   (__v2di) __Z,
-                   (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
+                                                (__v2di) __Z);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
-                   (__v2di) __X,
-                   (__v2di) __Y,
-                   (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
+                                      (__v2di)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di) __Z,
-              (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
+                                      (__v2di)_mm_setzero_si128());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
-                   (__v4di) __Y,
-                   (__v4di) __Z,
-                   (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+                                                (__v4di)__Z);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
-          __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
-                   (__v4di) __X,
-                   (__v4di) __Y,
-                   (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
+                                   (__v4di)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di) __Z,
-              (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
+                                   (__v4di)_mm256_setzero_si256());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
-                   (__v2di) __Y,
-                   (__v2di) __Z,
-                   (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+                                                (__v2di)__Z);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
-                   (__v2di) __X,
-                   (__v2di) __Y,
-                   (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
+                                      (__v2di)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
 {
-  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
-              (__v2di) __Y,
-              (__v2di) __Z,
-              (__mmask8) __M);
+  return (__m128i)__builtin_ia32_selectq_128(__M,
+                                      (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
+                                      (__v2di)_mm_setzero_si128());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
-                   (__v4di) __Y,
-                   (__v4di) __Z,
-                   (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+                                                (__v4di)__Z);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
-          __m256i __Y)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
 {
-  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
-                   (__v4di) __X,
-                   (__v4di) __Y,
-                   (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
+                                   (__v4di)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
 {
-  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
-              (__v4di) __Y,
-              (__v4di) __Z,
-              (__mmask8) __M);
+  return (__m256i)__builtin_ia32_selectq_256(__M,
+                                   (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
+                                   (__v4di)_mm256_setzero_si256());
 }


-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif
--- a/c_headers/avx512pfintrin.h
+++ b/c_headers/avx512pfintrin.h
@ -1,4 +1,4 @@
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -31,80 +31,80 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))

-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                             (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
-              
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
+                             (int)(hint))
+
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
  __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
                             (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
+                             (int)(hint))

-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfdps((__mmask16)(mask), \
                             (__v16si)(__m512i)(index), (int const *)(addr), \
-                             (int)(scale), (int)(hint)); })
+                             (int)(scale), (int)(hint))

-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
  __builtin_ia32_gatherpfdps((__mmask16) -1, \
                             (__v16si)(__m512i)(index), (int const *)(addr), \
-                             (int)(scale), (int)(hint)); })
+                             (int)(scale), (int)(hint))

-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
                             (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
+                             (int)(hint))

-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
  __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
                             (long long const *)(addr), (int)(scale), \
-                             (int)(hint)); })
-              
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
+                             (int)(hint))
+
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
  __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                             (int const *)(addr), (int)(scale), (int)(hint)); })
+                             (int const *)(addr), (int)(scale), (int)(hint))

-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
  __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
-                             (int const *)(addr), (int)(scale), (int)(hint)); })
+                             (int const *)(addr), (int)(scale), (int)(hint))

-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
  __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
                              (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))

-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
                              (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))

-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
  __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
-                              (int *)(addr), (int)(scale), (int)(hint)); })
+                              (int *)(addr), (int)(scale), (int)(hint))

-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfdps((__mmask16)(mask), \
                              (__v16si)(__m512i)(index), (int *)(addr), \
-                              (int)(scale), (int)(hint)); })
+                              (int)(scale), (int)(hint))

-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
  __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
                              (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))

-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
                              (long long *)(addr), (int)(scale), \
-                              (int)(hint)); })
+                              (int)(hint))

-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
  __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
-                              (int *)(addr), (int)(scale), (int)(hint)); })
+                              (int *)(addr), (int)(scale), (int)(hint))

-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
  __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
-                              (int *)(addr), (int)(scale), (int)(hint)); })
+                              (int *)(addr), (int)(scale), (int)(hint))

 #undef __DEFAULT_FN_ATTRS

--- a/c_headers/avx512vbmi2intrin.h
+++ b/c_headers/avx512vbmi2intrin.h
@ -29,7 +29,7 @@
 #define __AVX512VBMI2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))


 static __inline__ __m512i __DEFAULT_FN_ATTRS
@ -44,7 +44,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
 {
  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
              __U);
 }

@ -60,7 +60,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
 {
  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
              __U);
 }

@ -90,7 +90,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
 {
  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
              __U);
 }

@ -106,7 +106,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
 {
  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
              __U);
 }

@ -122,7 +122,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
 {
  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
-              (__v32hi) _mm512_setzero_hi(),
+              (__v32hi) _mm512_setzero_si512(),
              __U);
 }

@ -138,87 +138,93 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
 {
  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
-              (__v64qi) _mm512_setzero_qi(),
+              (__v64qi) _mm512_setzero_si512(),
              __U);
 }

-#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
-                                          (__v8di)(B), \
-                                          (int)(I), \
-                                          (__v8di)(S), \
-                                          (__mmask8)(U)); })
+#define _mm512_shldi_epi64(A, B, I) \
+  (__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
+                                     (__v8di)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
+                                    (__v8di)(__m512i)(S))

 #define _mm512_maskz_shldi_epi64(U, A, B, I) \
-  _mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
-
-#define _mm512_shldi_epi64(A, B, I) \
-  _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
-
-#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
-                                          (__v16si)(B), \
-                                          (int)(I), \
-                                          (__v16si)(S), \
-                                          (__mmask16)(U)); })
-
-#define _mm512_maskz_shldi_epi32(U, A, B, I) \
-  _mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
+                                    (__v8di)_mm512_setzero_si512())

 #define _mm512_shldi_epi32(A, B, I) \
-  _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
+                                     (__v16si)(__m512i)(B), (int)(I))

-#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
-                                          (__v32hi)(B), \
-                                          (int)(I), \
-                                          (__v32hi)(S), \
-                                          (__mmask32)(U)); })
+#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
+                                   (__v16si)(__m512i)(S))

-#define _mm512_maskz_shldi_epi16(U, A, B, I) \
-  _mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
+#define _mm512_maskz_shldi_epi32(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
+                                   (__v16si)_mm512_setzero_si512())

 #define _mm512_shldi_epi16(A, B, I) \
-  _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
+                                     (__v32hi)(__m512i)(B), (int)(I))

-#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
-                                          (__v8di)(B), \
-                                          (int)(I), \
-                                          (__v8di)(S), \
-                                          (__mmask8)(U)); })
+#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
+                                   (__v32hi)(__m512i)(S))

-#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
-  _mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
+#define _mm512_maskz_shldi_epi16(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
+                                   (__v32hi)_mm512_setzero_si512())

 #define _mm512_shrdi_epi64(A, B, I) \
-  _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
+                                     (__v8di)(__m512i)(B), (int)(I))

-#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
-                                          (__v16si)(B), \
-                                          (int)(I), \
-                                          (__v16si)(S), \
-                                          (__mmask16)(U)); })
+#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
+                                    (__v8di)(__m512i)(S))

-#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
-  _mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
+#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+                                    (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
+                                    (__v8di)_mm512_setzero_si512())

 #define _mm512_shrdi_epi32(A, B, I) \
-  _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
+                                     (__v16si)(__m512i)(B), (int)(I))

-#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
-  (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
-                                          (__v32hi)(B), \
-                                          (int)(I), \
-                                          (__v32hi)(S), \
-                                          (__mmask32)(U)); })
+#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
+                                   (__v16si)(__m512i)(S))

-#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
-  _mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
+#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+                                   (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
+                                   (__v16si)_mm512_setzero_si512())

 #define _mm512_shrdi_epi16(A, B, I) \
-  _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
+  (__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
+                                     (__v32hi)(__m512i)(B), (int)(I))
+
+#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
+                                   (__v32hi)(__m512i)(S))
+
+#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
+  (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+                                   (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
+                                   (__v32hi)_mm512_setzero_si512())

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)
--- a/c_headers/avx512vbmiintrin.h
+++ b/c_headers/avx512vbmiintrin.h
@ -29,79 +29,65 @@
 #define __VBMIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))


 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
-         __mmask64 __U, __m512i __B)
+_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
-              (__v64qi) __I
-              /* idx */ ,
-              (__v64qi) __B,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
+                                                 (__v64qi) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
+                              __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
-              /* idx */ ,
-              (__v64qi) __A,
-              (__v64qi) __B,
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_selectb_512(__U,
+                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+                               (__v64qi)__A);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
-        __m512i __I, __m512i __B)
+_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
+                               __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
-              /* idx */ ,
-              (__v64qi) __A,
-              (__v64qi) __B,
-              (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512(__U,
+                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+                               (__v64qi)__I);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
-         __m512i __I, __m512i __B)
+_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
+                               __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
-               /* idx */ ,
-               (__v64qi) __A,
-               (__v64qi) __B,
-               (__mmask64) __U);
+  return (__m512i)__builtin_ia32_selectb_512(__U,
+                               (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+                               (__v64qi)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
-                 (__v64qi) __A,
-                 (__v64qi) _mm512_undefined_epi32 (),
-                 (__mmask64) -1);
+  return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
        __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
-                 (__v64qi) __A,
-                 (__v64qi) _mm512_setzero_si512(),
-                 (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                     (__v64qi)_mm512_permutexvar_epi8(__A, __B),
+                                     (__v64qi)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
             __m512i __B)
 {
-  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
-                 (__v64qi) __A,
-                 (__v64qi) __W,
-                 (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                     (__v64qi)_mm512_permutexvar_epi8(__A, __B),
+                                     (__v64qi)__W);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
--- a/c_headers/avx512vbmivlintrin.h
+++ b/c_headers/avx512vbmivlintrin.h
@ -29,161 +29,127 @@
 #define __VBMIVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))


-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
-            __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
-              (__v16qi) __I
-              /* idx */ ,
-              (__v16qi) __B,
-              (__mmask16)
-              __U);
+  return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
+                                                 (__v16qi)__I,
+                                                 (__v16qi)__B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
-         __mmask32 __U, __m256i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
+                           __m128i __B)
 {
-  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
-              (__v32qi) __I
-              /* idx */ ,
-              (__v32qi) __B,
-              (__mmask32)
-              __U);
+  return (__m128i)__builtin_ia32_selectb_128(__U,
+                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+                                  (__v16qi)__A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
+                            __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
-              /* idx */ ,
-              (__v16qi) __A,
-              (__v16qi) __B,
-              (__mmask16) -
-              1);
+  return (__m128i)__builtin_ia32_selectb_128(__U,
+                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+                                  (__v16qi)__I);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
-           __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
+                            __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
-              /* idx */ ,
-              (__v16qi) __A,
-              (__v16qi) __B,
-              (__mmask16)
-              __U);
+  return (__m128i)__builtin_ia32_selectb_128(__U,
+                                  (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+                                  (__v16qi)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
-            __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
-               /* idx */ ,
-               (__v16qi) __A,
-               (__v16qi) __B,
-               (__mmask16)
-               __U);
+  return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
+                                                 (__v32qi)__B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
+                              __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
-              /* idx */ ,
-              (__v32qi) __A,
-              (__v32qi) __B,
-              (__mmask32) -
-              1);
+  return (__m256i)__builtin_ia32_selectb_256(__U,
+                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+                               (__v32qi)__A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
-        __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
+                               __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
-              /* idx */ ,
-              (__v32qi) __A,
-              (__v32qi) __B,
-              (__mmask32)
-              __U);
+  return (__m256i)__builtin_ia32_selectb_256(__U,
+                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+                               (__v32qi)__I);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
-         __m256i __I, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
+                               __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
-               /* idx */ ,
-               (__v32qi) __A,
-               (__v32qi) __B,
-               (__mmask32)
-               __U);
+  return (__m256i)__builtin_ia32_selectb_256(__U,
+                               (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+                               (__v32qi)_mm256_setzero_si256());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_permutexvar_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
-                 (__v16qi) __A,
-                 (__v16qi) _mm_undefined_si128 (),
-                 (__mmask16) -1);
+  return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
-                 (__v16qi) __A,
-                 (__v16qi) _mm_setzero_si128 (),
-                 (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                        (__v16qi)_mm_permutexvar_epi8(__A, __B),
+                                        (__v16qi)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
          __m128i __B)
 {
-  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
-                 (__v16qi) __A,
-                 (__v16qi) __W,
-                 (__mmask16) __M);
+  return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+                                        (__v16qi)_mm_permutexvar_epi8(__A, __B),
+                                        (__v16qi)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
-                 (__v32qi) __A,
-                 (__v32qi) _mm256_undefined_si256 (),
-                 (__mmask32) -1);
+  return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
        __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
-                 (__v32qi) __A,
-                 (__v32qi) _mm256_setzero_si256 (),
-                 (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
+                                     (__v32qi)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
             __m256i __B)
 {
-  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
-                 (__v32qi) __A,
-                 (__v32qi) __W,
-                 (__mmask32) __M);
+  return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+                                     (__v32qi)_mm256_permutexvar_epi8(__A, __B),
+                                     (__v32qi)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
 {
  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@ -192,7 +158,7 @@ _mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i
                (__mmask16) __M);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
 {
  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@ -202,7 +168,7 @@ _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
                (__mmask16) __M);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
 {
  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@ -212,7 +178,7 @@ _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
                (__mmask16) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
 {
  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@ -221,7 +187,7 @@ _mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m2
                (__mmask32) __M);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
 {
  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@ -231,7 +197,7 @@ _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
                (__mmask32) __M);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
 {
  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@ -242,6 +208,7 @@ _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
 }


-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif
--- a/c_headers/avx512vlbitalgintrin.h
+++ b/c_headers/avx512vlbitalgintrin.h
@ -1,4 +1,4 @@
-/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
+/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -29,15 +29,16 @@
 #define __AVX512VLBITALGINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_popcnt_epi16(__m256i __A)
 {
  return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
 {
  return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
@ -45,7 +46,7 @@ _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
              (__v16hi) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
 {
  return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
@ -53,35 +54,35 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
              __B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_popcnt_epi16(__m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi16(__m128i __A)
 {
  return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
 {
  return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
-              (__v8hi) _mm128_popcnt_epi16(__B),
+              (__v8hi) _mm_popcnt_epi16(__B),
              (__v8hi) __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
 {
-  return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
+  return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
              __U,
              __B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_popcnt_epi8(__m256i __A)
 {
  return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
 {
  return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
@ -89,7 +90,7 @@ _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
              (__v32qi) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
 {
  return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
@ -97,61 +98,62 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
              __B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_popcnt_epi8(__m128i __A)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi8(__m128i __A)
 {
  return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
 {
  return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
-              (__v16qi) _mm128_popcnt_epi8(__B),
+              (__v16qi) _mm_popcnt_epi8(__B),
              (__v16qi) __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
 {
-  return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
+  return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
              __U,
              __B);
 }

-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
 {
  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
              (__v32qi) __B,
              __U);
 }

-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
 {
-  return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
+  return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
              __A,
              __B);
 }

-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
 {
  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
              (__v16qi) __B,
              __U);
 }

-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
 {
-  return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
+  return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
              __A,
              __B);
 }


-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif
--- a/c_headers/avx512vlbwintrin.h
+++ b/c_headers/avx512vlbwintrin.h
--- a/c_headers/avx512vlcdintrin.h
+++ b/c_headers/avx512vlcdintrin.h
@ -1,4 +1,4 @@
-/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@ -28,35 +28,36 @@
 #define __AVX512VLCDINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))


-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmb_epi64 (__mmask8 __A)
-{ 
+{
  return (__m128i) _mm_set1_epi64x((long long) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastmb_epi64 (__mmask8 __A)
 {
  return (__m256i) _mm256_set1_epi64x((long long)__A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_broadcastmw_epi32 (__mmask16 __A)
 {
  return (__m128i) _mm_set1_epi32((int)__A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastmw_epi32 (__mmask16 __A)
 {
  return (__m256i) _mm256_set1_epi32((int)__A);
 }


-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_conflict_epi64 (__m128i __A)
 {
  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
@ -64,7 +65,7 @@ _mm_conflict_epi64 (__m128i __A)
               (__mmask8) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
 {
  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
@ -72,16 +73,16 @@ _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
               (__mmask8) __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
 {
  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
               (__v2di)
-               _mm_setzero_di (),
+               _mm_setzero_si128 (),
               (__mmask8) __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_conflict_epi64 (__m256i __A)
 {
  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@ -89,7 +90,7 @@ _mm256_conflict_epi64 (__m256i __A)
               (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
 {
  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@ -97,7 +98,7 @@ _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
               (__mmask8) __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
 {
  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@ -105,7 +106,7 @@ _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
               (__mmask8) __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_conflict_epi32 (__m128i __A)
 {
  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@ -113,7 +114,7 @@ _mm_conflict_epi32 (__m128i __A)
               (__mmask8) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
 {
  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@ -121,7 +122,7 @@ _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
               (__mmask8) __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
 {
  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@ -129,7 +130,7 @@ _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
               (__mmask8) __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_conflict_epi32 (__m256i __A)
 {
  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@ -137,7 +138,7 @@ _mm256_conflict_epi32 (__m256i __A)
               (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
 {
  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@ -145,7 +146,7 @@ _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
               (__mmask8) __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
 {
  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@ -154,110 +155,95 @@ _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
               (__mmask8) __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_lzcnt_epi32 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-                 (__v4si) __W,
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_lzcnt_epi32(__A),
+                                             (__v4si)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
-                 (__v4si)
-                 _mm_setzero_si128 (),
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+                                             (__v4si)_mm_lzcnt_epi32(__A),
+                                             (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_lzcnt_epi32 (__m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-                 (__v8si) __W,
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_lzcnt_epi32(__A),
+                                             (__v8si)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
-                 (__v8si)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+                                             (__v8si)_mm256_lzcnt_epi32(__A),
+                                             (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_lzcnt_epi64 (__m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-                 (__v2di)
-                 _mm_setzero_di (),
-                 (__mmask8) -1);
+  return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-                 (__v2di) __W,
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_lzcnt_epi64(__A),
+                                             (__v2di)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
 {
-  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
-                 (__v2di)
-                 _mm_setzero_di (),
-                 (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+                                             (__v2di)_mm_lzcnt_epi64(__A),
+                                             (__v2di)_mm_setzero_si128());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_lzcnt_epi64 (__m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) -1);
+  return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-                 (__v4di) __W,
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_lzcnt_epi64(__A),
+                                             (__v4di)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
 {
-  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
-                 (__v4di)
-                 _mm256_setzero_si256 (),
-                 (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+                                             (__v4di)_mm256_lzcnt_epi64(__A),
+                                             (__v4di)_mm256_setzero_si256());
 }

-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif /* __AVX512VLCDINTRIN_H */
--- a/c_headers/avx512vldqintrin.h
+++ b/c_headers/avx512vldqintrin.h
--- a/c_headers/avx512vlintrin.h
+++ b/c_headers/avx512vlintrin.h
--- a/c_headers/avx512vlvbmi2intrin.h
+++ b/c_headers/avx512vlvbmi2intrin.h
@ -29,130 +29,120 @@
 #define __AVX512VLVBMI2INTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))

-static  __inline __m128i __DEFAULT_FN_ATTRS
-_mm128_setzero_hi(void) {
-  return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
              (__v8hi) __S,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
-              (__v8hi) _mm128_setzero_hi(),
+              (__v8hi) _mm_setzero_si128(),
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
              (__v16qi) __S,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
-              (__v16qi) _mm128_setzero_hi(),
+              (__v16qi) _mm_setzero_si128(),
              __U);
 }

-static __inline__ void __DEFAULT_FN_ATTRS
-_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
 {
  __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
              __U);
 }

-static __inline__ void __DEFAULT_FN_ATTRS
-_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
 {
  __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
              (__v8hi) __S,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
-              (__v8hi) _mm128_setzero_hi(),
+              (__v8hi) _mm_setzero_si128(),
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
              (__v16qi) __S,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
 {
  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
-              (__v16qi) _mm128_setzero_hi(),
+              (__v16qi) _mm_setzero_si128(),
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
 {
  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
              (__v8hi) __S,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
 {
  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
-              (__v8hi) _mm128_setzero_hi(),
+              (__v8hi) _mm_setzero_si128(),
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
 {
  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
              (__v16qi) __S,
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
 {
  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
-              (__v16qi) _mm128_setzero_hi(),
+              (__v16qi) _mm_setzero_si128(),
              __U);
 }

-static  __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setzero_hi(void) {
-  return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0 };
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
@ -160,15 +150,15 @@ _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
-              (__v16hi) _mm256_setzero_hi(),
+              (__v16hi) _mm256_setzero_si256(),
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
@ -176,29 +166,29 @@ _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
-              (__v32qi) _mm256_setzero_hi(),
+              (__v32qi) _mm256_setzero_si256(),
              __U);
 }

-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
 {
  __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
              __U);
 }

-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __DEFAULT_FN_ATTRS256
 _mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
 {
  __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
@ -206,15 +196,15 @@ _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
-              (__v16hi) _mm256_setzero_hi(),
+              (__v16hi) _mm256_setzero_si256(),
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
@ -222,15 +212,15 @@ _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
 {
  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
-              (__v32qi) _mm256_setzero_hi(),
+              (__v32qi) _mm256_setzero_si256(),
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
 {
  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
@ -238,15 +228,15 @@ _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
 {
  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
-              (__v16hi) _mm256_setzero_hi(),
+              (__v16hi) _mm256_setzero_si256(),
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
 {
  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
@ -254,171 +244,183 @@ _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
 {
  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
-              (__v32qi) _mm256_setzero_hi(),
+              (__v32qi) _mm256_setzero_si256(),
              __U);
 }

-#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
-                                          (__v4di)(B), \
-                                          (int)(I), \
-                                          (__v4di)(S), \
-                                          (__mmask8)(U)); })
+#define _mm256_shldi_epi64(A, B, I) \
+  (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
+                                     (__v4di)(__m256i)(B), (int)(I))
+
+#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
+                                    (__v4di)(__m256i)(S))

 #define _mm256_maskz_shldi_epi64(U, A, B, I) \
-  _mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
+                                    (__v4di)_mm256_setzero_si256())

-#define _mm256_shldi_epi64(A, B, I) \
-  _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+#define _mm_shldi_epi64(A, B, I) \
+  (__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
+                                     (__v2di)(__m128i)(B), (int)(I))

-#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
-  (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
-                                          (__v2di)(B), \
-                                          (int)(I), \
-                                          (__v2di)(S), \
-                                          (__mmask8)(U)); })
+#define _mm_mask_shldi_epi64(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shldi_epi64((A), (B), (I)), \
+                                      (__v2di)(__m128i)(S))

-#define _mm128_maskz_shldi_epi64(U, A, B, I) \
-  _mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
-
-#define _mm128_shldi_epi64(A, B, I) \
-  _mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
-
-#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
-                                          (__v8si)(B), \
-                                          (int)(I), \
-                                          (__v8si)(S), \
-                                          (__mmask8)(U)); })
-
-#define _mm256_maskz_shldi_epi32(U, A, B, I) \
-  _mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
+#define _mm_maskz_shldi_epi64(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shldi_epi64((A), (B), (I)), \
+                                      (__v2di)_mm_setzero_si128())

 #define _mm256_shldi_epi32(A, B, I) \
-  _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
+                                     (__v8si)(__m256i)(B), (int)(I))

-#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
-  (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
-                                          (__v4si)(B), \
-                                          (int)(I), \
-                                          (__v4si)(S), \
-                                          (__mmask8)(U)); })
+#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
+                                    (__v8si)(__m256i)(S))

-#define _mm128_maskz_shldi_epi32(U, A, B, I) \
-  _mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
+#define _mm256_maskz_shldi_epi32(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
+                                    (__v8si)_mm256_setzero_si256())

-#define _mm128_shldi_epi32(A, B, I) \
-  _mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+#define _mm_shldi_epi32(A, B, I) \
+  (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
+                                     (__v4si)(__m128i)(B), (int)(I))

-#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
-                                          (__v16hi)(B), \
-                                          (int)(I), \
-                                          (__v16hi)(S), \
-                                          (__mmask16)(U)); })
+#define _mm_mask_shldi_epi32(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shldi_epi32((A), (B), (I)), \
+                                      (__v4si)(__m128i)(S))

-#define _mm256_maskz_shldi_epi16(U, A, B, I) \
-  _mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
+#define _mm_maskz_shldi_epi32(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shldi_epi32((A), (B), (I)), \
+                                      (__v4si)_mm_setzero_si128())

 #define _mm256_shldi_epi16(A, B, I) \
-  _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
+                                     (__v16hi)(__m256i)(B), (int)(I))

-#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
-  (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
-                                          (__v8hi)(B), \
-                                          (int)(I), \
-                                          (__v8hi)(S), \
-                                          (__mmask8)(U)); })
+#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
+                                   (__v16hi)(__m256i)(S))

-#define _mm128_maskz_shldi_epi16(U, A, B, I) \
-  _mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
+#define _mm256_maskz_shldi_epi16(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
+                                   (__v16hi)_mm256_setzero_si256())

-#define _mm128_shldi_epi16(A, B, I) \
-  _mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+#define _mm_shldi_epi16(A, B, I) \
+  (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
+                                     (__v8hi)(__m128i)(B), (int)(I))

-#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
-                                          (__v4di)(B), \
-                                          (int)(I), \
-                                          (__v4di)(S), \
-                                          (__mmask8)(U)); })
+#define _mm_mask_shldi_epi16(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
+                                      (__v8hi)(__m128i)(S))

-#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
-  _mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
+#define _mm_maskz_shldi_epi16(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
+                                      (__v8hi)_mm_setzero_si128())

 #define _mm256_shrdi_epi64(A, B, I) \
-  _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
+                                     (__v4di)(__m256i)(B), (int)(I))

-#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
-  (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
-                                          (__v2di)(B), \
-                                          (int)(I), \
-                                          (__v2di)(S), \
-                                          (__mmask8)(U)); })
+#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
+                                    (__v4di)(__m256i)(S))

-#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
-  _mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
+#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+                                    (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
+                                    (__v4di)_mm256_setzero_si256())

-#define _mm128_shrdi_epi64(A, B, I) \
-  _mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+#define _mm_shrdi_epi64(A, B, I) \
+  (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
+                                     (__v2di)(__m128i)(B), (int)(I))

-#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
-                                          (__v8si)(B), \
-                                          (int)(I), \
-                                          (__v8si)(S), \
-                                          (__mmask8)(U)); })
+#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
+                                      (__v2di)(__m128i)(S))

-#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
-  _mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
+#define _mm_maskz_shrdi_epi64(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+                                      (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
+                                      (__v2di)_mm_setzero_si128())

 #define _mm256_shrdi_epi32(A, B, I) \
-  _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
+                                     (__v8si)(__m256i)(B), (int)(I))

-#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
-  (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
-                                          (__v4si)(B), \
-                                          (int)(I), \
-                                          (__v4si)(S), \
-                                          (__mmask8)(U)); })
+#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
+                                    (__v8si)(__m256i)(S))

-#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
-  _mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
+#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+                                    (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
+                                    (__v8si)_mm256_setzero_si256())

-#define _mm128_shrdi_epi32(A, B, I) \
-  _mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+#define _mm_shrdi_epi32(A, B, I) \
+  (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
+                                     (__v4si)(__m128i)(B), (int)(I))

-#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
-  (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
-                                          (__v16hi)(B), \
-                                          (int)(I), \
-                                          (__v16hi)(S), \
-                                          (__mmask16)(U)); })
+#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
+                                      (__v4si)(__m128i)(S))

-#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
-  _mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
+#define _mm_maskz_shrdi_epi32(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+                                      (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
+                                      (__v4si)_mm_setzero_si128())

 #define _mm256_shrdi_epi16(A, B, I) \
-  _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
+  (__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
+                                     (__v16hi)(__m256i)(B), (int)(I))

-#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
-  (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
-                                          (__v8hi)(B), \
-                                          (int)(I), \
-                                          (__v8hi)(S), \
-                                          (__mmask8)(U)); })
+#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
+                                   (__v16hi)(__m256i)(S))

-#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
-  _mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
+#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
+  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+                                   (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
+                                   (__v16hi)_mm256_setzero_si256())

-#define _mm128_shrdi_epi16(A, B, I) \
-  _mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
+#define _mm_shrdi_epi16(A, B, I) \
+  (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
+                                     (__v8hi)(__m128i)(B), (int)(I))

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
+                                      (__v8hi)(__m128i)(S))
+
+#define _mm_maskz_shrdi_epi16(U, A, B, I) \
+  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+                                      (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
+                                      (__v8hi)_mm_setzero_si128())
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
@ -427,7 +429,7 @@ _mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
@ -436,7 +438,7 @@ _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
@ -445,8 +447,8 @@ _mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
              (__mmask8) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
              (__v2di) __A,
@ -454,8 +456,8 @@ _mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
              (__v2di) __A,
@ -463,8 +465,8 @@ _mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
              (__v2di) __A,
@ -472,7 +474,7 @@ _mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
              (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
@ -481,7 +483,7 @@ _mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
@ -490,7 +492,7 @@ _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
@ -499,8 +501,8 @@ _mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
              (__mmask8) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
              (__v4si) __A,
@ -508,8 +510,8 @@ _mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
              (__v4si) __A,
@ -517,8 +519,8 @@ _mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
              (__v4si) __A,
@ -526,7 +528,7 @@ _mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
              (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
@ -535,7 +537,7 @@ _mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
@ -544,7 +546,7 @@ _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
@ -553,8 +555,8 @@ _mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
              (__mmask16) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
              (__v8hi) __A,
@ -562,8 +564,8 @@ _mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
              (__v8hi) __A,
@ -571,8 +573,8 @@ _mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
              (__v8hi) __A,
@ -580,7 +582,7 @@ _mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
              (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
@ -589,7 +591,7 @@ _mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
@ -598,7 +600,7 @@ _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
@ -607,8 +609,8 @@ _mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
              (__mmask8) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
              (__v2di) __A,
@ -616,8 +618,8 @@ _mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
              (__v2di) __A,
@ -625,8 +627,8 @@ _mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
              (__v2di) __A,
@ -634,7 +636,7 @@ _mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
              (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
@ -643,7 +645,7 @@ _mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
@ -652,7 +654,7 @@ _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
@ -661,8 +663,8 @@ _mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
              (__mmask8) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
              (__v4si) __A,
@ -670,8 +672,8 @@ _mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
              (__v4si) __A,
@ -679,8 +681,8 @@ _mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
              (__v4si) __A,
@ -688,7 +690,7 @@ _mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
              (__mmask8) -1);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
@ -697,7 +699,7 @@ _mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
@ -706,7 +708,7 @@ _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
              __U);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
@ -715,8 +717,8 @@ _mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
              (__mmask16) -1);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
              (__v8hi) __A,
@ -724,8 +726,8 @@ _mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
              (__v8hi) __A,
@ -733,8 +735,8 @@ _mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
              __U);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
              (__v8hi) __A,
@ -743,6 +745,7 @@ _mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
 }


-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif
--- a/c_headers/avx512vlvnniintrin.h
+++ b/c_headers/avx512vlvnniintrin.h
@ -29,226 +29,195 @@
 #define __AVX512VLVNNIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))


-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
-  return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
+                                             (__v8si)__B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
+                                     (__v8si)__S);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
+                                     (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
+                                              (__v8si)__B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                    (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
+                                    (__v8si)__S);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
+                                     (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
+                                             (__v8si)__B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
+                                     (__v8si)__S);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
+                                     (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
-              (__v8si) __A,
-              (__v8si) __B,
-              (__mmask8) -1);
+  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
+                                              (__v8si)__B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
+                                    (__v8si)__S);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m256i)__builtin_ia32_selectd_256(__U,
+                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
+                                    (__v8si)_mm256_setzero_si256());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
+                                             (__v4si)__B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
+                                        (__v4si)__S);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
+                                        (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
+                                              (__v4si)__B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
+                                       (__v4si)__S);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
+                                       (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
+                                             (__v4si)__B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
+                                        (__v4si)__S);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) __U);
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
+                                        (__v4si)_mm_setzero_si128());
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
-              (__v4si) __A,
-              (__v4si) __B,
-              (__mmask8) -1);
+  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
+                                              (__v4si)__B);
 }

+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
+                                       (__v4si)__S);
+}

-#undef __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_selectd_128(__U,
+                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
+                                       (__v4si)_mm_setzero_si128());
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif
--- a/c_headers/avx512vnniintrin.h
+++ b/c_headers/avx512vnniintrin.h
@ -29,117 +29,100 @@
 #define __AVX512VNNIINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))


+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
+                                             (__v16si)__B);
+}
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
+                                    (__v16si)__S);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) -1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
-  return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
+                                    (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
+                                              (__v16si)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
+                                   (__v16si)__S);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
+                                   (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
+                                             (__v16si)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
+                                    (__v16si)__S);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) __U);
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                    (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
+                                    (__v16si)_mm512_setzero_si512());
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
-              (__v16si) __A,
-              (__v16si) __B,
-              (__mmask16) -1);
+  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
+                                              (__v16si)__B);
 }

+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
+                                   (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_selectd_512(__U,
+                                   (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
+                                   (__v16si)_mm512_setzero_si512());
+}

 #undef __DEFAULT_FN_ATTRS

--- a/c_headers/avx512vpopcntdqintrin.h
+++ b/c_headers/avx512vpopcntdqintrin.h
@ -1,5 +1,4 @@
-/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
- *------------------===
+/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -32,8 +31,7 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd"   \
-                                                            "q")))
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))

 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
--- a/c_headers/avx512vpopcntdqvlintrin.h
+++ b/c_headers/avx512vpopcntdqvlintrin.h
@ -1,5 +1,4 @@
-/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
- *------------------===
+/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -31,69 +30,76 @@
 #define __AVX512VPOPCNTDQVLINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS                                                     \
-  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
+#define __DEFAULT_FN_ATTRS128                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256                                                  \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))

-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi64(__m128i __A) {
  return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
  return (__m128i)__builtin_ia32_selectq_128(
      (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
  return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi32(__m128i __A) {
  return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
  return (__m128i)__builtin_ia32_selectd_128(
      (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
  return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi64(__m256i __A) {
  return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
  return (__m256i)__builtin_ia32_selectq_256(
      (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
  return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi32(__m256i __A) {
  return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
  return (__m256i)__builtin_ia32_selectd_256(
      (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
  return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
 }

-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif
--- a/c_headers/avxintrin.h
+++ b/c_headers/avxintrin.h
--- a/c_headers/bmiintrin.h
+++ b/c_headers/bmiintrin.h
@ -49,7 +49,7 @@
   to use it as a potentially faster version of BSF. */
 #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))

-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -65,7 +65,7 @@ __tzcnt_u16(unsigned short __X)
  return __X ? __builtin_ctzs(__X) : 16;
 }

-/// \brief Performs a bitwise AND of the second operand with the one's
+/// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
@ -85,7 +85,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
 }

 /* AMD-specified, double-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
@ -100,6 +100,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
 ///    number of bits to be extracted.
 /// \returns An unsigned integer whose least significant bits contain the
 ///    extracted bits.
+/// \see _bextr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __bextr_u32(unsigned int __X, unsigned int __Y)
 {
@ -107,7 +108,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
 }

 /* Intel-specified, single-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
@ -124,13 +125,14 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
 ///    Bits [7:0] specify the number of bits.
 /// \returns An unsigned integer whose least significant bits contain the
 ///    extracted bits.
+/// \see __bextr_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
 {
  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }

-/// \brief Clears all bits in the source except for the least significant bit
+/// Clears all bits in the source except for the least significant bit
 ///    containing a value of 1 and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@ -147,7 +149,7 @@ __blsi_u32(unsigned int __X)
  return __X & -__X;
 }

-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
 ///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
@ -164,7 +166,7 @@ __blsmsk_u32(unsigned int __X)
  return __X ^ (__X - 1);
 }

-/// \brief Clears the least significant bit that is set to 1 in the source
+/// Clears the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@ -181,7 +183,7 @@ __blsr_u32(unsigned int __X)
  return __X & (__X - 1);
 }

-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -197,7 +199,7 @@ __tzcnt_u32(unsigned int __X)
  return __X ? __builtin_ctz(__X) : 32;
 }

-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -226,7 +228,7 @@ _mm_tzcnt_32(unsigned int __X)

 #define _tzcnt_u64(a)     (__tzcnt_u64((a)))

-/// \brief Performs a bitwise AND of the second operand with the one's
+/// Performs a bitwise AND of the second operand with the one's
 ///    complement of the first operand.
 ///
 /// \headerfile <x86intrin.h>
@ -246,7 +248,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
 }

 /* AMD-specified, double-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///    in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
@ -261,6 +263,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
 ///    the number of bits to be extracted.
 /// \returns An unsigned 64-bit integer whose least significant bits contain the
 ///    extracted bits.
+/// \see _bextr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __bextr_u64(unsigned long long __X, unsigned long long __Y)
 {
@ -268,7 +271,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
 }

 /* Intel-specified, single-leading-underscore version of BEXTR */
-/// \brief Extracts the specified bits from the first operand and returns them
+/// Extracts the specified bits from the first operand and returns them
 ///     in the least significant bits of the result.
 ///
 /// \headerfile <x86intrin.h>
@ -285,13 +288,14 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
 ///    Bits [7:0] specify the number of bits.
 /// \returns An unsigned 64-bit integer whose least significant bits contain the
 ///    extracted bits.
+/// \see __bextr_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
 {
  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
 }

-/// \brief Clears all bits in the source except for the least significant bit
+/// Clears all bits in the source except for the least significant bit
 ///    containing a value of 1 and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@ -308,7 +312,7 @@ __blsi_u64(unsigned long long __X)
  return __X & -__X;
 }

-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
 ///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
@ -325,7 +329,7 @@ __blsmsk_u64(unsigned long long __X)
  return __X ^ (__X - 1);
 }

-/// \brief Clears the least significant bit that is set to 1 in the source
+/// Clears the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@ -342,7 +346,7 @@ __blsr_u64(unsigned long long __X)
  return __X & (__X - 1);
 }

-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -358,7 +362,7 @@ __tzcnt_u64(unsigned long long __X)
  return __X ? __builtin_ctzll(__X) : 64;
 }

-/// \brief Counts the number of trailing zero bits in the operand.
+/// Counts the number of trailing zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
--- a/c_headers/cetintrin.h
+++ b/c_headers/cetintrin.h
@ -1,4 +1,4 @@
-/*===---- cetintrin.h - CET intrinsic ------------------------------------===
+/*===---- cetintrin.h - CET intrinsic --------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@ -42,6 +42,16 @@ static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
 }
 #endif /* __x86_64__ */

+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
+  __builtin_ia32_incsspq(__a);
+}
+#else /* __x86_64__ */
+static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
+  __builtin_ia32_incsspd((int)__a);
+}
+#endif /* __x86_64__ */
+
 static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
  return __builtin_ia32_rdsspd(__a);
 }
@ -52,6 +62,16 @@ static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long lo
 }
 #endif /* __x86_64__ */

+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
+  return __builtin_ia32_rdsspq(0);
+}
+#else /* __x86_64__ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
+  return __builtin_ia32_rdsspd(0);
+}
+#endif /* __x86_64__ */
+
 static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
  __builtin_ia32_saveprevssp();
 }
--- a/c_headers/cldemoteintrin.h
+++ b/c_headers/cldemoteintrin.h
@ -0,0 +1,42 @@
+/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __CLDEMOTEINTRIN_H
+#define __CLDEMOTEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("cldemote")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_cldemote(const void * __P) {
+  __builtin_ia32_cldemote(__P);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- a/c_headers/clflushoptintrin.h
+++ b/c_headers/clflushoptintrin.h
@ -1,4 +1,4 @@
-/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
--- a/c_headers/clwbintrin.h
+++ b/c_headers/clwbintrin.h
@ -31,7 +31,7 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("clwb")))

-/// \brief Writes back to memory the cache line (if modified) that contains the
+/// Writes back to memory the cache line (if modified) that contains the
 /// linear address specified in \a __p from any level of the cache hierarchy in
 /// the cache coherence domain
 ///
--- a/c_headers/clzerointrin.h
+++ b/c_headers/clzerointrin.h
@ -20,18 +20,18 @@
 *
 *===-----------------------------------------------------------------------===
 */
-#ifndef __X86INTRIN_H
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
 #error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
 #endif

-#ifndef _CLZEROINTRIN_H
-#define _CLZEROINTRIN_H
+#ifndef __CLZEROINTRIN_H
+#define __CLZEROINTRIN_H

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__,  __target__("clzero")))

-/// \brief Loads the cache line address and zero's out the cacheline
+/// Loads the cache line address and zero's out the cacheline
 ///
 /// \headerfile <clzerointrin.h>
 ///
@ -45,6 +45,6 @@ _mm_clzero (void * __line)
  __builtin_ia32_clzero ((void *)__line);
 }

-#undef __DEFAULT_FN_ATTRS 
+#undef __DEFAULT_FN_ATTRS

-#endif /* _CLZEROINTRIN_H */
+#endif /* __CLZEROINTRIN_H */
--- a/c_headers/cpuid.h
+++ b/c_headers/cpuid.h
@ -156,6 +156,7 @@
 #define bit_SMEP        0x00000080
 #define bit_BMI2        0x00000100
 #define bit_ENH_MOVSB   0x00000200
+#define bit_INVPCID     0x00000400
 #define bit_RTM         0x00000800
 #define bit_MPX         0x00004000
 #define bit_AVX512F     0x00010000
@ -166,7 +167,7 @@
 #define bit_CLFLUSHOPT  0x00800000
 #define bit_CLWB        0x01000000
 #define bit_AVX512PF    0x04000000
-#define bit_AVX51SER    0x08000000
+#define bit_AVX512ER    0x08000000
 #define bit_AVX512CD    0x10000000
 #define bit_SHA         0x20000000
 #define bit_AVX512BW    0x40000000
@ -177,6 +178,7 @@
 #define bit_AVX512VBMI       0x00000002
 #define bit_PKU              0x00000004
 #define bit_OSPKE            0x00000010
+#define bit_WAITPKG          0x00000020
 #define bit_AVX512VBMI2      0x00000040
 #define bit_SHSTK            0x00000080
 #define bit_GFNI             0x00000100
@ -186,10 +188,14 @@
 #define bit_AVX512BITALG     0x00001000
 #define bit_AVX512VPOPCNTDQ  0x00004000
 #define bit_RDPID            0x00400000
+#define bit_CLDEMOTE         0x02000000
+#define bit_MOVDIRI          0x08000000
+#define bit_MOVDIR64B        0x10000000

 /* Features in %edx for leaf 7 sub-leaf 0 */
 #define bit_AVX5124VNNIW  0x00000004
 #define bit_AVX5124FMAPS  0x00000008
+#define bit_PCONFIG       0x00040000
 #define bit_IBT           0x00100000

 /* Features in %eax for leaf 13 sub-leaf 1 */
@ -197,6 +203,9 @@
 #define bit_XSAVEC      0x00000002
 #define bit_XSAVES      0x00000008

+/* Features in %eax for leaf 0x14 sub-leaf 0 */
+#define bit_PTWRITE     0x00000010
+
 /* Features in %ecx for leaf 0x80000001 */
 #define bit_LAHF_LM     0x00000001
 #define bit_ABM         0x00000020
@ -215,8 +224,9 @@
 #define bit_3DNOWP      0x40000000
 #define bit_3DNOW       0x80000000

-/* Features in %ebx for leaf 0x80000001 */
+/* Features in %ebx for leaf 0x80000008 */
 #define bit_CLZERO      0x00000001
+#define bit_WBNOINVD    0x00000200


 #if __i386__
--- a/c_headers/cuda_wrappers/algorithm
+++ b/c_headers/cuda_wrappers/algorithm
@ -24,28 +24,36 @@
 #ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
 #define __CLANG_CUDA_WRAPPERS_ALGORITHM

-// This header defines __device__ overloads of std::min/max, but only if we're
-// <= C++11.  In C++14, these functions are constexpr, and so are implicitly
-// __host__ __device__.
+// This header defines __device__ overloads of std::min/max.
 //
-// We don't support the initializer_list overloads because
-// initializer_list::begin() and end() are not __host__ __device__ functions.
+// Ideally we'd declare these functions only if we're <= C++11.  In C++14,
+// these functions are constexpr, and so are implicitly __host__ __device__.
 //
-// When compiling in C++14 mode, we could force std::min/max to have different
-// implementations for host and device, by declaring the device overloads
-// before the constexpr overloads appear.  We choose not to do this because
-
-//  a) why write our own implementation when we can use one from the standard
-//     library? and
-//  b) libstdc++ is evil and declares min/max inside a header that is included
-//     *before* we include <algorithm>.  So we'd have to unconditionally
-//     declare our __device__ overloads of min/max, but that would pollute
-//     things for people who choose not to include <algorithm>.
+// However, the compiler being in C++14 mode does not imply that the standard
+// library supports C++14.  There is no macro we can test to check that the
+// stdlib has constexpr std::min/max.  Thus we have to unconditionally define
+// our device overloads.
+//
+// A host+device function cannot be overloaded, and a constexpr function
+// implicitly become host device if there's no explicitly host or device
+// overload preceding it.  So the simple thing to do would be to declare our
+// device min/max overloads, and then #include_next <algorithm>.  This way our
+// device overloads would come first, and so if we have a C++14 stdlib, its
+// min/max won't become host+device and conflict with our device overloads.
+//
+// But that also doesn't work.  libstdc++ is evil and declares std::min/max in
+// an internal header that is included *before* <algorithm>.  Thus by the time
+// we're inside of this file, std::min/max may already have been declared, and
+// thus we can't prevent them from becoming host+device if they're constexpr.
+//
+// Therefore we perpetrate the following hack: We mark our __device__ overloads
+// with __attribute__((enable_if(true, ""))).  This causes the signature of the
+// function to change without changing anything else about it.  (Except that
+// overload resolution will prefer it over the __host__ __device__ version
+// rather than considering them equally good).

 #include_next <algorithm>

-#if __cplusplus <= 201103L
-
 // We need to define these overloads in exactly the namespace our standard
 // library uses (including the right inline namespace), otherwise they won't be
 // picked up by other functions in the standard library (e.g. functions in
@ -59,30 +67,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 #endif

+#pragma push_macro("_CPP14_CONSTEXPR")
+#if __cplusplus >= 201402L
+#define _CPP14_CONSTEXPR constexpr
+#else
+#define _CPP14_CONSTEXPR
+#endif
+
 template <class __T, class __Cmp>
-inline __device__ const __T &
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
 max(const __T &__a, const __T &__b, __Cmp __cmp) {
  return __cmp(__a, __b) ? __b : __a;
 }

 template <class __T>
-inline __device__ const __T &
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
 max(const __T &__a, const __T &__b) {
  return __a < __b ? __b : __a;
 }

 template <class __T, class __Cmp>
-inline __device__ const __T &
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
 min(const __T &__a, const __T &__b, __Cmp __cmp) {
  return __cmp(__b, __a) ? __b : __a;
 }

 template <class __T>
-inline __device__ const __T &
+__attribute__((enable_if(true, "")))
+inline _CPP14_CONSTEXPR __host__ __device__ const __T &
 min(const __T &__a, const __T &__b) {
  return __a < __b ? __a : __b;
 }

+#pragma pop_macro("_CPP14_CONSTEXPR")
+
 #ifdef _LIBCPP_END_NAMESPACE_STD
 _LIBCPP_END_NAMESPACE_STD
 #else
@ -92,5 +113,4 @@ _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace std
 #endif

-#endif // __cplusplus <= 201103L
 #endif // __CLANG_CUDA_WRAPPERS_ALGORITHM
--- a/c_headers/emmintrin.h
+++ b/c_headers/emmintrin.h
--- a/c_headers/f16cintrin.h
+++ b/c_headers/f16cintrin.h
@ -21,18 +21,25 @@
 *===-----------------------------------------------------------------------===
 */

-#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
+#if !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
 #endif

 #ifndef __F16CINTRIN_H
 #define __F16CINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
+#define __DEFAULT_FN_ATTRS128 \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+  __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))

-/// \brief Converts a 16-bit half-precision float value into a 32-bit float
+/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
+ * but that's because icc can emulate these without f16c using a library call.
+ * Since we don't do that let's leave these in f16cintrin.h.
+ */
+
+/// Converts a 16-bit half-precision float value into a 32-bit float
 ///    value.
 ///
 /// \headerfile <x86intrin.h>
@ -42,7 +49,7 @@
 /// \param __a
 ///    A 16-bit half-precision float value.
 /// \returns The converted 32-bit float value.
-static __inline float __DEFAULT_FN_ATTRS
+static __inline float __DEFAULT_FN_ATTRS128
 _cvtsh_ss(unsigned short __a)
 {
  __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
@ -50,7 +57,7 @@ _cvtsh_ss(unsigned short __a)
  return r[0];
 }

-/// \brief Converts a 32-bit single-precision float value to a 16-bit
+/// Converts a 32-bit single-precision float value to a 16-bit
 ///    half-precision float value.
 ///
 /// \headerfile <x86intrin.h>
@ -72,11 +79,11 @@ _cvtsh_ss(unsigned short __a)
 ///    011: Truncate \n
 ///    1XX: Use MXCSR.RC for rounding
 /// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm) __extension__ ({ \
+#define _cvtss_sh(a, imm) \
  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-                                                     (imm)))[0]); })
+                                                     (imm)))[0])

-/// \brief Converts a 128-bit vector containing 32-bit float values into a
+/// Converts a 128-bit vector containing 32-bit float values into a
 ///    128-bit vector containing 16-bit half-precision float values.
 ///
 /// \headerfile <x86intrin.h>
@ -99,10 +106,10 @@ _cvtsh_ss(unsigned short __a)
 /// \returns A 128-bit vector containing converted 16-bit half-precision float
 ///    values. The lower 64 bits are used to store the converted 16-bit
 ///    half-precision floating-point values.
-#define _mm_cvtps_ph(a, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
+#define _mm_cvtps_ph(a, imm) \
+  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))

-/// \brief Converts a 128-bit vector containing 16-bit half-precision float
+/// Converts a 128-bit vector containing 16-bit half-precision float
 ///    values into a 128-bit vector containing 32-bit float values.
 ///
 /// \headerfile <x86intrin.h>
@ -113,12 +120,57 @@ _cvtsh_ss(unsigned short __a)
 ///    A 128-bit vector containing 16-bit half-precision float values. The lower
 ///    64 bits are used in the conversion.
 /// \returns A 128-bit vector of [4 x float] containing converted float values.
-static __inline __m128 __DEFAULT_FN_ATTRS
+static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtph_ps(__m128i __a)
 {
  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
 }

-#undef __DEFAULT_FN_ATTRS
+/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
+///    containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+///    A 256-bit vector containing 32-bit single-precision float values to be
+///    converted to 16-bit half-precision float values.
+/// \param imm
+///    An immediate value controlling rounding using bits [2:0]: \n
+///    000: Nearest \n
+///    001: Down \n
+///    010: Up \n
+///    011: Truncate \n
+///    1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+///    float values.
+#define _mm256_cvtps_ph(a, imm) \
+ (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
+
+/// Converts a 128-bit vector containing 16-bit half-precision float
+///    values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+///    A 128-bit vector containing 16-bit half-precision float values to be
+///    converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+///    single-precision float values.
+static __inline __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtph_ps(__m128i __a)
+{
+  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif /* __F16CINTRIN_H */
--- a/c_headers/fma4intrin.h
+++ b/c_headers/fma4intrin.h
@ -31,200 +31,202 @@
 #include <pmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }

-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif /* __FMA4INTRIN_H */
--- a/c_headers/fmaintrin.h
+++ b/c_headers/fmaintrin.h
@ -1,4 +1,4 @@
-/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@ -29,200 +29,202 @@
 #define __FMAINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }

-static __inline__ __m128 __DEFAULT_FN_ATTRS
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }

-static __inline__ __m128d __DEFAULT_FN_ATTRS
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }

-#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256

 #endif /* __FMAINTRIN_H */
--- a/c_headers/fxsrintrin.h
+++ b/c_headers/fxsrintrin.h
@ -30,7 +30,7 @@

 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("fxsr")))

-/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
 ///    memory region pointed to by the input parameter \a __p.
 ///
 /// \headerfile <x86intrin.h>
@ -43,10 +43,10 @@
 static __inline__ void __DEFAULT_FN_ATTRS
 _fxsave(void *__p)
 {
-  return __builtin_ia32_fxsave(__p);
+  __builtin_ia32_fxsave(__p);
 }

-/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
 ///    memory region pointed to by the input parameter \a __p. The contents of
 ///    this memory region should have been written to by a previous \c _fxsave
 ///    or \c _fxsave64 intrinsic.
@ -61,11 +61,11 @@ _fxsave(void *__p)
 static __inline__ void __DEFAULT_FN_ATTRS
 _fxrstor(void *__p)
 {
-  return __builtin_ia32_fxrstor(__p);
+  __builtin_ia32_fxrstor(__p);
 }

 #ifdef __x86_64__
-/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
 ///    memory region pointed to by the input parameter \a __p.
 ///
 /// \headerfile <x86intrin.h>
@ -78,10 +78,10 @@ _fxrstor(void *__p)
 static __inline__ void __DEFAULT_FN_ATTRS
 _fxsave64(void *__p)
 {
-  return __builtin_ia32_fxsave64(__p);
+  __builtin_ia32_fxsave64(__p);
 }

-/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
 ///    memory region pointed to by the input parameter \a __p. The contents of
 ///    this memory region should have been written to by a previous \c _fxsave
 ///    or \c _fxsave64 intrinsic.
@ -96,7 +96,7 @@ _fxsave64(void *__p)
 static __inline__ void __DEFAULT_FN_ATTRS
 _fxrstor64(void *__p)
 {
-  return __builtin_ia32_fxrstor64(__p);
+  __builtin_ia32_fxrstor64(__p);
 }
 #endif

--- a/c_headers/gfniintrin.h
+++ b/c_headers/gfniintrin.h
@ -29,104 +29,108 @@
 #define __GFNIINTRIN_H


-#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                   \
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
  (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A),          \
                                                  (__v16qi)(__m128i)(B),          \
-                                                  (char)(I)); })
+                                                  (char)(I))

-#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
        (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I),                          \
-        (__v16qi)(__m128i)(S)); })
+        (__v16qi)(__m128i)(S))


-#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({          \
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(),       \
-        U, A, B, I); })
+        U, A, B, I)


-#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                \
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
  (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A),          \
                                                  (__v32qi)(__m256i)(B),          \
-                                                  (char)(I)); })
+                                                  (char)(I))

-#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({     \
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
        (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I),                       \
-        (__v32qi)(__m256i)(S)); })
+        (__v32qi)(__m256i)(S))

-#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({       \
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
-        U, A, B, I); })
+        U, A, B, I)


-#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({                \
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
  (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A),          \
                                                  (__v64qi)(__m512i)(B),          \
-                                                  (char)(I)); })
+                                                  (char)(I))

-#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({     \
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
        (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I),                       \
-        (__v64qi)(__m512i)(S)); })
+        (__v64qi)(__m512i)(S))

-#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({       \
-  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(),    \
-        U, A, B, I); })
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(),    \
+        U, A, B, I)

-#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                      \
+#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A),             \
                                                  (__v16qi)(__m128i)(B),          \
-                                                  (char)(I)); })
+                                                  (char)(I))

-#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({           \
+#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U),                             \
        (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I),                             \
-        (__v16qi)(__m128i)(S)); })
+        (__v16qi)(__m128i)(S))


-#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({             \
+#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(),          \
-        U, A, B, I); })
+        U, A, B, I)


-#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                   \
+#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A),             \
                                                  (__v32qi)(__m256i)(B),          \
-                                                  (char)(I)); })
+                                                  (char)(I))

-#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
   (__m256i)__builtin_ia32_selectb_256((__mmask32)(U),                            \
        (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I),                          \
-        (__v32qi)(__m256i)(S)); })
+        (__v32qi)(__m256i)(S))

-#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({          \
+#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(),    \
-        U, A, B, I); })
+        U, A, B, I)


-#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({                   \
+#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
  (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A),             \
                                                  (__v64qi)(__m512i)(B),          \
-                                                  (char)(I)); })
+                                                  (char)(I))

-#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({        \
+#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
   (__m512i)__builtin_ia32_selectb_512((__mmask64)(U),                            \
        (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I),                          \
-        (__v64qi)(__m512i)(S)); })
+        (__v64qi)(__m512i)(S))

-#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({          \
-  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(),       \
-        U, A, B, I); })
+#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(),       \
+        U, A, B, I)

 /* Default attributes for simple form (no masking). */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
+
+/* Default attributes for YMM unmasked form. */
+#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))

 /* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
+#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))

 /* Default attributes for VLX forms. */
-#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
+#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
@ -135,7 +139,7 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
              (__v16qi) __B);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
 _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
 {
  return (__m128i) __builtin_ia32_selectb_128(__U,
@ -143,21 +147,21 @@ _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
              (__v16qi) __S);
 }

-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
 _mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
 {
  return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
              __U, __A, __B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
 _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
              (__v32qi) __B);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
 _mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
 {
  return (__m256i) __builtin_ia32_selectb_256(__U,
@ -165,21 +169,21 @@ _mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
              (__v32qi) __S);
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
 _mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
 {
  return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
              __U, __A, __B);
 }

-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
 _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
 {
  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
              (__v64qi) __B);
 }

-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
 _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
 {
  return (__m512i) __builtin_ia32_selectb_512(__U,
@ -187,16 +191,18 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
              (__v64qi) __S);
 }

-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
 _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
 {
-  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
+  return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
              __U, __A, __B);
 }

 #undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_F
-#undef __DEFAULT_FN_ATTRS_VL
+#undef __DEFAULT_FN_ATTRS_Y
+#undef __DEFAULT_FN_ATTRS_Z
+#undef __DEFAULT_FN_ATTRS_VL128
+#undef __DEFAULT_FN_ATTRS_VL256

-#endif // __GFNIINTRIN_H
+#endif /* __GFNIINTRIN_H */

--- a/c_headers/htmxlintrin.h
+++ b/c_headers/htmxlintrin.h
@ -214,7 +214,7 @@ __TM_failure_code(void* const __TM_buff)

 /* These intrinsics are being made available for compatibility with
   the IBM XL compiler.  For documentation please see the "z/OS XL
-   C/C++ Programming Guide" publically available on the web.  */
+   C/C++ Programming Guide" publicly available on the web.  */

 static __inline long __attribute__((__always_inline__, __nodebug__))
 __TM_simple_begin ()
--- a/c_headers/ia32intrin.h
+++ b/c_headers/ia32intrin.h
@ -70,4 +70,9 @@ __rdtscp(unsigned int *__A) {

 #define _rdpmc(A) __rdpmc(A)

+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_wbinvd(void) {
+  __builtin_ia32_wbinvd();
+}
+
 #endif /* __IA32INTRIN_H */
--- a/c_headers/immintrin.h
+++ b/c_headers/immintrin.h
@ -68,55 +68,11 @@

 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
 #include <avx2intrin.h>
+#endif

-/* The 256-bit versions of functions in f16cintrin.h.
-   Intel documents these as being in immintrin.h, and
-   they depend on typedefs from avxintrin.h. */
-
-/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
-///    containing 16-bit half-precision float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-///    A 256-bit vector containing 32-bit single-precision float values to be
-///    converted to 16-bit half-precision float values.
-/// \param imm
-///    An immediate value controlling rounding using bits [2:0]: \n
-///    000: Nearest \n
-///    001: Down \n
-///    010: Up \n
-///    011: Truncate \n
-///    1XX: Use MXCSR.RC for rounding
-/// \returns A 128-bit vector containing the converted 16-bit half-precision
-///    float values.
-#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
- (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
-
-/// \brief Converts a 128-bit vector containing 16-bit half-precision float
-///    values into a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-///    A 128-bit vector containing 16-bit half-precision float values to be
-///    converted to 32-bit single-precision float values.
-/// \returns A vector of [8 x float] containing the converted 32-bit
-///    single-precision float values.
-static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
-_mm256_cvtph_ps(__m128i __a)
-{
-  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
-}
-#endif /* __AVX2__ */
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
+#include <f16cintrin.h>
+#endif

 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
 #include <vpclmulqdqintrin.h>
@ -134,6 +90,10 @@ _mm256_cvtph_ps(__m128i __a)
 #include <lzcntintrin.h>
 #endif

+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
+#include <popcntintrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
 #include <fmaintrin.h>
 #endif
@ -247,6 +207,18 @@ _mm256_cvtph_ps(__m128i __a)
 #include <gfniintrin.h>
 #endif

+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__)
+/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDPID </c> instruction.
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
+_rdpid_u32(void) {
+  return __builtin_ia32_rdpid();
+}
+#endif // __RDPID__
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
@ -310,25 +282,25 @@ _readgsbase_u64(void)
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u32(unsigned int __V)
 {
-  return __builtin_ia32_wrfsbase32(__V);
+  __builtin_ia32_wrfsbase32(__V);
 }

 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u64(unsigned long long __V)
 {
-  return __builtin_ia32_wrfsbase64(__V);
+  __builtin_ia32_wrfsbase64(__V);
 }

 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u32(unsigned int __V)
 {
-  return __builtin_ia32_wrgsbase32(__V);
+  __builtin_ia32_wrgsbase32(__V);
 }

 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u64(unsigned long long __V)
 {
-  return __builtin_ia32_wrgsbase64(__V);
+  __builtin_ia32_wrgsbase64(__V);
 }

 #endif
@ -371,4 +343,125 @@ _writegsbase_u64(unsigned long long __V)
 * whereas others are also available at all times. */
 #include <adxintrin.h>

+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
+#include <rdseedintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WBNOINVD__)
+#include <wbnoinvdintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLDEMOTE__)
+#include <cldemoteintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WAITPKG__)
+#include <waitpkgintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || \
+  defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#include <movdirintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PCONFIG__)
+#include <pconfigintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SGX__)
+#include <sgxintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PTWRITE__)
+#include <ptwriteintrin.h>
+#endif
+
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__INVPCID__)
+#include <invpcidintrin.h>
+#endif
+
+#ifdef _MSC_VER
+/* Define the default attributes for these intrinsics */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange HLE
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
+                       : "+r" (_Value), "+m" (*_Target) :: "memory");
+  return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange HLE
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
+                              long _Exchange, long _Comparand) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
+                              __int64 _Exchange, __int64 _Comparand) {
+  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
+                       : "+a" (_Comparand), "+m" (*_Destination)
+                       : "r" (_Exchange) : "memory");
+  return _Comparand;
+}
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* _MSC_VER */
+
 #endif /* __IMMINTRIN_H */
--- a/c_headers/intrin.h
+++ b/c_headers/intrin.h
@ -38,7 +38,7 @@
 #include <armintr.h>
 #endif

-#if defined(_M_ARM64)
+#if defined(__aarch64__)
 #include <arm64intr.h>
 #endif

@ -83,6 +83,7 @@ void __incfsdword(unsigned long);
 void __incfsword(unsigned long);
 unsigned long __indword(unsigned short);
 void __indwordstring(unsigned short, unsigned long *, unsigned long);
+void __int2c(void);
 void __invlpg(void *);
 unsigned short __inword(unsigned short);
 void __inwordstring(unsigned short, unsigned short *, unsigned long);
@ -140,6 +141,7 @@ void __svm_stgi(void);
 void __svm_vmload(size_t);
 void __svm_vmrun(size_t);
 void __svm_vmsave(size_t);
+void __ud2(void);
 unsigned __int64 __ull_rshift(unsigned __int64, int);
 void __vmx_off(void);
 void __vmx_vmptrst(unsigned __int64 *);
@ -161,25 +163,15 @@ static __inline__
 unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
 static __inline__
 unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
-static __inline__
 unsigned char _bittest(long const *, long);
-static __inline__
 unsigned char _bittestandcomplement(long *, long);
-static __inline__
 unsigned char _bittestandreset(long *, long);
-static __inline__
 unsigned char _bittestandset(long *, long);
 void __cdecl _disable(void);
 void __cdecl _enable(void);
 long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
 unsigned char _interlockedbittestandreset(long volatile *, long);
 unsigned char _interlockedbittestandset(long volatile *, long);
-long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
-long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
-__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
-                                                 __int64);
 void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
                                                    void *);
 void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
@ -256,24 +248,15 @@ void __writegsbyte(unsigned long, unsigned char);
 void __writegsdword(unsigned long, unsigned long);
 void __writegsqword(unsigned long, unsigned __int64);
 void __writegsword(unsigned long, unsigned short);
-static __inline__
-unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
-unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
-static __inline__
 unsigned char _bittest64(__int64 const *, __int64);
-static __inline__
 unsigned char _bittestandcomplement64(__int64 *, __int64);
-static __inline__
 unsigned char _bittestandreset64(__int64 *, __int64);
-static __inline__
 unsigned char _bittestandset64(__int64 *, __int64);
 long _InterlockedAnd_np(long volatile *_Value, long _Mask);
 short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
 __int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
 char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
 unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
-static __inline__
 unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
 long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
                                    long _Comparand);
@ -287,10 +270,6 @@ unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
                                                __int64 *_ComparandResult);
 short _InterlockedCompareExchange16_np(short volatile *_Destination,
                                       short _Exchange, short _Comparand);
-__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
-                                                 __int64);
-__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
-                                                 __int64);
 __int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
                                         __int64 _Exchange, __int64 _Comparand);
 void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
@ -320,7 +299,12 @@ unsigned __int64 _umul128(unsigned __int64,

 #endif /* __x86_64__ */

-#if defined(__x86_64__) || defined(__arm__)
+#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
+
+static __inline__
+unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
+static __inline__
+unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);

 static __inline__
 __int64 _InterlockedDecrement64(__int64 volatile *_Addend);
@ -341,78 +325,6 @@ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);

 #endif

-/*----------------------------------------------------------------------------*\
-|* Bit Counting and Testing
-\*----------------------------------------------------------------------------*/
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittest(long const *_BitBase, long _BitPos) {
-  return (*_BitBase >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandcomplement(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase ^ (1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandreset(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase & ~(1 << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandset(long *_BitBase, long _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase | (1 << _BitPos);
-  return _Res;
-}
-#if defined(__arm__) || defined(__aarch64__)
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
-  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
-  return (_PrevVal >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
-  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
-  return (_PrevVal >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
-  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
-  return (_PrevVal >> _BitPos) & 1;
-}
-#endif
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
-  return (*_BitBase >> _BitPos) & 1;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase ^ (1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase & ~(1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
-  unsigned char _Res = (*_BitBase >> _BitPos) & 1;
-  *_BitBase = *_BitBase | (1ll << _BitPos);
-  return _Res;
-}
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
-  long long _PrevVal =
-      __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
-  return (_PrevVal >> _BitPos) & 1;
-}
-#endif
 /*----------------------------------------------------------------------------*\
 |* Interlocked Exchange Add
 \*----------------------------------------------------------------------------*/
@ -602,6 +514,23 @@ _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
 }
 #endif
 /*----------------------------------------------------------------------------*\
+|* Bit Counting and Testing
+\*----------------------------------------------------------------------------*/
+#if defined(__arm__) || defined(__aarch64__)
+unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
+                                           long _BitPos);
+unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
+                                            long _BitPos);
+unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
+                                              long _BitPos);
+unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
+                                             long _BitPos);
+unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
+                                              long _BitPos);
+#endif
+/*----------------------------------------------------------------------------*\
 |* Interlocked Or
 \*----------------------------------------------------------------------------*/
 #if defined(__arm__) || defined(__aarch64__)
@ -868,33 +797,40 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
 #if defined(__i386__) || defined(__x86_64__)
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
-  __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
+  __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
-  __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
+  __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
-  __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
+  __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
-  __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
+  __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x)
+                       : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
-  __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
+  __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x)
+                       : "memory");
 }
 #endif
 #ifdef __x86_64__
 static __inline__ void __DEFAULT_FN_ATTRS
 __movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
-  __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
+  __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n)
+                       : : "memory");
 }
 static __inline__ void __DEFAULT_FN_ATTRS
 __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
-  __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
+  __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x)
+                       : "memory");
 }
 #endif

@ -927,6 +863,20 @@ __nop(void) {
  __asm__ volatile ("nop");
 }
 #endif
+#if defined(__x86_64__)
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__shiftleft128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) {
+  unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l;
+  unsigned __int128 __res = __val << (__d & 63);
+  return (unsigned __int64)(__res >> 64);
+}
+static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
+__shiftright128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) {
+  unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l;
+  unsigned __int128 __res = __val >> (__d & 63);
+  return (unsigned __int64)__res;
+}
+#endif

 /*----------------------------------------------------------------------------*\
 |* Privileged intrinsics
--- a/c_headers/invpcidintrin.h
+++ b/c_headers/invpcidintrin.h
@ -0,0 +1,37 @@
+/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __INVPCIDINTRIN_H
+#define __INVPCIDINTRIN_H
+
+static __inline__ void
+  __attribute__((__always_inline__, __nodebug__,  __target__("invpcid")))
+_invpcid(unsigned int __type, void *__descriptor) {
+  __builtin_ia32_invpcid(__type, __descriptor);
+}
+
+#endif /* __INVPCIDINTRIN_H */
--- a/c_headers/lwpintrin.h
+++ b/c_headers/lwpintrin.h
@ -31,7 +31,7 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))

-/// \brief Parses the LWPCB at the specified address and enables
+/// Parses the LWPCB at the specified address and enables
 ///        profiling if valid.
 ///
 /// \headerfile <x86intrin.h>
@ -48,7 +48,7 @@ __llwpcb (void *__addr)
  __builtin_ia32_llwpcb(__addr);
 }

-/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
+/// Flushes the LWP state to memory and returns the address of the LWPCB.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -58,12 +58,12 @@ __llwpcb (void *__addr)
 ///    Address to the current Lightweight Profiling Control Block (LWPCB).
 ///    If LWP is not currently enabled, returns NULL.
 static __inline__ void* __DEFAULT_FN_ATTRS
-__slwpcb ()
+__slwpcb (void)
 {
  return __builtin_ia32_slwpcb();
 }

-/// \brief Inserts programmed event record into the LWP event ring buffer
+/// Inserts programmed event record into the LWP event ring buffer
 ///        and advances the ring buffer pointer.
 ///
 /// \headerfile <x86intrin.h>
@ -84,7 +84,7 @@ __slwpcb ()
  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
                           (unsigned int) (FLAGS)))

-/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+/// Decrements the LWP programmed value sample event counter. If the result is
 ///        negative, inserts an event record into the LWP event ring buffer in memory
 ///        and advances the ring buffer pointer.
 ///
@ -104,7 +104,7 @@ __slwpcb ()

 #ifdef __x86_64__

-/// \brief Inserts programmed event record into the LWP event ring buffer
+/// Inserts programmed event record into the LWP event ring buffer
 ///        and advances the ring buffer pointer.
 ///
 /// \headerfile <x86intrin.h>
@ -125,7 +125,7 @@ __slwpcb ()
  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
                           (unsigned int) (FLAGS)))

-/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+/// Decrements the LWP programmed value sample event counter. If the result is
 ///        negative, inserts an event record into the LWP event ring buffer in memory
 ///        and advances the ring buffer pointer.
 ///
--- a/c_headers/lzcntintrin.h
+++ b/c_headers/lzcntintrin.h
@ -31,7 +31,7 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))

-/// \brief Counts the number of leading zero bits in the operand.
+/// Counts the number of leading zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -47,7 +47,7 @@ __lzcnt16(unsigned short __X)
  return __X ? __builtin_clzs(__X) : 16;
 }

-/// \brief Counts the number of leading zero bits in the operand.
+/// Counts the number of leading zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -57,13 +57,14 @@ __lzcnt16(unsigned short __X)
 ///    An unsigned 32-bit integer whose leading zeros are to be counted.
 /// \returns An unsigned 32-bit integer containing the number of leading zero
 ///    bits in the operand.
+/// \see _lzcnt_u32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 __lzcnt32(unsigned int __X)
 {
  return __X ? __builtin_clz(__X) : 32;
 }

-/// \brief Counts the number of leading zero bits in the operand.
+/// Counts the number of leading zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -73,6 +74,7 @@ __lzcnt32(unsigned int __X)
 ///    An unsigned 32-bit integer whose leading zeros are to be counted.
 /// \returns An unsigned 32-bit integer containing the number of leading zero
 ///    bits in the operand.
+/// \see __lzcnt32
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _lzcnt_u32(unsigned int __X)
 {
@ -80,7 +82,7 @@ _lzcnt_u32(unsigned int __X)
 }

 #ifdef __x86_64__
-/// \brief Counts the number of leading zero bits in the operand.
+/// Counts the number of leading zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -90,13 +92,14 @@ _lzcnt_u32(unsigned int __X)
 ///    An unsigned 64-bit integer whose leading zeros are to be counted.
 /// \returns An unsigned 64-bit integer containing the number of leading zero
 ///    bits in the operand.
+/// \see _lzcnt_u64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __lzcnt64(unsigned long long __X)
 {
  return __X ? __builtin_clzll(__X) : 64;
 }

-/// \brief Counts the number of leading zero bits in the operand.
+/// Counts the number of leading zero bits in the operand.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -106,6 +109,7 @@ __lzcnt64(unsigned long long __X)
 ///    An unsigned 64-bit integer whose leading zeros are to be counted.
 /// \returns An unsigned 64-bit integer containing the number of leading zero
 ///    bits in the operand.
+/// \see __lzcnt64
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _lzcnt_u64(unsigned long long __X)
 {
--- a/c_headers/mm3dnow.h
+++ b/c_headers/mm3dnow.h
@ -30,9 +30,9 @@
 typedef float __v2sf __attribute__((__vector_size__(8)));

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))

-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
 _m_femms(void) {
  __builtin_ia32_femms();
 }
@ -134,7 +134,7 @@ _m_pmulhrw(__m64 __m1, __m64 __m2) {

 /* Handle the 3dnowa instructions here. */
 #undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))

 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _m_pf2iw(__m64 __m) {
--- a/c_headers/mmintrin.h
+++ b/c_headers/mmintrin.h
@ -32,27 +32,27 @@ typedef short __v4hi __attribute__((__vector_size__(8)));
 typedef char __v8qi __attribute__((__vector_size__(8)));

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))

-/// \brief Clears the MMX state by setting the state of the x87 stack registers
+/// Clears the MMX state by setting the state of the x87 stack registers
 ///    to empty.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> EMMS </c> instruction.
 ///
-static __inline__ void __DEFAULT_FN_ATTRS
+static __inline__ void  __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
 _mm_empty(void)
 {
    __builtin_ia32_emms();
 }

-/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
+/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
 ///    value of the 32-bit integer parameter and setting the upper 32 bits to 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+/// This intrinsic corresponds to the <c> MOVD </c> instruction.
 ///
 /// \param __i
 ///    A 32-bit integer value.
@ -64,12 +64,12 @@ _mm_cvtsi32_si64(int __i)
    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
 }

-/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
+/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
 ///    signed integer.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+/// This intrinsic corresponds to the <c> MOVD </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector.
@ -81,11 +81,11 @@ _mm_cvtsi64_si32(__m64 __m)
    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
 }

-/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
+/// Casts a 64-bit signed integer value into a 64-bit integer vector.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
+/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
 ///
 /// \param __i
 ///    A 64-bit signed integer.
@ -97,11 +97,11 @@ _mm_cvtsi64_m64(long long __i)
    return (__m64)__i;
 }

-/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
+/// Casts a 64-bit integer vector into a 64-bit signed integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
+/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
 ///
 /// \param __m
 ///    A 64-bit integer vector.
@ -113,7 +113,7 @@ _mm_cvtm64_si64(__m64 __m)
    return (long long)__m;
 }

-/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+/// Converts 16-bit signed integers from both 64-bit integer vector
 ///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
 ///    a 64-bit integer vector of [8 x i8] as the result. Positive values
 ///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
@ -143,7 +143,7 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Converts 32-bit signed integers from both 64-bit integer vector
+/// Converts 32-bit signed integers from both 64-bit integer vector
 ///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
 ///    a 64-bit integer vector of [4 x i16] as the result. Positive values
 ///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
@ -173,7 +173,7 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Converts 16-bit signed integers from both 64-bit integer vector
+/// Converts 16-bit signed integers from both 64-bit integer vector
 ///    parameters of [4 x i16] into 8-bit unsigned integer values, and
 ///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
 ///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
@ -203,7 +203,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
@ -230,7 +230,7 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of
 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
@ -253,7 +253,7 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of
 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
@ -274,7 +274,7 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
 ///    and interleaves them into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
@ -301,7 +301,7 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of
 ///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
@ -324,7 +324,7 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of
 ///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
@ -345,7 +345,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Adds each 8-bit integer element of the first 64-bit integer vector
+/// Adds each 8-bit integer element of the first 64-bit integer vector
 ///    of [8 x i8] to the corresponding 8-bit integer element of the second
 ///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
 ///    packed into a 64-bit integer vector of [8 x i8].
@ -366,7 +366,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Adds each 16-bit integer element of the first 64-bit integer vector
+/// Adds each 16-bit integer element of the first 64-bit integer vector
 ///    of [4 x i16] to the corresponding 16-bit integer element of the second
 ///    64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
 ///    packed into a 64-bit integer vector of [4 x i16].
@ -387,7 +387,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Adds each 32-bit integer element of the first 64-bit integer vector
+/// Adds each 32-bit integer element of the first 64-bit integer vector
 ///    of [2 x i32] to the corresponding 32-bit integer element of the second
 ///    64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
 ///    packed into a 64-bit integer vector of [2 x i32].
@ -408,7 +408,7 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Adds each 8-bit signed integer element of the first 64-bit integer
+/// Adds each 8-bit signed integer element of the first 64-bit integer
 ///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
 ///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
 ///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
@ -430,7 +430,7 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Adds each 16-bit signed integer element of the first 64-bit integer
+/// Adds each 16-bit signed integer element of the first 64-bit integer
 ///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
 ///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
 ///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
@ -453,7 +453,7 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
+/// Adds each 8-bit unsigned integer element of the first 64-bit integer
 ///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
 ///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
 ///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
@ -475,7 +475,7 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
+/// Adds each 16-bit unsigned integer element of the first 64-bit integer
 ///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
 ///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
 ///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
@ -497,7 +497,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Subtracts each 8-bit integer element of the second 64-bit integer
+/// Subtracts each 8-bit integer element of the second 64-bit integer
 ///    vector of [8 x i8] from the corresponding 8-bit integer element of the
 ///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
 ///    are packed into a 64-bit integer vector of [8 x i8].
@ -518,7 +518,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Subtracts each 16-bit integer element of the second 64-bit integer
+/// Subtracts each 16-bit integer element of the second 64-bit integer
 ///    vector of [4 x i16] from the corresponding 16-bit integer element of the
 ///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
 ///    results are packed into a 64-bit integer vector of [4 x i16].
@ -539,7 +539,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Subtracts each 32-bit integer element of the second 64-bit integer
+/// Subtracts each 32-bit integer element of the second 64-bit integer
 ///    vector of [2 x i32] from the corresponding 32-bit integer element of the
 ///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
 ///    results are packed into a 64-bit integer vector of [2 x i32].
@ -560,7 +560,7 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Subtracts each 8-bit signed integer element of the second 64-bit
+/// Subtracts each 8-bit signed integer element of the second 64-bit
 ///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
 ///    element of the first 64-bit integer vector of [8 x i8]. Positive results
 ///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
@ -583,7 +583,7 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Subtracts each 16-bit signed integer element of the second 64-bit
+/// Subtracts each 16-bit signed integer element of the second 64-bit
 ///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
 ///    element of the first 64-bit integer vector of [4 x i16]. Positive results
 ///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
@ -606,7 +606,7 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
+/// Subtracts each 8-bit unsigned integer element of the second 64-bit
 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
 ///    element of the first 64-bit integer vector of [8 x i8].
 ///
@ -630,7 +630,7 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
+/// Subtracts each 16-bit unsigned integer element of the second 64-bit
 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
 ///    integer element of the first 64-bit integer vector of [4 x i16].
 ///
@ -654,7 +654,7 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+/// Multiplies each 16-bit signed integer element of the first 64-bit
 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
 ///    element of the second 64-bit integer vector of [4 x i16] and get four
 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
@ -681,7 +681,7 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+/// Multiplies each 16-bit signed integer element of the first 64-bit
 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the upper
 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
@ -702,7 +702,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
+/// Multiplies each 16-bit signed integer element of the first 64-bit
 ///    integer vector of [4 x i16] by the corresponding 16-bit signed integer
 ///    element of the second 64-bit integer vector of [4 x i16]. Packs the lower
 ///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
@ -723,7 +723,7 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Left-shifts each 16-bit signed integer element of the first
+/// Left-shifts each 16-bit signed integer element of the first
 ///    parameter, which is a 64-bit integer vector of [4 x i16], by the number
 ///    of bits specified by the second parameter, which is a 64-bit integer. The
 ///    lower 16 bits of the results are packed into a 64-bit integer vector of
@ -746,7 +746,7 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
 }

-/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
+/// Left-shifts each 16-bit signed integer element of a 64-bit integer
 ///    vector of [4 x i16] by the number of bits specified by a 32-bit integer.
 ///    The lower 16 bits of the results are packed into a 64-bit integer vector
 ///    of [4 x i16].
@ -768,7 +768,7 @@ _mm_slli_pi16(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
 }

-/// \brief Left-shifts each 32-bit signed integer element of the first
+/// Left-shifts each 32-bit signed integer element of the first
 ///    parameter, which is a 64-bit integer vector of [2 x i32], by the number
 ///    of bits specified by the second parameter, which is a 64-bit integer. The
 ///    lower 32 bits of the results are packed into a 64-bit integer vector of
@ -791,7 +791,7 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
 }

-/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
+/// Left-shifts each 32-bit signed integer element of a 64-bit integer
 ///    vector of [2 x i32] by the number of bits specified by a 32-bit integer.
 ///    The lower 32 bits of the results are packed into a 64-bit integer vector
 ///    of [2 x i32].
@ -813,7 +813,7 @@ _mm_slli_pi32(__m64 __m, int __count)
    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
 }

-/// \brief Left-shifts the first 64-bit integer parameter by the number of bits
+/// Left-shifts the first 64-bit integer parameter by the number of bits
 ///    specified by the second 64-bit integer parameter. The lower 64 bits of
 ///    result are returned.
 ///
@ -833,7 +833,7 @@ _mm_sll_si64(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
 }

-/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
+/// Left-shifts the first parameter, which is a 64-bit integer, by the
 ///    number of bits specified by the second parameter, which is a 32-bit
 ///    integer. The lower 64 bits of result are returned.
 ///
@ -853,7 +853,7 @@ _mm_slli_si64(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
 }

-/// \brief Right-shifts each 16-bit integer element of the first parameter,
+/// Right-shifts each 16-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
 ///    specified by the second parameter, which is a 64-bit integer.
 ///
@ -877,7 +877,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
 }

-/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+/// Right-shifts each 16-bit integer element of a 64-bit integer vector
 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
 ///
 ///    High-order bits are filled with the sign bit of the initial value of each
@ -900,7 +900,7 @@ _mm_srai_pi16(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
 }

-/// \brief Right-shifts each 32-bit integer element of the first parameter,
+/// Right-shifts each 32-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
 ///    specified by the second parameter, which is a 64-bit integer.
 ///
@ -924,7 +924,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
 }

-/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+/// Right-shifts each 32-bit integer element of a 64-bit integer vector
 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
 ///
 ///    High-order bits are filled with the sign bit of the initial value of each
@ -947,7 +947,7 @@ _mm_srai_pi32(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
 }

-/// \brief Right-shifts each 16-bit integer element of the first parameter,
+/// Right-shifts each 16-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
 ///    specified by the second parameter, which is a 64-bit integer.
 ///
@ -970,7 +970,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
 }

-/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
+/// Right-shifts each 16-bit integer element of a 64-bit integer vector
 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
 ///
 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
@ -992,7 +992,7 @@ _mm_srli_pi16(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
 }

-/// \brief Right-shifts each 32-bit integer element of the first parameter,
+/// Right-shifts each 32-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
 ///    specified by the second parameter, which is a 64-bit integer.
 ///
@ -1015,7 +1015,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
 }

-/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
+/// Right-shifts each 32-bit integer element of a 64-bit integer vector
 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
 ///
 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
@ -1037,7 +1037,7 @@ _mm_srli_pi32(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
 }

-/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
+/// Right-shifts the first 64-bit integer parameter by the number of bits
 ///    specified by the second 64-bit integer parameter.
 ///
 ///    High-order bits are cleared.
@ -1057,7 +1057,7 @@ _mm_srl_si64(__m64 __m, __m64 __count)
    return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
 }

-/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
+/// Right-shifts the first parameter, which is a 64-bit integer, by the
 ///    number of bits specified by the second parameter, which is a 32-bit
 ///    integer.
 ///
@ -1078,7 +1078,7 @@ _mm_srli_si64(__m64 __m, int __count)
    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
 }

-/// \brief Performs a bitwise AND of two 64-bit integer vectors.
+/// Performs a bitwise AND of two 64-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1096,7 +1096,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
    return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
 }

-/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
+/// Performs a bitwise NOT of the first 64-bit integer vector, and then
 ///    performs a bitwise AND of the intermediate result and the second 64-bit
 ///    integer vector.
 ///
@ -1117,7 +1117,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
 }

-/// \brief Performs a bitwise OR of two 64-bit integer vectors.
+/// Performs a bitwise OR of two 64-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1135,7 +1135,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
 }

-/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
+/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -1153,7 +1153,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
    return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
 }

-/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
 ///    [8 x i8] to determine if the element of the first vector is equal to the
 ///    corresponding element of the second vector.
 ///
@ -1175,7 +1175,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
 ///    [4 x i16] to determine if the element of the first vector is equal to the
 ///    corresponding element of the second vector.
 ///
@ -1197,7 +1197,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+/// Compares the 32-bit integer elements of two 64-bit integer vectors of
 ///    [2 x i32] to determine if the element of the first vector is equal to the
 ///    corresponding element of the second vector.
 ///
@ -1219,7 +1219,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
 ///    [8 x i8] to determine if the element of the first vector is greater than
 ///    the corresponding element of the second vector.
 ///
@ -1241,7 +1241,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
 }

-/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
 ///    [4 x i16] to determine if the element of the first vector is greater than
 ///    the corresponding element of the second vector.
 ///
@ -1263,7 +1263,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
 }

-/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
+/// Compares the 32-bit integer elements of two 64-bit integer vectors of
 ///    [2 x i32] to determine if the element of the first vector is greater than
 ///    the corresponding element of the second vector.
 ///
@ -1285,20 +1285,20 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
 }

-/// \brief Constructs a 64-bit integer vector initialized to zero.
+/// Constructs a 64-bit integer vector initialized to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_setzero_si64(void)
 {
-    return (__m64){ 0LL };
+    return __extension__ (__m64){ 0LL };
 }

-/// \brief Constructs a 64-bit integer vector initialized with the specified
+/// Constructs a 64-bit integer vector initialized with the specified
 ///    32-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@ -1319,7 +1319,7 @@ _mm_set_pi32(int __i1, int __i0)
    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
 }

-/// \brief Constructs a 64-bit integer vector initialized with the specified
+/// Constructs a 64-bit integer vector initialized with the specified
 ///    16-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@ -1342,7 +1342,7 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
 }

-/// \brief Constructs a 64-bit integer vector initialized with the specified
+/// Constructs a 64-bit integer vector initialized with the specified
 ///    8-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@ -1375,13 +1375,14 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
                                               __b4, __b5, __b6, __b7);
 }

-/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
+/// Constructs a 64-bit integer vector of [2 x i32], with each of the
 ///    32-bit integer vector elements set to the specified 32-bit integer
 ///    value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
 ///
 /// \param __i
 ///    A 32-bit integer value used to initialize each vector element of the
@ -1393,13 +1394,14 @@ _mm_set1_pi32(int __i)
    return _mm_set_pi32(__i, __i);
 }

-/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
+/// Constructs a 64-bit integer vector of [4 x i16], with each of the
 ///    16-bit integer vector elements set to the specified 16-bit integer
 ///    value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
 ///
 /// \param __w
 ///    A 16-bit integer value used to initialize each vector element of the
@ -1411,13 +1413,13 @@ _mm_set1_pi16(short __w)
    return _mm_set_pi16(__w, __w, __w, __w);
 }

-/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
+/// Constructs a 64-bit integer vector of [8 x i8], with each of the
 ///    8-bit integer vector elements set to the specified 8-bit integer value.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
-///    PSHUFLW </c> instruction.
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
 ///
 /// \param __b
 ///    An 8-bit integer value used to initialize each vector element of the
@ -1429,7 +1431,7 @@ _mm_set1_pi8(char __b)
    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
 }

-/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+/// Constructs a 64-bit integer vector, initialized in reverse order with
 ///    the specified 32-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@ -1450,7 +1452,7 @@ _mm_setr_pi32(int __i0, int __i1)
    return _mm_set_pi32(__i1, __i0);
 }

-/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+/// Constructs a 64-bit integer vector, initialized in reverse order with
 ///    the specified 16-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
@ -1473,7 +1475,7 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
    return _mm_set_pi16(__w3, __w2, __w1, __w0);
 }

-/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
+/// Constructs a 64-bit integer vector, initialized in reverse order with
 ///    the specified 8-bit integer values.
 ///
 /// \headerfile <x86intrin.h>
--- a/c_headers/module.modulemap
+++ b/c_headers/module.modulemap
@ -38,6 +38,7 @@ module _Builtin_intrinsics [system] [extern_c] {
    explicit module neon {
      requires neon
      header "arm_neon.h"
+      header "arm_fp16.h"
      export *
    }
  }
@ -62,6 +63,17 @@ module _Builtin_intrinsics [system] [extern_c] {
    textual header "fma4intrin.h"
    textual header "mwaitxintrin.h"
    textual header "clzerointrin.h"
+    textual header "wbnoinvdintrin.h"
+    textual header "cldemoteintrin.h"
+    textual header "waitpkgintrin.h"
+    textual header "movdirintrin.h"
+    textual header "pconfigintrin.h"
+    textual header "sgxintrin.h"
+    textual header "ptwriteintrin.h"
+    textual header "invpcidintrin.h"
+
+    textual header "__wmmintrin_aes.h"
+    textual header "__wmmintrin_pclmul.h"

    explicit module mm_malloc {
      requires !freestanding
@ -128,14 +140,6 @@ module _Builtin_intrinsics [system] [extern_c] {
      export aes
      export pclmul
    }
-
-    explicit module aes {
-      header "__wmmintrin_aes.h"
-    }
-
-    explicit module pclmul {
-      header "__wmmintrin_pclmul.h"
-    }
  }

  explicit module systemz {
--- a/c_headers/movdirintrin.h
+++ b/c_headers/movdirintrin.h
@ -0,0 +1,63 @@
+/*===------------------------- movdirintrin.h ------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MOVDIRINTRIN_H
+#define _MOVDIRINTRIN_H
+
+/* Move doubleword as direct store */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__,  __target__("movdiri")))
+_directstoreu_u32 (void *__dst, unsigned int  __value)
+{
+  __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
+}
+
+#ifdef __x86_64__
+
+/* Move quadword as direct store */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__,  __target__("movdiri")))
+_directstoreu_u64 (void *__dst, unsigned long __value)
+{
+  __builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
+}
+
+#endif /* __x86_64__ */
+
+/*
+ * movdir64b - Move 64 bytes as direct store.
+ * The destination must be 64 byte aligned, and the store is atomic.
+ * The source address has no alignment requirement, and the load from
+ * the source address is not atomic.
+ */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__,  __target__("movdir64b")))
+_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
+{
+  __builtin_ia32_movdir64b(__dst, __src);
+}
+
+#endif /* _MOVDIRINTRIN_H */
--- a/c_headers/mwaitxintrin.h
+++ b/c_headers/mwaitxintrin.h
@ -25,8 +25,8 @@
 #error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
 #endif

-#ifndef _MWAITXINTRIN_H
-#define _MWAITXINTRIN_H
+#ifndef __MWAITXINTRIN_H
+#define __MWAITXINTRIN_H

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
@ -44,4 +44,4 @@ _mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)

 #undef __DEFAULT_FN_ATTRS

-#endif /* _MWAITXINTRIN_H */
+#endif /* __MWAITXINTRIN_H */
--- a/c_headers/nmmintrin.h
+++ b/c_headers/nmmintrin.h
@ -21,10 +21,10 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef _NMMINTRIN_H
-#define _NMMINTRIN_H
+#ifndef __NMMINTRIN_H
+#define __NMMINTRIN_H

 /* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
   just include it now then.  */
 #include <smmintrin.h>
-#endif /* _NMMINTRIN_H */
+#endif /* __NMMINTRIN_H */
--- a/c_headers/opencl-c.h
+++ b/c_headers/opencl-c.h
@ -11540,7 +11540,7 @@ half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c);
 *
 * vstoren write sizeof (gentypen) bytes given by data to address (p + (offset * n)).
 *
- * The address computed as (p + (offset * n)) must be 
+ * The address computed as (p + (offset * n)) must be
 * 8-bit aligned if gentype is char, uchar;
 * 16-bit aligned if gentype is short, ushort, half;
 * 32-bit aligned if gentype is int, uint, float;
@ -12862,7 +12862,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
 * Read memory barrier that orders only
 * loads.
 * The flags argument specifies the memory
- * address space and can be set to to a
+ * address space and can be set to a
 * combination of the following literal
 * values:
 * CLK_LOCAL_MEM_FENCE
@ -12874,7 +12874,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
 * Write memory barrier that orders only
 * stores.
 * The flags argument specifies the memory
- * address space and can be set to to a
+ * address space and can be set to a
 * combination of the following literal
 * values:
 * CLK_LOCAL_MEM_FENCE
@ -12888,7 +12888,7 @@ void __ovld write_mem_fence(cl_mem_fence_flags flags);
 cl_mem_fence_flags __ovld get_fence(const void *ptr);
 cl_mem_fence_flags __ovld get_fence(void *ptr);

-/** 
+/**
 * Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
 * and checked in Sema since they should be declared as
 *   addr gentype* to_addr (gentype*);
@ -13773,7 +13773,7 @@ ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long opera
 // add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
 // or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.

-#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) 
+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
 uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
 uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
 uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
@ -14571,7 +14571,7 @@ int printf(__constant const char* st, ...);
 * only. The filter_mode specified in sampler
 * must be set to CLK_FILTER_NEAREST; otherwise
 * the values returned are undefined.
- 
+
 * The read_image{f|i|ui} calls that take
 * integer coordinates must use a sampler with
 * normalized coordinates set to
@ -15421,8 +15421,8 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept
 #define CLK_DEPTH_STENCIL     0x10BE
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #define CLK_sRGB              0x10BF
-#define CLK_sRGBA             0x10C1
 #define CLK_sRGBx             0x10C0
+#define CLK_sRGBA             0x10C1
 #define CLK_sBGRA             0x10C2
 #define CLK_ABGR              0x10C3
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
--- a/c_headers/pconfigintrin.h
+++ b/c_headers/pconfigintrin.h
@ -0,0 +1,50 @@
+/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __PCONFIGINTRIN_H
+#define __PCONFIGINTRIN_H
+
+#define __PCONFIG_KEY_PROGRAM 0x00000001
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("pconfig")))
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("pconfig"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- a/c_headers/pkuintrin.h
+++ b/c_headers/pkuintrin.h
@ -1,4 +1,4 @@
-/*===------------- pkuintrin.h - PKU intrinsics ------------------===
+/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -40,7 +40,7 @@ _rdpkru_u32(void)
 static __inline__ void __DEFAULT_FN_ATTRS
 _wrpkru(unsigned int __val)
 {
-  return __builtin_ia32_wrpkru(__val);
+  __builtin_ia32_wrpkru(__val);
 }

 #undef __DEFAULT_FN_ATTRS
--- a/c_headers/pmmintrin.h
+++ b/c_headers/pmmintrin.h
@ -28,9 +28,9 @@

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS \
-  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
+  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))

-/// \brief Loads data from an unaligned memory location to elements in a 128-bit
+/// Loads data from an unaligned memory location to elements in a 128-bit
 ///    vector.
 ///
 ///    If the address of the data is not 16-byte aligned, the instruction may
@ -50,7 +50,7 @@ _mm_lddqu_si128(__m128i const *__p)
  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
 }

-/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+/// Adds the even-indexed values and subtracts the odd-indexed values of
 ///    two 128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@ -69,7 +69,7 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
 }

-/// \brief Horizontally adds the adjacent pairs of values contained in two
+/// Horizontally adds the adjacent pairs of values contained in two
 ///    128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@ -92,7 +92,7 @@ _mm_hadd_ps(__m128 __a, __m128 __b)
  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in two
+/// Horizontally subtracts the adjacent pairs of values contained in two
 ///    128-bit vectors of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@ -115,7 +115,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
 }

-/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
+/// Moves and duplicates odd-indexed values from a 128-bit vector
 ///    of [4 x float] to float values stored in a 128-bit vector of
 ///    [4 x float].
 ///
@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a)
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
 }

-/// \brief Duplicates even-indexed values from a 128-bit vector of
+/// Duplicates even-indexed values from a 128-bit vector of
 ///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
@ -158,7 +158,7 @@ _mm_moveldup_ps(__m128 __a)
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
 }

-/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
+/// Adds the even-indexed values and subtracts the odd-indexed values of
 ///    two 128-bit vectors of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
@ -177,7 +177,7 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
 }

-/// \brief Horizontally adds the pairs of values contained in two 128-bit
+/// Horizontally adds the pairs of values contained in two 128-bit
 ///    vectors of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
@ -200,7 +200,7 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
 }

-/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
+/// Horizontally subtracts the pairs of values contained in two 128-bit
 ///    vectors of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
@ -223,13 +223,13 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
 }

-/// \brief Moves and duplicates one double-precision value to double-precision
+/// Moves and duplicates one double-precision value to double-precision
 ///    values stored in a 128-bit vector of [2 x double].
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// __m128d _mm_loaddup_pd(double const * dp);
+/// __m128d _mm_loaddup_pd(double const *dp);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
@ -240,7 +240,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
 ///    duplicated values.
 #define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)

-/// \brief Moves and duplicates the double-precision value in the lower bits of
+/// Moves and duplicates the double-precision value in the lower bits of
 ///    a 128-bit vector of [2 x double] to double-precision values stored in a
 ///    128-bit vector of [2 x double].
 ///
@ -259,7 +259,7 @@ _mm_movedup_pd(__m128d __a)
  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }

-/// \brief Establishes a linear address memory range to be monitored and puts
+/// Establishes a linear address memory range to be monitored and puts
 ///    the processor in the monitor event pending state. Data stored in the
 ///    monitored address range causes the processor to exit the pending state.
 ///
@ -280,7 +280,7 @@ _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
 }

-/// \brief Used with the MONITOR instruction to wait while the processor is in
+/// Used with the MONITOR instruction to wait while the processor is in
 ///    the monitor event pending state. Data stored in the monitored address
 ///    range causes the processor to exit the pending state.
 ///
--- a/c_headers/popcntintrin.h
+++ b/c_headers/popcntintrin.h
@ -21,13 +21,13 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef _POPCNTINTRIN_H
-#define _POPCNTINTRIN_H
+#ifndef __POPCNTINTRIN_H
+#define __POPCNTINTRIN_H

 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))

-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -43,7 +43,7 @@ _mm_popcnt_u32(unsigned int __A)
  return __builtin_popcount(__A);
 }

-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -60,7 +60,7 @@ _popcnt32(int __A)
 }

 #ifdef __x86_64__
-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -76,7 +76,7 @@ _mm_popcnt_u64(unsigned long long __A)
  return __builtin_popcountll(__A);
 }

-/// \brief Counts the number of bits in the source operand having a value of 1.
+/// Counts the number of bits in the source operand having a value of 1.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -95,4 +95,4 @@ _popcnt64(long long __A)

 #undef __DEFAULT_FN_ATTRS

-#endif /* _POPCNTINTRIN_H */
+#endif /* __POPCNTINTRIN_H */
--- a/c_headers/prfchwintrin.h
+++ b/c_headers/prfchwintrin.h
@ -28,8 +28,7 @@
 #ifndef __PRFCHWINTRIN_H
 #define __PRFCHWINTRIN_H

-#if defined(__PRFCHW__) || defined(__3dNOW__)
-/// \brief Loads a memory sequence containing the specified memory address into
+/// Loads a memory sequence containing the specified memory address into
 ///    all data cache levels. The cache-coherency state is set to exclusive.
 ///    Data can be read from and written to the cache line without additional
 ///    delay.
@ -46,7 +45,7 @@ _m_prefetch(void *__P)
  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
 }

-/// \brief Loads a memory sequence containing the specified memory address into
+/// Loads a memory sequence containing the specified memory address into
 ///    the L1 data cache and sets the cache-coherency to modified. This
 ///    provides a hint to the processor that the cache line will be modified.
 ///    It is intended for use when the cache line will be written to shortly
@ -66,6 +65,5 @@ _m_prefetchw(void *__P)
 {
  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
 }
-#endif

 #endif /* __PRFCHWINTRIN_H */
--- a/c_headers/ptwriteintrin.h
+++ b/c_headers/ptwriteintrin.h
@ -0,0 +1,51 @@
+/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __PTWRITEINTRIN_H
+#define __PTWRITEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("ptwrite")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_ptwrite32(unsigned int __value) {
+  __builtin_ia32_ptwrite32(__value);
+}
+
+#ifdef __x86_64__
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_ptwrite64(unsigned long long __value) {
+  __builtin_ia32_ptwrite64(__value);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PTWRITEINTRIN_H */
--- a/c_headers/rdseedintrin.h
+++ b/c_headers/rdseedintrin.h
@ -21,7 +21,7 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef __X86INTRIN_H
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
 #error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
 #endif

--- a/c_headers/sgxintrin.h
+++ b/c_headers/sgxintrin.h
@ -0,0 +1,70 @@
+/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __SGXINTRIN_H
+#define __SGXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("sgx")))
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("enclu"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("encls"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+  unsigned int __result;
+  __asm__ ("enclv"
+           : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+           : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+           : "cc");
+  return __result;
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- a/c_headers/shaintrin.h
+++ b/c_headers/shaintrin.h
@ -29,10 +29,10 @@
 #define __SHAINTRIN_H

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))

-#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
-  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
+#define _mm_sha1rnds4_epu32(V1, V2, M) \
+  __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))

 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
--- a/c_headers/smmintrin.h
+++ b/c_headers/smmintrin.h
@ -21,13 +21,13 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef _SMMINTRIN_H
-#define _SMMINTRIN_H
+#ifndef __SMMINTRIN_H
+#define __SMMINTRIN_H

 #include <tmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))

 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT    0x00
@ -46,7 +46,7 @@
 #define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)

-/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
+/// Rounds up each element of the 128-bit vector of [4 x float] to an
 ///    integer and returns the rounded values in a 128-bit vector of
 ///    [4 x float].
 ///
@ -63,7 +63,7 @@
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)

-/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
+/// Rounds up each element of the 128-bit vector of [2 x double] to an
 ///    integer and returns the rounded values in a 128-bit vector of
 ///    [2 x double].
 ///
@ -80,7 +80,7 @@
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)

-/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
 ///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
 ///    operand to an integer and copies it to the lowest element of the 128-bit
@ -105,7 +105,7 @@
 ///    values.
 #define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

-/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
 ///    Rounds up the lower element of the second 128-bit vector operand to an
 ///    integer and copies it to the lower element of the 128-bit result vector
@ -130,7 +130,7 @@
 ///    values.
 #define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

-/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
+/// Rounds down each element of the 128-bit vector of [4 x float] to an
 ///    an integer and returns the rounded values in a 128-bit vector of
 ///    [4 x float].
 ///
@ -147,7 +147,7 @@
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)

-/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
+/// Rounds down each element of the 128-bit vector of [2 x double] to an
 ///    integer and returns the rounded values in a 128-bit vector of
 ///    [2 x double].
 ///
@ -164,7 +164,7 @@
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)

-/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
 ///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
 ///    operand to an integer and copies it to the lowest element of the 128-bit
@ -189,7 +189,7 @@
 ///    values.
 #define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

-/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
 ///    Rounds down the lower element of the second 128-bit vector operand to an
 ///    integer and copies it to the lower element of the 128-bit result vector
@ -214,7 +214,7 @@
 ///    values.
 #define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

-/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
+/// Rounds each element of the 128-bit vector of [4 x float] to an
 ///    integer value according to the rounding control specified by the second
 ///    argument and returns the rounded values in a 128-bit vector of
 ///    [4 x float].
@ -244,10 +244,10 @@
 ///      10: Upward (toward positive infinity) \n
 ///      11: Truncated
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_round_ps(X, M) __extension__ ({ \
-  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+#define _mm_round_ps(X, M) \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))

-/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
 ///    [4 x float]. Rounds the lowest element of the second 128-bit vector
 ///    operand to an integer value according to the rounding control specified
@ -285,11 +285,11 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 ///    values.
-#define _mm_round_ss(X, Y, M) __extension__ ({ \
+#define _mm_round_ss(X, Y, M) \
  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
-                                 (__v4sf)(__m128)(Y), (M)); })
+                                 (__v4sf)(__m128)(Y), (M))

-/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
+/// Rounds each element of the 128-bit vector of [2 x double] to an
 ///    integer value according to the rounding control specified by the second
 ///    argument and returns the rounded values in a 128-bit vector of
 ///    [2 x double].
@ -319,10 +319,10 @@
 ///      10: Upward (toward positive infinity) \n
 ///      11: Truncated
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_round_pd(X, M) __extension__ ({ \
-  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+#define _mm_round_pd(X, M) \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))

-/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
 ///    Rounds the lower element of the second 128-bit vector operand to an
 ///    integer value according to the rounding control specified by the third
@ -360,12 +360,12 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 ///    values.
-#define _mm_round_sd(X, Y, M) __extension__ ({ \
+#define _mm_round_sd(X, Y, M) \
  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
-                                  (__v2df)(__m128d)(Y), (M)); })
+                                  (__v2df)(__m128d)(Y), (M))

 /* SSE4 Packed Blending Intrinsics.  */
-/// \brief Returns a 128-bit vector of [2 x double] where the values are
+/// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
 ///    third operand, the control mask.
 ///
@ -389,13 +389,11 @@
 ///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
-  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
-                                   (__v2df)(__m128d)(V2), \
-                                   (((M) & 0x01) ? 2 : 0), \
-                                   (((M) & 0x02) ? 3 : 1)); })
+#define _mm_blend_pd(V1, V2, M) \
+  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+                                    (__v2df)(__m128d)(V2), (int)(M))

-/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
+/// Returns a 128-bit vector of [4 x float] where the values are selected
 ///    from either the first or second operand as specified by the third
 ///    operand, the control mask.
 ///
@ -419,14 +417,11 @@
 ///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
-  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
-                                  (((M) & 0x01) ? 4 : 0), \
-                                  (((M) & 0x02) ? 5 : 1), \
-                                  (((M) & 0x04) ? 6 : 2), \
-                                  (((M) & 0x08) ? 7 : 3)); })
+#define _mm_blend_ps(V1, V2, M) \
+  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+                                   (__v4sf)(__m128)(V2), (int)(M))

-/// \brief Returns a 128-bit vector of [2 x double] where the values are
+/// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
 ///    third operand, the control mask.
 ///
@ -453,7 +448,7 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
                                            (__v2df)__M);
 }

-/// \brief Returns a 128-bit vector of [4 x float] where the values are
+/// Returns a 128-bit vector of [4 x float] where the values are
 ///    selected from either the first or second operand as specified by the
 ///    third operand, the control mask.
 ///
@ -480,7 +475,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
                                           (__v4sf)__M);
 }

-/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
+/// Returns a 128-bit vector of [16 x i8] where the values are selected
 ///    from either of the first or second operand as specified by the third
 ///    operand, the control mask.
 ///
@ -493,7 +488,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
 /// \param __V2
 ///    A 128-bit vector of [16 x i8].
 /// \param __M
-///    A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
+///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
 ///    how the values are to be copied. The position of the mask bit corresponds
 ///    to the most significant bit of a copied value. When a mask bit is 0, the
 ///    corresponding 8-bit element in operand \a __V1 is copied to the same
@ -507,7 +502,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
                                               (__v16qi)__M);
 }

-/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
+/// Returns a 128-bit vector of [8 x i16] where the values are selected
 ///    from either of the first or second operand as specified by the third
 ///    operand, the control mask.
 ///
@ -531,20 +526,12 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 ///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
-#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
-                                   (__v8hi)(__m128i)(V2), \
-                                   (((M) & 0x01) ?  8 : 0), \
-                                   (((M) & 0x02) ?  9 : 1), \
-                                   (((M) & 0x04) ? 10 : 2), \
-                                   (((M) & 0x08) ? 11 : 3), \
-                                   (((M) & 0x10) ? 12 : 4), \
-                                   (((M) & 0x20) ? 13 : 5), \
-                                   (((M) & 0x40) ? 14 : 6), \
-                                   (((M) & 0x80) ? 15 : 7)); })
+#define _mm_blend_epi16(V1, V2, M) \
+  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+                                       (__v8hi)(__m128i)(V2), (int)(M))

 /* SSE4 Dword Multiply Instructions.  */
-/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32]
+/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
 ///    and returns the lower 32 bits of the each product in a 128-bit vector of
 ///    [4 x i32].
 ///
@ -563,7 +550,7 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
  return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
 }

-/// \brief Multiplies corresponding even-indexed elements of two 128-bit
+/// Multiplies corresponding even-indexed elements of two 128-bit
 ///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
 ///    containing the products.
 ///
@ -584,7 +571,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 }

 /* SSE4 Floating Point Dot Product Instructions.  */
-/// \brief Computes the dot product of the two 128-bit vectors of [4 x float]
+/// Computes the dot product of the two 128-bit vectors of [4 x float]
 ///    and returns it in the elements of the 128-bit result vector of
 ///    [4 x float].
 ///
@ -616,11 +603,11 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    each [4 x float] subvector. If a bit is set, the dot product is returned
 ///    in the corresponding element; otherwise that element is set to zero.
 /// \returns A 128-bit vector of [4 x float] containing the dot product.
-#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+#define _mm_dp_ps(X, Y, M) \
  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
-                               (__v4sf)(__m128)(Y), (M)); })
+                               (__v4sf)(__m128)(Y), (M))

-/// \brief Computes the dot product of the two 128-bit vectors of [2 x double]
+/// Computes the dot product of the two 128-bit vectors of [2 x double]
 ///    and returns it in the elements of the 128-bit result vector of
 ///    [2 x double].
 ///
@ -651,12 +638,12 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    to the lowest element and bit [1] corresponding to the highest element of
 ///    each [2 x double] vector. If a bit is set, the dot product is returned in
 ///    the corresponding element; otherwise that element is set to zero.
-#define _mm_dp_pd(X, Y, M) __extension__ ({\
+#define _mm_dp_pd(X, Y, M) \
  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
-                                (__v2df)(__m128d)(Y), (M)); })
+                                (__v2df)(__m128d)(Y), (M))

 /* SSE4 Streaming Load Hint Instruction.  */
-/// \brief Loads integer values from a 128-bit aligned memory location to a
+/// Loads integer values from a 128-bit aligned memory location to a
 ///    128-bit integer vector.
 ///
 /// \headerfile <x86intrin.h>
@ -675,7 +662,7 @@ _mm_stream_load_si128 (__m128i const *__V)
 }

 /* SSE4 Packed Integer Min/Max Instructions.  */
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
 ///    of the two values.
 ///
@ -694,7 +681,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
 ///    greater value of the two.
 ///
@ -713,7 +700,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
 ///    value of the two.
 ///
@ -732,7 +719,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
 ///    greater value of the two.
 ///
@ -751,7 +738,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
 ///    value of the two.
 ///
@ -770,7 +757,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
 ///    greater value of the two.
 ///
@ -789,7 +776,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
 ///    value of the two.
 ///
@ -808,7 +795,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
 }

-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
 ///    greater value of the two.
 ///
@ -828,7 +815,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 }

 /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
-/// \brief Takes the first argument \a X and inserts an element from the second
+/// Takes the first argument \a X and inserts an element from the second
 ///    argument \a Y as selected by the third argument \a N. That result then
 ///    has elements zeroed out also as selected by the third argument \a N. The
 ///    resulting 128-bit vector of [4 x float] is then returned.
@ -870,7 +857,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    single-precision floating point elements from the operands.
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))

-/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
 ///    returns it, using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@ -893,15 +880,14 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    11: Bits [127:96] of parameter \a X are returned.
 /// \returns A 32-bit integer containing the extracted 32 bits of float data.
 #define _mm_extract_ps(X, N) (__extension__                      \
-                              ({ union { int __i; float __f; } __t;  \
-                                 __v4sf __a = (__v4sf)(__m128)(X);       \
-                                 __t.__f = __a[(N) & 3];                 \
-                                 __t.__i;}))
+  ({ union { int __i; float __f; } __t;  \
+     __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
+     __t.__i;}))

 /* Miscellaneous insert and extract macros.  */
 /* Extract a single-precision float from X at index N into D.  */
-#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
-                                                    (D) = __a[N]; }))
+#define _MM_EXTRACT_FLOAT(D, X, N) \
+  { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }

 /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  */
@ -912,7 +898,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

 /* Insert int into packed integer array at index.  */
-/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
+/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
 ///    of an integer parameter \a I into an offset specified by the immediate
 ///    value parameter \a N.
@ -952,12 +938,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    1110: Bits [119:112] of the result are used for insertion. \n
 ///    1111: Bits [127:120] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi8(X, I, N) (__extension__                           \
-                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
-                                     __a[(N) & 15] = (I);                 \
-                                     (__m128i)__a;}))
+#define _mm_insert_epi8(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
+                                        (int)(I), (int)(N))

-/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
+/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the 32-bit
 ///    integer parameter \a I at the offset specified by the immediate value
 ///    parameter \a N.
@ -985,13 +970,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    10: Bits [95:64] of the result are used for insertion. \n
 ///    11: Bits [127:96] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi32(X, I, N) (__extension__                         \
-                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
-                                      __a[(N) & 3] = (I);                \
-                                      (__m128i)__a;}))
+#define _mm_insert_epi32(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
+                                       (int)(I), (int)(N))

 #ifdef __x86_64__
-/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
+/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the 64-bit
 ///    integer parameter \a I, using the immediate value parameter \a N as an
 ///    insertion location selector.
@ -1017,16 +1001,15 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    0: Bits [63:0] of the result are used for insertion. \n
 ///    1: Bits [127:64] of the result are used for insertion. \n
 /// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi64(X, I, N) (__extension__                         \
-                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
-                                      __a[(N) & 1] = (I);                \
-                                      (__m128i)__a;}))
+#define _mm_insert_epi64(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
+                                       (long long)(I), (int)(N))
 #endif /* __x86_64__ */

 /* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
-/// \brief Extracts an 8-bit element from the 128-bit integer vector of
+/// Extracts an 8-bit element from the 128-bit integer vector of
 ///    [16 x i8], using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@ -1061,11 +1044,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 /// \returns  An unsigned integer, whose lower 8 bits are selected from the
 ///    128-bit integer vector parameter and the remaining bits are assigned
 ///    zeros.
-#define _mm_extract_epi8(X, N) (__extension__                           \
-                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
-                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi8(X, N) \
+  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
+                                                   (int)(N))

-/// \brief Extracts a 32-bit element from the 128-bit integer vector of
+/// Extracts a 32-bit element from the 128-bit integer vector of
 ///    [4 x i32], using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@ -1087,12 +1070,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    11: Bits [127:96] of the parameter \a X are exracted.
 /// \returns  An integer, whose lower 32 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
-#define _mm_extract_epi32(X, N) (__extension__                         \
-                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
-                                    (int)__a[(N) & 3];}))
+#define _mm_extract_epi32(X, N) \
+  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))

 #ifdef __x86_64__
-/// \brief Extracts a 64-bit element from the 128-bit integer vector of
+/// Extracts a 64-bit element from the 128-bit integer vector of
 ///    [2 x i64], using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@ -1111,13 +1093,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    0: Bits [63:0] are returned. \n
 ///    1: Bits [127:64] are returned. \n
 /// \returns  A 64-bit integer.
-#define _mm_extract_epi64(X, N) (__extension__                         \
-                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
-                                    (long long)__a[(N) & 1];}))
+#define _mm_extract_epi64(X, N) \
+  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
 #endif /* __x86_64 */

 /* SSE4 128-bit Packed Integer Comparisons.  */
-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    zeros.
 ///
 /// \headerfile <x86intrin.h>
@ -1135,7 +1116,7 @@ _mm_testz_si128(__m128i __M, __m128i __V)
  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }

-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    ones.
 ///
 /// \headerfile <x86intrin.h>
@ -1153,7 +1134,7 @@ _mm_testc_si128(__m128i __M, __m128i __V)
  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
 }

-/// \brief Tests whether the specified bits in a 128-bit integer vector are
+/// Tests whether the specified bits in a 128-bit integer vector are
 ///    neither all zeros nor all ones.
 ///
 /// \headerfile <x86intrin.h>
@ -1172,7 +1153,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
 }

-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    ones.
 ///
 /// \headerfile <x86intrin.h>
@ -1189,7 +1170,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 ///    otherwise.
 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

-/// \brief Tests whether the specified bits in a 128-bit integer vector are
+/// Tests whether the specified bits in a 128-bit integer vector are
 ///    neither all zeros nor all ones.
 ///
 /// \headerfile <x86intrin.h>
@ -1208,7 +1189,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 ///    FALSE otherwise.
 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))

-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    zeros.
 ///
 /// \headerfile <x86intrin.h>
@ -1227,7 +1208,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

 /* SSE4 64-bit Packed Integer Comparisons.  */
-/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+/// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors for equality.
 ///
 /// \headerfile <x86intrin.h>
@ -1246,7 +1227,7 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 }

 /* SSE4 Packed Integer Sign-Extension.  */
-/// \brief Sign-extends each of the lower eight 8-bit integer elements of a
+/// Sign-extends each of the lower eight 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
 ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
 ///    are unused.
@ -1267,7 +1248,7 @@ _mm_cvtepi8_epi16(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }

-/// \brief Sign-extends each of the lower four 8-bit integer elements of a
+/// Sign-extends each of the lower four 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
 ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
 ///    vector are unused.
@ -1277,8 +1258,8 @@ _mm_cvtepi8_epi16(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
-///    extended to 32-bit values.
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+///    sign-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
@ -1288,7 +1269,7 @@ _mm_cvtepi8_epi32(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
 }

-/// \brief Sign-extends each of the lower two 8-bit integer elements of a
+/// Sign-extends each of the lower two 8-bit integer elements of a
 ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
 ///    vector are unused.
@ -1298,8 +1279,8 @@ _mm_cvtepi8_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
-///    extended to 64-bit values.
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+///    sign-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
@ -1309,7 +1290,7 @@ _mm_cvtepi8_epi64(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }

-/// \brief Sign-extends each of the lower four 16-bit integer elements of a
+/// Sign-extends each of the lower four 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
 ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
 ///    vector are unused.
@ -1319,8 +1300,8 @@ _mm_cvtepi8_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
-///    extended to 32-bit values.
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+///    sign-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
@ -1328,7 +1309,7 @@ _mm_cvtepi16_epi32(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
 }

-/// \brief Sign-extends each of the lower two 16-bit integer elements of a
+/// Sign-extends each of the lower two 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper six elements of the input
 ///    vector are unused.
@ -1338,8 +1319,8 @@ _mm_cvtepi16_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
-///    extended to 64-bit values.
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+///     sign-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi64(__m128i __V)
@ -1347,7 +1328,7 @@ _mm_cvtepi16_epi64(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
 }

-/// \brief Sign-extends each of the lower two 32-bit integer elements of a
+/// Sign-extends each of the lower two 32-bit integer elements of a
 ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
 ///    are unused.
@ -1357,8 +1338,8 @@ _mm_cvtepi16_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
-///    extended to 64-bit values.
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+///    sign-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi64(__m128i __V)
@ -1367,7 +1348,7 @@ _mm_cvtepi32_epi64(__m128i __V)
 }

 /* SSE4 Packed Integer Zero-Extension.  */
-/// \brief Zero-extends each of the lower eight 8-bit integer elements of a
+/// Zero-extends each of the lower eight 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
 ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
 ///    are unused.
@ -1377,8 +1358,8 @@ _mm_cvtepi32_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
-///    extended to 16-bit values.
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+///    zero-extended to 16-bit values.
 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
@ -1386,7 +1367,7 @@ _mm_cvtepu8_epi16(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }

-/// \brief Zero-extends each of the lower four 8-bit integer elements of a
+/// Zero-extends each of the lower four 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
 ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
 ///    vector are unused.
@ -1396,8 +1377,8 @@ _mm_cvtepu8_epi16(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
-///    extended to 32-bit values.
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+///    zero-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
@ -1405,7 +1386,7 @@ _mm_cvtepu8_epi32(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
 }

-/// \brief Zero-extends each of the lower two 8-bit integer elements of a
+/// Zero-extends each of the lower two 8-bit integer elements of a
 ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
 ///    vector are unused.
@ -1415,8 +1396,8 @@ _mm_cvtepu8_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
-///    extended to 64-bit values.
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+///    zero-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
@ -1424,7 +1405,7 @@ _mm_cvtepu8_epi64(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
 }

-/// \brief Zero-extends each of the lower four 16-bit integer elements of a
+/// Zero-extends each of the lower four 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
 ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
 ///    vector are unused.
@ -1434,8 +1415,8 @@ _mm_cvtepu8_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
-///    extended to 32-bit values.
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+///    zero-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
@ -1443,7 +1424,7 @@ _mm_cvtepu16_epi32(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
 }

-/// \brief Zero-extends each of the lower two 16-bit integer elements of a
+/// Zero-extends each of the lower two 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
 ///    are unused.
@ -1453,8 +1434,8 @@ _mm_cvtepu16_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
-///    extended to 64-bit values.
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+///    zero-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
@ -1462,7 +1443,7 @@ _mm_cvtepu16_epi64(__m128i __V)
  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
 }

-/// \brief Zero-extends each of the lower two 32-bit integer elements of a
+/// Zero-extends each of the lower two 32-bit integer elements of a
 ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
 ///    are unused.
@ -1472,8 +1453,8 @@ _mm_cvtepu16_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
-///    extended to 64-bit values.
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+///    zero-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
@ -1482,7 +1463,7 @@ _mm_cvtepu32_epi64(__m128i __V)
 }

 /* SSE4 Pack with Unsigned Saturation.  */
-/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+/// Converts 32-bit signed integers from both 128-bit integer vector
 ///    operands into 16-bit unsigned integers, and returns the packed result.
 ///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
 ///    0x0000 are saturated to 0x0000.
@ -1511,7 +1492,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 }

 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
-/// \brief Subtracts 8-bit unsigned integer values and computes the absolute
+/// Subtracts 8-bit unsigned integer values and computes the absolute
 ///    values of the differences to the corresponding bits in the destination.
 ///    Then sums of the absolute differences are returned according to the bit
 ///    fields in the immediate operand.
@ -1534,23 +1515,23 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 ///    \code
 ///    // M2 represents bit 2 of the immediate operand
 ///    // M10 represents bits [1:0] of the immediate operand
-///    i = M2 * 4
-///    j = M10 * 4
+///    i = M2 * 4;
+///    j = M10 * 4;
 ///    for (k = 0; k < 8; k = k + 1) {
-///      d0 = abs(X[i + k + 0] - Y[j + 0])
-///      d1 = abs(X[i + k + 1] - Y[j + 1])
-///      d2 = abs(X[i + k + 2] - Y[j + 2])
-///      d3 = abs(X[i + k + 3] - Y[j + 3])
-///      r[k] = d0 + d1 + d2 + d3
+///      d0 = abs(X[i + k + 0] - Y[j + 0]);
+///      d1 = abs(X[i + k + 1] - Y[j + 1]);
+///      d2 = abs(X[i + k + 2] - Y[j + 2]);
+///      d3 = abs(X[i + k + 3] - Y[j + 3]);
+///      r[k] = d0 + d1 + d2 + d3;
 ///    }
 ///    \endcode
 /// \returns A 128-bit integer vector containing the sums of the sets of
 ///    absolute differences between both operands.
-#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+#define _mm_mpsadbw_epu8(X, Y, M) \
  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
-                                      (__v16qi)(__m128i)(Y), (M)); })
+                                      (__v16qi)(__m128i)(Y), (M))

-/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
+/// Finds the minimum unsigned 16-bit element in the input 128-bit
 ///    vector of [8 x u16] and returns it and along with its index.
 ///
 /// \headerfile <x86intrin.h>
@ -1604,7 +1585,7 @@ _mm_minpos_epu16(__m128i __V)
 #define _SIDD_UNIT_MASK                 0x40

 /* SSE4.2 Packed Comparison Intrinsics.  */
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns a 128-bit integer vector representing the result
 ///    mask of the comparison.
@ -1660,7 +1641,7 @@ _mm_minpos_epu16(__m128i __V)
  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                       (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns an integer representing the result index of the
 ///    comparison.
@ -1714,7 +1695,7 @@ _mm_minpos_epu16(__m128i __V)
  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                   (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns a 128-bit integer vector representing the result
 ///    mask of the comparison.
@ -1775,7 +1756,7 @@ _mm_minpos_epu16(__m128i __V)
                                       (__v16qi)(__m128i)(B), (int)(LB), \
                                       (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns an integer representing the result index of the
 ///    comparison.
@ -1835,7 +1816,7 @@ _mm_minpos_epu16(__m128i __V)
                                   (int)(M))

 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
 ///    string in \a B is the maximum, otherwise, returns 0.
@ -1885,7 +1866,7 @@ _mm_minpos_epu16(__m128i __V)
  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
 ///    0.
@ -1934,7 +1915,7 @@ _mm_minpos_epu16(__m128i __V)
  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
 ///
@ -1982,7 +1963,7 @@ _mm_minpos_epu16(__m128i __V)
  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
 ///    the maximum, otherwise, returns 0.
@ -2032,7 +2013,7 @@ _mm_minpos_epu16(__m128i __V)
  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
 ///    the maximum, otherwise, returns 0.
@ -2082,7 +2063,7 @@ _mm_minpos_epu16(__m128i __V)
  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
 ///    string in \a B is the maximum, otherwise, returns 0.
@ -2137,7 +2118,7 @@ _mm_minpos_epu16(__m128i __V)
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
 ///    returns 0.
@ -2191,7 +2172,7 @@ _mm_minpos_epu16(__m128i __V)
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
 ///
@ -2244,7 +2225,7 @@ _mm_minpos_epu16(__m128i __V)
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
 ///    the maximum, otherwise, returns 0.
@ -2299,7 +2280,7 @@ _mm_minpos_epu16(__m128i __V)
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))

-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
 ///    the maximum, otherwise, returns 0.
@ -2354,7 +2335,7 @@ _mm_minpos_epu16(__m128i __V)
                                    (int)(M))

 /* SSE4.2 Compare Packed Data -- Greater Than.  */
-/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+/// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are
 ///    greater than those in the second operand.
 ///
@ -2374,7 +2355,7 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 }

 /* SSE4.2 Accumulate CRC32.  */
-/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
 ///    unsigned char operand.
 ///
 /// \headerfile <x86intrin.h>
@ -2394,7 +2375,7 @@ _mm_crc32_u8(unsigned int __C, unsigned char __D)
  return __builtin_ia32_crc32qi(__C, __D);
 }

-/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
 ///    unsigned short operand.
 ///
 /// \headerfile <x86intrin.h>
@ -2414,7 +2395,7 @@ _mm_crc32_u16(unsigned int __C, unsigned short __D)
  return __builtin_ia32_crc32hi(__C, __D);
 }

-/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
+/// Adds the first unsigned integer operand to the CRC-32C checksum of
 ///    the second unsigned integer operand.
 ///
 /// \headerfile <x86intrin.h>
@ -2435,7 +2416,7 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D)
 }

 #ifdef __x86_64__
-/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
 ///    unsigned 64-bit integer operand.
 ///
 /// \headerfile <x86intrin.h>
@ -2458,8 +2439,6 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)

 #undef __DEFAULT_FN_ATTRS

-#ifdef __POPCNT__
 #include <popcntintrin.h>
-#endif

-#endif /* _SMMINTRIN_H */
+#endif /* __SMMINTRIN_H */
--- a/c_headers/stdint.h
+++ b/c_headers/stdint.h
@ -88,7 +88,7 @@
 *
 * To accommodate targets that are missing types that are exactly 8, 16, 32, or
 * 64 bits wide, this implementation takes an approach of cascading
- * redefintions, redefining __int_leastN_t to successively smaller exact-width
+ * redefinitions, redefining __int_leastN_t to successively smaller exact-width
 * types. It is therefore important that the types are defined in order of
 * descending widths.
 *
@ -461,7 +461,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 * As in the type definitions, this section takes an approach of
 * successive-shrinking to determine which limits to use for the standard (8,
 * 16, 32, 64) bit widths when they don't have exact representations. It is
- * therefore important that the defintions be kept in order of decending
+ * therefore important that the definitions be kept in order of decending
 * widths.
 *
 * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the
--- a/c_headers/tmmintrin.h
+++ b/c_headers/tmmintrin.h
@ -27,9 +27,10 @@
 #include <pmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))

-/// \brief Computes the absolute value of each of the packed 8-bit signed
+/// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
 ///    results in the destination.
 ///
@ -41,13 +42,13 @@
 ///    A 64-bit vector of [8 x i8].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_abs_pi8(__m64 __a)
 {
    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
 }

-/// \brief Computes the absolute value of each of the packed 8-bit signed
+/// Computes the absolute value of each of the packed 8-bit signed
 ///    integers in the source operand and stores the 8-bit unsigned integer
 ///    results in the destination.
 ///
@ -65,7 +66,7 @@ _mm_abs_epi8(__m128i __a)
    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
 }

-/// \brief Computes the absolute value of each of the packed 16-bit signed
+/// Computes the absolute value of each of the packed 16-bit signed
 ///    integers in the source operand and stores the 16-bit unsigned integer
 ///    results in the destination.
 ///
@ -77,13 +78,13 @@ _mm_abs_epi8(__m128i __a)
 ///    A 64-bit vector of [4 x i16].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_abs_pi16(__m64 __a)
 {
    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
 }

-/// \brief Computes the absolute value of each of the packed 16-bit signed
+/// Computes the absolute value of each of the packed 16-bit signed
 ///    integers in the source operand and stores the 16-bit unsigned integer
 ///    results in the destination.
 ///
@ -101,7 +102,7 @@ _mm_abs_epi16(__m128i __a)
    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
 }

-/// \brief Computes the absolute value of each of the packed 32-bit signed
+/// Computes the absolute value of each of the packed 32-bit signed
 ///    integers in the source operand and stores the 32-bit unsigned integer
 ///    results in the destination.
 ///
@ -113,13 +114,13 @@ _mm_abs_epi16(__m128i __a)
 ///    A 64-bit vector of [2 x i32].
 /// \returns A 64-bit integer vector containing the absolute values of the
 ///    elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_abs_pi32(__m64 __a)
 {
    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
 }

-/// \brief Computes the absolute value of each of the packed 32-bit signed
+/// Computes the absolute value of each of the packed 32-bit signed
 ///    integers in the source operand and stores the 32-bit unsigned integer
 ///    results in the destination.
 ///
@ -137,7 +138,7 @@ _mm_abs_epi32(__m128i __a)
    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
 }

-/// \brief Concatenates the two 128-bit integer vector operands, and
+/// Concatenates the two 128-bit integer vector operands, and
 ///    right-shifts the result by the number of bytes specified in the immediate
 ///    operand.
 ///
@ -157,11 +158,11 @@ _mm_abs_epi32(__m128i __a)
 ///    An immediate operand specifying how many bytes to right-shift the result.
 /// \returns A 128-bit integer vector containing the concatenated right-shifted
 ///    value.
-#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
+#define _mm_alignr_epi8(a, b, n) \
  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
-                                     (__v16qi)(__m128i)(b), (n)); })
+                                     (__v16qi)(__m128i)(b), (n))

-/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
+/// Concatenates the two 64-bit integer vector operands, and right-shifts
 ///    the result by the number of bytes specified in the immediate operand.
 ///
 /// \headerfile <x86intrin.h>
@ -180,10 +181,10 @@ _mm_abs_epi32(__m128i __a)
 ///    An immediate operand specifying how many bytes to right-shift the result.
 /// \returns A 64-bit integer vector containing the concatenated right-shifted
 ///    value.
-#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
-  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
+#define _mm_alignr_pi8(a, b, n) \
+  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))

-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [8 x i16].
 ///
 /// \headerfile <x86intrin.h>
@ -206,7 +207,7 @@ _mm_hadd_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
 }

-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    128-bit vectors of [4 x i32].
 ///
 /// \headerfile <x86intrin.h>
@ -229,7 +230,7 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
 }

-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    64-bit vectors of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
@ -246,13 +247,13 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hadd_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
 }

-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
 ///    64-bit vectors of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
@ -269,15 +270,16 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
 ///    destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
 ///    operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hadd_pi32(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }

-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
-///    128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
-///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
+///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+///    0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -299,9 +301,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }

-/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
-///    64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
-///    saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
+///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+///    0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -317,13 +320,13 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
 ///    destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hadds_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 128-bit vectors of [8 x i16].
 ///
 /// \headerfile <x86intrin.h>
@ -346,7 +349,7 @@ _mm_hsub_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 128-bit vectors of [4 x i32].
 ///
 /// \headerfile <x86intrin.h>
@ -369,7 +372,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 64-bit vectors of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
@ -386,13 +389,13 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hsub_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 64-bit vectors of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
@ -409,16 +412,16 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
 ///    the destination.
 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
 ///    of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hsub_pi32(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
-///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
-///    saturated to 8000h.
+///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
+///    saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -440,10 +443,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }

-/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
+/// Horizontally subtracts the adjacent pairs of values contained in 2
 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
-///    7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
-///    saturated to 8000h.
+///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
+///    saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@ -459,13 +462,13 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
 ///    the destination.
 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
 ///    differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_hsubs_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
 }

-/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
 ///    values contained in the first source operand and packed 8-bit signed
 ///    integer values contained in the second source operand, adds pairs of
 ///    contiguous products with signed saturation, and writes the 16-bit sums to
@ -499,7 +502,7 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
 }

-/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
 ///    values contained in the first source operand and packed 8-bit signed
 ///    integer values contained in the second source operand, adds pairs of
 ///    contiguous products with signed saturation, and writes the 16-bit sums to
@ -523,13 +526,13 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_maddubs_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
 }

-/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 ///    products to the 18 most significant bits by right-shifting, rounds the
 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 ///
@ -549,7 +552,7 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
 }

-/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
 ///    products to the 18 most significant bits by right-shifting, rounds the
 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
 ///
@ -563,13 +566,13 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
 ///    products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
 }

-/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
+/// Copies the 8-bit integers from a 128-bit integer vector to the
 ///    destination or clears 8-bit values in the destination, as specified by
 ///    the second source operand.
 ///
@ -595,7 +598,7 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
 }

-/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
+/// Copies the 8-bit integers from a 64-bit integer vector to the
 ///    destination or clears 8-bit values in the destination, as specified by
 ///    the second source operand.
 ///
@ -614,13 +617,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
 ///    destination. \n
 ///    Bits [3:0] select the source byte to be copied.
 /// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_shuffle_pi8(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
 }

-/// \brief For each 8-bit integer in the first source operand, perform one of
+/// For each 8-bit integer in the first source operand, perform one of
 ///    the following actions as specified by the second source operand.
 ///
 ///    If the byte in the second source is negative, calculate the two's
@ -646,7 +649,7 @@ _mm_sign_epi8(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
 }

-/// \brief For each 16-bit integer in the first source operand, perform one of
+/// For each 16-bit integer in the first source operand, perform one of
 ///    the following actions as specified by the second source operand.
 ///
 ///    If the word in the second source is negative, calculate the two's
@ -672,7 +675,7 @@ _mm_sign_epi16(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
 }

-/// \brief For each 32-bit integer in the first source operand, perform one of
+/// For each 32-bit integer in the first source operand, perform one of
 ///    the following actions as specified by the second source operand.
 ///
 ///    If the doubleword in the second source is negative, calculate the two's
@ -698,7 +701,7 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
 }

-/// \brief For each 8-bit integer in the first source operand, perform one of
+/// For each 8-bit integer in the first source operand, perform one of
 ///    the following actions as specified by the second source operand.
 ///
 ///    If the byte in the second source is negative, calculate the two's
@ -718,13 +721,13 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
 ///    A 64-bit integer vector containing control bytes corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sign_pi8(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
 }

-/// \brief For each 16-bit integer in the first source operand, perform one of
+/// For each 16-bit integer in the first source operand, perform one of
 ///    the following actions as specified by the second source operand.
 ///
 ///    If the word in the second source is negative, calculate the two's
@ -744,13 +747,13 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing control words corresponding to
 ///    positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sign_pi16(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
 }

-/// \brief For each 32-bit integer in the first source operand, perform one of
+/// For each 32-bit integer in the first source operand, perform one of
 ///    the following actions as specified by the second source operand.
 ///
 ///    If the doubleword in the second source is negative, calculate the two's
@ -770,12 +773,13 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
 ///    A 64-bit integer vector containing two control doublewords corresponding
 ///    to positions in the destination.
 /// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
 _mm_sign_pi32(__m64 __a, __m64 __b)
 {
    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
 }

 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX

 #endif /* __TMMINTRIN_H */
--- a/c_headers/vaesintrin.h
+++ b/c_headers/vaesintrin.h
@ -29,10 +29,10 @@
 #define __VAESINTRIN_H

 /* Default attributes for YMM forms. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))

 /* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes")))
+#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512)))


 static __inline__ __m256i __DEFAULT_FN_ATTRS
--- a/c_headers/vpclmulqdqintrin.h
+++ b/c_headers/vpclmulqdqintrin.h
@ -28,15 +28,15 @@
 #ifndef __VPCLMULQDQINTRIN_H
 #define __VPCLMULQDQINTRIN_H

-#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({    \
+#define _mm256_clmulepi64_epi128(A, B, I) \
  (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A),  \
                                       (__v4di)(__m256i)(B),  \
-                                       (char)(I)); })
+                                       (char)(I))

-#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({    \
+#define _mm512_clmulepi64_epi128(A, B, I) \
  (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A),  \
                                       (__v8di)(__m512i)(B),  \
-                                       (char)(I)); })
+                                       (char)(I))

-#endif // __VPCLMULQDQINTRIN_H
+#endif /* __VPCLMULQDQINTRIN_H */

--- a/c_headers/waitpkgintrin.h
+++ b/c_headers/waitpkgintrin.h
@ -0,0 +1,56 @@
+/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __WAITPKGINTRIN_H
+#define __WAITPKGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("waitpkg")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_umonitor (void * __address)
+{
+  __builtin_ia32_umonitor (__address);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_umwait (unsigned int __control, unsigned long long __counter)
+{
+  return __builtin_ia32_umwait (__control,
+    (unsigned int)(__counter >> 32), (unsigned int)__counter);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_tpause (unsigned int __control, unsigned long long __counter)
+{
+  return __builtin_ia32_tpause (__control,
+    (unsigned int)(__counter >> 32), (unsigned int)__counter);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __WAITPKGINTRIN_H */
--- a/c_headers/wbnoinvdintrin.h
+++ b/c_headers/wbnoinvdintrin.h
@ -0,0 +1,38 @@
+/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __WBNOINVDINTRIN_H
+#define __WBNOINVDINTRIN_H
+
+static __inline__ void
+  __attribute__((__always_inline__, __nodebug__,  __target__("wbnoinvd")))
+_wbnoinvd (void)
+{
+  __builtin_ia32_wbnoinvd ();
+}
+
+#endif /* __WBNOINVDINTRIN_H */
--- a/c_headers/wmmintrin.h
+++ b/c_headers/wmmintrin.h
@ -21,8 +21,8 @@
 *===-----------------------------------------------------------------------===
 */

-#ifndef _WMMINTRIN_H
-#define _WMMINTRIN_H
+#ifndef __WMMINTRIN_H
+#define __WMMINTRIN_H

 #include <emmintrin.h>

@ -30,4 +30,4 @@

 #include <__wmmintrin_pclmul.h>

-#endif /* _WMMINTRIN_H */
+#endif /* __WMMINTRIN_H */
--- a/c_headers/x86intrin.h
+++ b/c_headers/x86intrin.h
@ -32,26 +32,6 @@
 #include <mm3dnow.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
-#include <bmiintrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
-#include <bmi2intrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
-#include <lzcntintrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
-#include <popcntintrin.h>
-#endif
-
-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
-#include <rdseedintrin.h>
-#endif
-
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
 #include <prfchwintrin.h>
 #endif
@ -76,10 +56,6 @@
 #include <lwpintrin.h>
 #endif

-#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
-#include <f16cintrin.h>
-#endif
-
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
 #include <mwaitxintrin.h>
 #endif
@ -88,4 +64,5 @@
 #include <clzerointrin.h>
 #endif

+
 #endif /* __X86INTRIN_H */
--- a/c_headers/xmmintrin.h
+++ b/c_headers/xmmintrin.h
--- a/c_headers/xopintrin.h
+++ b/c_headers/xopintrin.h
@ -31,7 +31,8 @@
 #include <fma4intrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))

 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
@ -201,7 +202,7 @@ _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
  return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
 }

-static __inline__ __m256i __DEFAULT_FN_ATTRS
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
 {
  return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
@ -237,17 +238,17 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
  return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
 }

-#define _mm_roti_epi8(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
+#define _mm_roti_epi8(A, N) \
+  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))

-#define _mm_roti_epi16(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
+#define _mm_roti_epi16(A, N) \
+  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))

-#define _mm_roti_epi32(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
+#define _mm_roti_epi32(A, N) \
+  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))

-#define _mm_roti_epi64(A, N) __extension__ ({ \
-  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
+#define _mm_roti_epi64(A, N) \
+  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))

 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_shl_epi8(__m128i __A, __m128i __B)
@ -297,37 +298,37 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
  return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
 }

-#define _mm_com_epu8(A, B, N) __extension__ ({ \
+#define _mm_com_epu8(A, B, N) \
  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
-                                  (__v16qi)(__m128i)(B), (N)); })
+                                  (__v16qi)(__m128i)(B), (N))

-#define _mm_com_epu16(A, B, N) __extension__ ({ \
+#define _mm_com_epu16(A, B, N) \
  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
-                                  (__v8hi)(__m128i)(B), (N)); })
+                                  (__v8hi)(__m128i)(B), (N))

-#define _mm_com_epu32(A, B, N) __extension__ ({ \
+#define _mm_com_epu32(A, B, N) \
  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
-                                  (__v4si)(__m128i)(B), (N)); })
+                                  (__v4si)(__m128i)(B), (N))

-#define _mm_com_epu64(A, B, N) __extension__ ({ \
+#define _mm_com_epu64(A, B, N) \
  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
-                                  (__v2di)(__m128i)(B), (N)); })
+                                  (__v2di)(__m128i)(B), (N))

-#define _mm_com_epi8(A, B, N) __extension__ ({ \
+#define _mm_com_epi8(A, B, N) \
  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
-                                 (__v16qi)(__m128i)(B), (N)); })
+                                 (__v16qi)(__m128i)(B), (N))

-#define _mm_com_epi16(A, B, N) __extension__ ({ \
+#define _mm_com_epi16(A, B, N) \
  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
-                                 (__v8hi)(__m128i)(B), (N)); })
+                                 (__v8hi)(__m128i)(B), (N))

-#define _mm_com_epi32(A, B, N) __extension__ ({ \
+#define _mm_com_epi32(A, B, N) \
  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
-                                 (__v4si)(__m128i)(B), (N)); })
+                                 (__v4si)(__m128i)(B), (N))

-#define _mm_com_epi64(A, B, N) __extension__ ({ \
+#define _mm_com_epi64(A, B, N) \
  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
-                                 (__v2di)(__m128i)(B), (N)); })
+                                 (__v2di)(__m128i)(B), (N))

 #define _MM_PCOMCTRL_LT    0
 #define _MM_PCOMCTRL_LE    1
@ -722,24 +723,24 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
  return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
 }

-#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
+#define _mm_permute2_pd(X, Y, C, I) \
  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
                                     (__v2df)(__m128d)(Y), \
-                                     (__v2di)(__m128i)(C), (I)); })
+                                     (__v2di)(__m128i)(C), (I))

-#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
+#define _mm256_permute2_pd(X, Y, C, I) \
  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
                                        (__v4df)(__m256d)(Y), \
-                                        (__v4di)(__m256i)(C), (I)); })
+                                        (__v4di)(__m256i)(C), (I))

-#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
+#define _mm_permute2_ps(X, Y, C, I) \
  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
-                                    (__v4si)(__m128i)(C), (I)); })
+                                    (__v4si)(__m128i)(C), (I))

-#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
+#define _mm256_permute2_ps(X, Y, C, I) \
  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
                                       (__v8sf)(__m256)(Y), \
-                                       (__v8si)(__m256i)(C), (I)); })
+                                       (__v8si)(__m256i)(C), (I))

 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_frcz_ss(__m128 __A)
@ -765,18 +766,19 @@ _mm_frcz_pd(__m128d __A)
  return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
 }

-static __inline__ __m256 __DEFAULT_FN_ATTRS
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_frcz_ps(__m256 __A)
 {
  return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
 }

-static __inline__ __m256d __DEFAULT_FN_ATTRS
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_frcz_pd(__m256d __A)
 {
  return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
 }

 #undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS256

 #endif /* __XOPINTRIN_H */
--- a/c_headers/xsavecintrin.h
+++ b/c_headers/xsavecintrin.h
@ -1,4 +1,4 @@
-/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
+/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
--- a/c_headers/xsaveintrin.h
+++ b/c_headers/xsaveintrin.h
@ -1,4 +1,4 @@
-/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
+/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@ -33,23 +33,23 @@

 static __inline__ void __DEFAULT_FN_ATTRS
 _xsave(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsave(__p, __m);
+  __builtin_ia32_xsave(__p, __m);
 }

 static __inline__ void __DEFAULT_FN_ATTRS
 _xrstor(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xrstor(__p, __m);
+  __builtin_ia32_xrstor(__p, __m);
 }

 #ifdef __x86_64__
 static __inline__ void __DEFAULT_FN_ATTRS
 _xsave64(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsave64(__p, __m);
+  __builtin_ia32_xsave64(__p, __m);
 }

 static __inline__ void __DEFAULT_FN_ATTRS
 _xrstor64(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xrstor64(__p, __m);
+  __builtin_ia32_xrstor64(__p, __m);
 }
 #endif

--- a/c_headers/xsaveoptintrin.h
+++ b/c_headers/xsaveoptintrin.h
@ -1,4 +1,4 @@
-/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@ -33,13 +33,13 @@

 static __inline__ void __DEFAULT_FN_ATTRS
 _xsaveopt(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsaveopt(__p, __m);
+  __builtin_ia32_xsaveopt(__p, __m);
 }

 #ifdef __x86_64__
 static __inline__ void __DEFAULT_FN_ATTRS
 _xsaveopt64(void *__p, unsigned long long __m) {
-  return __builtin_ia32_xsaveopt64(__p, __m);
+  __builtin_ia32_xsaveopt64(__p, __m);
 }
 #endif

--- a/c_headers/xsavesintrin.h
+++ b/c_headers/xsavesintrin.h
@ -1,4 +1,4 @@
-/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
+/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
--- a/c_headers/xtestintrin.h
+++ b/c_headers/xtestintrin.h
@ -1,4 +1,4 @@
-/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
+/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal