update c_headers/* to LLVM 7.0.0rc1

This commit is contained in:
Andrew Kelley 2018-08-05 02:20:05 -04:00
parent ee68f28bba
commit 6cf248ec08
87 changed files with 19107 additions and 15970 deletions

View File

@ -54,7 +54,7 @@ struct dim3;
#define __DELETE
#endif
// Make sure nobody can create instances of the special varible types. nvcc
// Make sure nobody can create instances of the special variable types. nvcc
// also disallows taking address of special variables, so we disable address-of
// operator as well.
#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \

File diff suppressed because it is too large Load Diff

View File

@ -277,6 +277,9 @@ inline __device__ long long __ldg(const long long *ptr) {
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
return __nvvm_ldg_us(ptr);
}

View File

@ -0,0 +1,466 @@
/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
extern "C" {
__device__ int __nv_abs(int __a);
__device__ double __nv_acos(double __a);
__device__ float __nv_acosf(float __a);
__device__ double __nv_acosh(double __a);
__device__ float __nv_acoshf(float __a);
__device__ double __nv_asin(double __a);
__device__ float __nv_asinf(float __a);
__device__ double __nv_asinh(double __a);
__device__ float __nv_asinhf(float __a);
__device__ double __nv_atan2(double __a, double __b);
__device__ float __nv_atan2f(float __a, float __b);
__device__ double __nv_atan(double __a);
__device__ float __nv_atanf(float __a);
__device__ double __nv_atanh(double __a);
__device__ float __nv_atanhf(float __a);
__device__ int __nv_brev(int __a);
__device__ long long __nv_brevll(long long __a);
__device__ int __nv_byte_perm(int __a, int __b, int __c);
__device__ double __nv_cbrt(double __a);
__device__ float __nv_cbrtf(float __a);
__device__ double __nv_ceil(double __a);
__device__ float __nv_ceilf(float __a);
__device__ int __nv_clz(int __a);
__device__ int __nv_clzll(long long __a);
__device__ double __nv_copysign(double __a, double __b);
__device__ float __nv_copysignf(float __a, float __b);
__device__ double __nv_cos(double __a);
__device__ float __nv_cosf(float __a);
__device__ double __nv_cosh(double __a);
__device__ float __nv_coshf(float __a);
__device__ double __nv_cospi(double __a);
__device__ float __nv_cospif(float __a);
__device__ double __nv_cyl_bessel_i0(double __a);
__device__ float __nv_cyl_bessel_i0f(float __a);
__device__ double __nv_cyl_bessel_i1(double __a);
__device__ float __nv_cyl_bessel_i1f(float __a);
__device__ double __nv_dadd_rd(double __a, double __b);
__device__ double __nv_dadd_rn(double __a, double __b);
__device__ double __nv_dadd_ru(double __a, double __b);
__device__ double __nv_dadd_rz(double __a, double __b);
__device__ double __nv_ddiv_rd(double __a, double __b);
__device__ double __nv_ddiv_rn(double __a, double __b);
__device__ double __nv_ddiv_ru(double __a, double __b);
__device__ double __nv_ddiv_rz(double __a, double __b);
__device__ double __nv_dmul_rd(double __a, double __b);
__device__ double __nv_dmul_rn(double __a, double __b);
__device__ double __nv_dmul_ru(double __a, double __b);
__device__ double __nv_dmul_rz(double __a, double __b);
__device__ float __nv_double2float_rd(double __a);
__device__ float __nv_double2float_rn(double __a);
__device__ float __nv_double2float_ru(double __a);
__device__ float __nv_double2float_rz(double __a);
__device__ int __nv_double2hiint(double __a);
__device__ int __nv_double2int_rd(double __a);
__device__ int __nv_double2int_rn(double __a);
__device__ int __nv_double2int_ru(double __a);
__device__ int __nv_double2int_rz(double __a);
__device__ long long __nv_double2ll_rd(double __a);
__device__ long long __nv_double2ll_rn(double __a);
__device__ long long __nv_double2ll_ru(double __a);
__device__ long long __nv_double2ll_rz(double __a);
__device__ int __nv_double2loint(double __a);
__device__ unsigned int __nv_double2uint_rd(double __a);
__device__ unsigned int __nv_double2uint_rn(double __a);
__device__ unsigned int __nv_double2uint_ru(double __a);
__device__ unsigned int __nv_double2uint_rz(double __a);
__device__ unsigned long long __nv_double2ull_rd(double __a);
__device__ unsigned long long __nv_double2ull_rn(double __a);
__device__ unsigned long long __nv_double2ull_ru(double __a);
__device__ unsigned long long __nv_double2ull_rz(double __a);
__device__ unsigned long long __nv_double_as_longlong(double __a);
__device__ double __nv_drcp_rd(double __a);
__device__ double __nv_drcp_rn(double __a);
__device__ double __nv_drcp_ru(double __a);
__device__ double __nv_drcp_rz(double __a);
__device__ double __nv_dsqrt_rd(double __a);
__device__ double __nv_dsqrt_rn(double __a);
__device__ double __nv_dsqrt_ru(double __a);
__device__ double __nv_dsqrt_rz(double __a);
__device__ double __nv_dsub_rd(double __a, double __b);
__device__ double __nv_dsub_rn(double __a, double __b);
__device__ double __nv_dsub_ru(double __a, double __b);
__device__ double __nv_dsub_rz(double __a, double __b);
__device__ double __nv_erfc(double __a);
__device__ float __nv_erfcf(float __a);
__device__ double __nv_erfcinv(double __a);
__device__ float __nv_erfcinvf(float __a);
__device__ double __nv_erfcx(double __a);
__device__ float __nv_erfcxf(float __a);
__device__ double __nv_erf(double __a);
__device__ float __nv_erff(float __a);
__device__ double __nv_erfinv(double __a);
__device__ float __nv_erfinvf(float __a);
__device__ double __nv_exp10(double __a);
__device__ float __nv_exp10f(float __a);
__device__ double __nv_exp2(double __a);
__device__ float __nv_exp2f(float __a);
__device__ double __nv_exp(double __a);
__device__ float __nv_expf(float __a);
__device__ double __nv_expm1(double __a);
__device__ float __nv_expm1f(float __a);
__device__ double __nv_fabs(double __a);
__device__ float __nv_fabsf(float __a);
__device__ float __nv_fadd_rd(float __a, float __b);
__device__ float __nv_fadd_rn(float __a, float __b);
__device__ float __nv_fadd_ru(float __a, float __b);
__device__ float __nv_fadd_rz(float __a, float __b);
__device__ float __nv_fast_cosf(float __a);
__device__ float __nv_fast_exp10f(float __a);
__device__ float __nv_fast_expf(float __a);
__device__ float __nv_fast_fdividef(float __a, float __b);
__device__ float __nv_fast_log10f(float __a);
__device__ float __nv_fast_log2f(float __a);
__device__ float __nv_fast_logf(float __a);
__device__ float __nv_fast_powf(float __a, float __b);
__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr);
__device__ float __nv_fast_sinf(float __a);
__device__ float __nv_fast_tanf(float __a);
__device__ double __nv_fdim(double __a, double __b);
__device__ float __nv_fdimf(float __a, float __b);
__device__ float __nv_fdiv_rd(float __a, float __b);
__device__ float __nv_fdiv_rn(float __a, float __b);
__device__ float __nv_fdiv_ru(float __a, float __b);
__device__ float __nv_fdiv_rz(float __a, float __b);
__device__ int __nv_ffs(int __a);
__device__ int __nv_ffsll(long long __a);
__device__ int __nv_finitef(float __a);
__device__ unsigned short __nv_float2half_rn(float __a);
__device__ int __nv_float2int_rd(float __a);
__device__ int __nv_float2int_rn(float __a);
__device__ int __nv_float2int_ru(float __a);
__device__ int __nv_float2int_rz(float __a);
__device__ long long __nv_float2ll_rd(float __a);
__device__ long long __nv_float2ll_rn(float __a);
__device__ long long __nv_float2ll_ru(float __a);
__device__ long long __nv_float2ll_rz(float __a);
__device__ unsigned int __nv_float2uint_rd(float __a);
__device__ unsigned int __nv_float2uint_rn(float __a);
__device__ unsigned int __nv_float2uint_ru(float __a);
__device__ unsigned int __nv_float2uint_rz(float __a);
__device__ unsigned long long __nv_float2ull_rd(float __a);
__device__ unsigned long long __nv_float2ull_rn(float __a);
__device__ unsigned long long __nv_float2ull_ru(float __a);
__device__ unsigned long long __nv_float2ull_rz(float __a);
__device__ int __nv_float_as_int(float __a);
__device__ unsigned int __nv_float_as_uint(float __a);
__device__ double __nv_floor(double __a);
__device__ float __nv_floorf(float __a);
__device__ double __nv_fma(double __a, double __b, double __c);
__device__ float __nv_fmaf(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__device__ float __nv_fmaf_rd(float __a, float __b, float __c);
__device__ float __nv_fmaf_rn(float __a, float __b, float __c);
__device__ float __nv_fmaf_ru(float __a, float __b, float __c);
__device__ float __nv_fmaf_rz(float __a, float __b, float __c);
__device__ double __nv_fma_rd(double __a, double __b, double __c);
__device__ double __nv_fma_rn(double __a, double __b, double __c);
__device__ double __nv_fma_ru(double __a, double __b, double __c);
__device__ double __nv_fma_rz(double __a, double __b, double __c);
__device__ double __nv_fmax(double __a, double __b);
__device__ float __nv_fmaxf(float __a, float __b);
__device__ double __nv_fmin(double __a, double __b);
__device__ float __nv_fminf(float __a, float __b);
__device__ double __nv_fmod(double __a, double __b);
__device__ float __nv_fmodf(float __a, float __b);
__device__ float __nv_fmul_rd(float __a, float __b);
__device__ float __nv_fmul_rn(float __a, float __b);
__device__ float __nv_fmul_ru(float __a, float __b);
__device__ float __nv_fmul_rz(float __a, float __b);
__device__ float __nv_frcp_rd(float __a);
__device__ float __nv_frcp_rn(float __a);
__device__ float __nv_frcp_ru(float __a);
__device__ float __nv_frcp_rz(float __a);
__device__ double __nv_frexp(double __a, int *__b);
__device__ float __nv_frexpf(float __a, int *__b);
__device__ float __nv_frsqrt_rn(float __a);
__device__ float __nv_fsqrt_rd(float __a);
__device__ float __nv_fsqrt_rn(float __a);
__device__ float __nv_fsqrt_ru(float __a);
__device__ float __nv_fsqrt_rz(float __a);
__device__ float __nv_fsub_rd(float __a, float __b);
__device__ float __nv_fsub_rn(float __a, float __b);
__device__ float __nv_fsub_ru(float __a, float __b);
__device__ float __nv_fsub_rz(float __a, float __b);
__device__ int __nv_hadd(int __a, int __b);
__device__ float __nv_half2float(unsigned short __h);
__device__ double __nv_hiloint2double(int __a, int __b);
__device__ double __nv_hypot(double __a, double __b);
__device__ float __nv_hypotf(float __a, float __b);
__device__ int __nv_ilogb(double __a);
__device__ int __nv_ilogbf(float __a);
__device__ double __nv_int2double_rn(int __a);
__device__ float __nv_int2float_rd(int __a);
__device__ float __nv_int2float_rn(int __a);
__device__ float __nv_int2float_ru(int __a);
__device__ float __nv_int2float_rz(int __a);
__device__ float __nv_int_as_float(int __a);
__device__ int __nv_isfinited(double __a);
__device__ int __nv_isinfd(double __a);
__device__ int __nv_isinff(float __a);
__device__ int __nv_isnand(double __a);
__device__ int __nv_isnanf(float __a);
__device__ double __nv_j0(double __a);
__device__ float __nv_j0f(float __a);
__device__ double __nv_j1(double __a);
__device__ float __nv_j1f(float __a);
__device__ float __nv_jnf(int __a, float __b);
__device__ double __nv_jn(int __a, double __b);
__device__ double __nv_ldexp(double __a, int __b);
__device__ float __nv_ldexpf(float __a, int __b);
__device__ double __nv_lgamma(double __a);
__device__ float __nv_lgammaf(float __a);
__device__ double __nv_ll2double_rd(long long __a);
__device__ double __nv_ll2double_rn(long long __a);
__device__ double __nv_ll2double_ru(long long __a);
__device__ double __nv_ll2double_rz(long long __a);
__device__ float __nv_ll2float_rd(long long __a);
__device__ float __nv_ll2float_rn(long long __a);
__device__ float __nv_ll2float_ru(long long __a);
__device__ float __nv_ll2float_rz(long long __a);
__device__ long long __nv_llabs(long long __a);
__device__ long long __nv_llmax(long long __a, long long __b);
__device__ long long __nv_llmin(long long __a, long long __b);
__device__ long long __nv_llrint(double __a);
__device__ long long __nv_llrintf(float __a);
__device__ long long __nv_llround(double __a);
__device__ long long __nv_llroundf(float __a);
__device__ double __nv_log10(double __a);
__device__ float __nv_log10f(float __a);
__device__ double __nv_log1p(double __a);
__device__ float __nv_log1pf(float __a);
__device__ double __nv_log2(double __a);
__device__ float __nv_log2f(float __a);
__device__ double __nv_logb(double __a);
__device__ float __nv_logbf(float __a);
__device__ double __nv_log(double __a);
__device__ float __nv_logf(float __a);
__device__ double __nv_longlong_as_double(long long __a);
__device__ int __nv_max(int __a, int __b);
__device__ int __nv_min(int __a, int __b);
__device__ double __nv_modf(double __a, double *__b);
__device__ float __nv_modff(float __a, float *__b);
__device__ int __nv_mul24(int __a, int __b);
__device__ long long __nv_mul64hi(long long __a, long long __b);
__device__ int __nv_mulhi(int __a, int __b);
__device__ double __nv_nan(const signed char *__a);
__device__ float __nv_nanf(const signed char *__a);
__device__ double __nv_nearbyint(double __a);
__device__ float __nv_nearbyintf(float __a);
__device__ double __nv_nextafter(double __a, double __b);
__device__ float __nv_nextafterf(float __a, float __b);
__device__ double __nv_norm3d(double __a, double __b, double __c);
__device__ float __nv_norm3df(float __a, float __b, float __c);
__device__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__device__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__device__ double __nv_normcdf(double __a);
__device__ float __nv_normcdff(float __a);
__device__ double __nv_normcdfinv(double __a);
__device__ float __nv_normcdfinvf(float __a);
__device__ float __nv_normf(int __a, const float *__b);
__device__ double __nv_norm(int __a, const double *__b);
__device__ int __nv_popc(int __a);
__device__ int __nv_popcll(long long __a);
__device__ double __nv_pow(double __a, double __b);
__device__ float __nv_powf(float __a, float __b);
__device__ double __nv_powi(double __a, int __b);
__device__ float __nv_powif(float __a, int __b);
__device__ double __nv_rcbrt(double __a);
__device__ float __nv_rcbrtf(float __a);
__device__ double __nv_rcp64h(double __a);
__device__ double __nv_remainder(double __a, double __b);
__device__ float __nv_remainderf(float __a, float __b);
__device__ double __nv_remquo(double __a, double __b, int *__c);
__device__ float __nv_remquof(float __a, float __b, int *__c);
__device__ int __nv_rhadd(int __a, int __b);
__device__ double __nv_rhypot(double __a, double __b);
__device__ float __nv_rhypotf(float __a, float __b);
__device__ double __nv_rint(double __a);
__device__ float __nv_rintf(float __a);
__device__ double __nv_rnorm3d(double __a, double __b, double __c);
__device__ float __nv_rnorm3df(float __a, float __b, float __c);
__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__device__ float __nv_rnormf(int __a, const float *__b);
__device__ double __nv_rnorm(int __a, const double *__b);
__device__ double __nv_round(double __a);
__device__ float __nv_roundf(float __a);
__device__ double __nv_rsqrt(double __a);
__device__ float __nv_rsqrtf(float __a);
__device__ int __nv_sad(int __a, int __b, int __c);
__device__ float __nv_saturatef(float __a);
__device__ double __nv_scalbn(double __a, int __b);
__device__ float __nv_scalbnf(float __a, int __b);
__device__ int __nv_signbitd(double __a);
__device__ int __nv_signbitf(float __a);
__device__ void __nv_sincos(double __a, double *__b, double *__c);
__device__ void __nv_sincosf(float __a, float *__b, float *__c);
__device__ void __nv_sincospi(double __a, double *__b, double *__c);
__device__ void __nv_sincospif(float __a, float *__b, float *__c);
__device__ double __nv_sin(double __a);
__device__ float __nv_sinf(float __a);
__device__ double __nv_sinh(double __a);
__device__ float __nv_sinhf(float __a);
__device__ double __nv_sinpi(double __a);
__device__ float __nv_sinpif(float __a);
__device__ double __nv_sqrt(double __a);
__device__ float __nv_sqrtf(float __a);
__device__ double __nv_tan(double __a);
__device__ float __nv_tanf(float __a);
__device__ double __nv_tanh(double __a);
__device__ float __nv_tanhf(float __a);
__device__ double __nv_tgamma(double __a);
__device__ float __nv_tgammaf(float __a);
__device__ double __nv_trunc(double __a);
__device__ float __nv_truncf(float __a);
__device__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__device__ double __nv_uint2double_rn(unsigned int __i);
__device__ float __nv_uint2float_rd(unsigned int __a);
__device__ float __nv_uint2float_rn(unsigned int __a);
__device__ float __nv_uint2float_ru(unsigned int __a);
__device__ float __nv_uint2float_rz(unsigned int __a);
__device__ float __nv_uint_as_float(unsigned int __a);
__device__ double __nv_ull2double_rd(unsigned long long __a);
__device__ double __nv_ull2double_rn(unsigned long long __a);
__device__ double __nv_ull2double_ru(unsigned long long __a);
__device__ double __nv_ull2double_rz(unsigned long long __a);
__device__ float __nv_ull2float_rd(unsigned long long __a);
__device__ float __nv_ull2float_rn(unsigned long long __a);
__device__ float __nv_ull2float_ru(unsigned long long __a);
__device__ float __nv_ull2float_rz(unsigned long long __a);
__device__ unsigned long long __nv_ullmax(unsigned long long __a,
unsigned long long __b);
__device__ unsigned long long __nv_ullmin(unsigned long long __a,
unsigned long long __b);
__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__device__ unsigned long long __nv_umul64hi(unsigned long long __a,
unsigned long long __b);
__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
unsigned int __c);
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
__device__ int __nv_vabs2(int __a);
__device__ int __nv_vabs4(int __a);
__device__ int __nv_vabsdiffs2(int __a, int __b);
__device__ int __nv_vabsdiffs4(int __a, int __b);
__device__ int __nv_vabsdiffu2(int __a, int __b);
__device__ int __nv_vabsdiffu4(int __a, int __b);
__device__ int __nv_vabsss2(int __a);
__device__ int __nv_vabsss4(int __a);
__device__ int __nv_vadd2(int __a, int __b);
__device__ int __nv_vadd4(int __a, int __b);
__device__ int __nv_vaddss2(int __a, int __b);
__device__ int __nv_vaddss4(int __a, int __b);
__device__ int __nv_vaddus2(int __a, int __b);
__device__ int __nv_vaddus4(int __a, int __b);
__device__ int __nv_vavgs2(int __a, int __b);
__device__ int __nv_vavgs4(int __a, int __b);
__device__ int __nv_vavgu2(int __a, int __b);
__device__ int __nv_vavgu4(int __a, int __b);
__device__ int __nv_vcmpeq2(int __a, int __b);
__device__ int __nv_vcmpeq4(int __a, int __b);
__device__ int __nv_vcmpges2(int __a, int __b);
__device__ int __nv_vcmpges4(int __a, int __b);
__device__ int __nv_vcmpgeu2(int __a, int __b);
__device__ int __nv_vcmpgeu4(int __a, int __b);
__device__ int __nv_vcmpgts2(int __a, int __b);
__device__ int __nv_vcmpgts4(int __a, int __b);
__device__ int __nv_vcmpgtu2(int __a, int __b);
__device__ int __nv_vcmpgtu4(int __a, int __b);
__device__ int __nv_vcmples2(int __a, int __b);
__device__ int __nv_vcmples4(int __a, int __b);
__device__ int __nv_vcmpleu2(int __a, int __b);
__device__ int __nv_vcmpleu4(int __a, int __b);
__device__ int __nv_vcmplts2(int __a, int __b);
__device__ int __nv_vcmplts4(int __a, int __b);
__device__ int __nv_vcmpltu2(int __a, int __b);
__device__ int __nv_vcmpltu4(int __a, int __b);
__device__ int __nv_vcmpne2(int __a, int __b);
__device__ int __nv_vcmpne4(int __a, int __b);
__device__ int __nv_vhaddu2(int __a, int __b);
__device__ int __nv_vhaddu4(int __a, int __b);
__device__ int __nv_vmaxs2(int __a, int __b);
__device__ int __nv_vmaxs4(int __a, int __b);
__device__ int __nv_vmaxu2(int __a, int __b);
__device__ int __nv_vmaxu4(int __a, int __b);
__device__ int __nv_vmins2(int __a, int __b);
__device__ int __nv_vmins4(int __a, int __b);
__device__ int __nv_vminu2(int __a, int __b);
__device__ int __nv_vminu4(int __a, int __b);
__device__ int __nv_vneg2(int __a);
__device__ int __nv_vneg4(int __a);
__device__ int __nv_vnegss2(int __a);
__device__ int __nv_vnegss4(int __a);
__device__ int __nv_vsads2(int __a, int __b);
__device__ int __nv_vsads4(int __a, int __b);
__device__ int __nv_vsadu2(int __a, int __b);
__device__ int __nv_vsadu4(int __a, int __b);
__device__ int __nv_vseteq2(int __a, int __b);
__device__ int __nv_vseteq4(int __a, int __b);
__device__ int __nv_vsetges2(int __a, int __b);
__device__ int __nv_vsetges4(int __a, int __b);
__device__ int __nv_vsetgeu2(int __a, int __b);
__device__ int __nv_vsetgeu4(int __a, int __b);
__device__ int __nv_vsetgts2(int __a, int __b);
__device__ int __nv_vsetgts4(int __a, int __b);
__device__ int __nv_vsetgtu2(int __a, int __b);
__device__ int __nv_vsetgtu4(int __a, int __b);
__device__ int __nv_vsetles2(int __a, int __b);
__device__ int __nv_vsetles4(int __a, int __b);
__device__ int __nv_vsetleu2(int __a, int __b);
__device__ int __nv_vsetleu4(int __a, int __b);
__device__ int __nv_vsetlts2(int __a, int __b);
__device__ int __nv_vsetlts4(int __a, int __b);
__device__ int __nv_vsetltu2(int __a, int __b);
__device__ int __nv_vsetltu4(int __a, int __b);
__device__ int __nv_vsetne2(int __a, int __b);
__device__ int __nv_vsetne4(int __a, int __b);
__device__ int __nv_vsub2(int __a, int __b);
__device__ int __nv_vsub4(int __a, int __b);
__device__ int __nv_vsubss2(int __a, int __b);
__device__ int __nv_vsubss4(int __a, int __b);
__device__ int __nv_vsubus2(int __a, int __b);
__device__ int __nv_vsubus4(int __a, int __b);
#endif // CUDA_VERSION
__device__ double __nv_y0(double __a);
__device__ float __nv_y0f(float __a);
__device__ double __nv_y1(double __a);
__device__ float __nv_y1f(float __a);
__device__ float __nv_ynf(int __a, float __b);
__device__ double __nv_yn(int __a, double __b);
} // extern "C"
#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__

View File

@ -62,7 +62,7 @@
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9020
#error "Unsupported CUDA version!"
#endif
@ -84,6 +84,9 @@
#define __DEVICE_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__
#define __COMMON_FUNCTIONS_H__
// device_functions_decls is replaced by __clang_cuda_device_functions.h
// included below.
#define __DEVICE_FUNCTIONS_DECLS_H__
#undef __CUDACC__
#if CUDA_VERSION < 9000
@ -97,11 +100,17 @@
#include "host_config.h"
#include "host_defines.h"
// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
// functional equivalent of what we need.
#pragma push_macro("nv_weak")
#define nv_weak weak
#undef __CUDABE__
#undef __CUDA_LIBDEVICE__
#define __CUDACC__
#include "cuda_runtime.h"
#pragma pop_macro("nv_weak")
#undef __CUDACC__
#define __CUDABE__
@ -137,20 +146,22 @@ inline __host__ double __signbitd(double x) {
}
#endif
// We need decls for functions in CUDA's libdevice with __device__
// attribute only. Alas they come either as __host__ __device__ or
// with no attributes at all. To work around that, define __CUDA_RTC__
// which produces HD variant and undef __host__ which gives us desided
// decls with __device__ attribute.
#pragma push_macro("__host__")
#define __host__
#define __CUDACC_RTC__
#include "device_functions_decls.h"
#undef __CUDACC_RTC__
// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
// to provide our own.
#include <__clang_cuda_libdevice_declares.h>
// Temporarily poison __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
// Wrappers for many device-side standard library functions became compiler
// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now
// provides its own implementation of the wrappers.
#if CUDA_VERSION >= 9000
#include <__clang_cuda_device_functions.h>
#endif
// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
// counterpart does not do it, so we need to make it empty here to keep
// following CUDA includes happy.
#undef __THROW
#define __THROW
// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
// Previous versions used to check whether they are defined or not.
@ -167,24 +178,20 @@ inline __host__ double __signbitd(double x) {
#endif
#endif
// Temporarily poison __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#pragma push_macro("__host__")
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#pragma push_macro("__float2half_rn")
#if CUDA_VERSION >= 9000
// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
// device_functions.hpp out of the way.
#define __float2half_rn __float2half_rn_disabled
#endif
#if CUDA_VERSION < 9000
#include "device_functions.hpp"
#pragma pop_macro("__float2half_rn")
#endif
// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
@ -196,17 +203,32 @@ inline __host__ double __signbitd(double x) {
#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
#define __USE_FAST_MATH__ 1
#endif
#if CUDA_VERSION >= 9000
// CUDA-9.2 needs host-side memcpy for some host functions in
// device_functions.hpp
#if CUDA_VERSION >= 9020
#include <string.h>
#endif
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("__USE_FAST_MATH__")
#if CUDA_VERSION < 9000
#include "math_functions_dbl_ptx3.hpp"
#endif
#pragma pop_macro("__forceinline__")
// Pull in host-only functions that are only available when neither
// __CUDACC__ nor __CUDABE__ are defined.
#undef __MATH_FUNCTIONS_HPP__
#undef __CUDABE__
#if CUDA_VERSION < 9000
#include "math_functions.hpp"
#endif
// Alas, additional overloads for these functions are hard to get to.
// Considering that we only need these overloads for a few functions,
// we can provide them here.
@ -222,22 +244,36 @@ static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
static inline float normcdf(float __a) { return normcdff(__a); }
static inline float erfcx(float __a) { return erfcxf(__a); }
#if CUDA_VERSION < 9000
// For some reason single-argument variant is not always declared by
// CUDA headers. Alas, device_functions.hpp included below needs it.
static inline __device__ void __brkpt(int __c) { __brkpt(); }
#endif
// Now include *.hpp with definitions of various GPU functions. Alas,
// a lot of thins get declared/defined with __host__ attribute which
// we don't want and we have to define it out. We also have to include
// {device,math}_functions.hpp again in order to extract the other
// branch of #if/else inside.
#define __host__
#undef __CUDABE__
#define __CUDACC__
#if CUDA_VERSION >= 9000
// Some atomic functions became compiler builtins in CUDA-9 , so we need their
// declarations.
#include "device_atomic_functions.h"
#endif
#undef __DEVICE_FUNCTIONS_HPP__
#include "device_atomic_functions.hpp"
#if CUDA_VERSION >= 9000
#include "crt/device_functions.hpp"
#include "crt/device_double_functions.hpp"
#else
#include "device_functions.hpp"
#define __CUDABE__
#include "device_double_functions.h"
#undef __CUDABE__
#endif
#include "sm_20_atomic_functions.hpp"
#include "sm_20_intrinsics.hpp"
#include "sm_32_atomic_functions.hpp"
@ -251,8 +287,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
// reason about our code.
#if CUDA_VERSION >= 8000
#pragma push_macro("__CUDA_ARCH__")
#undef __CUDA_ARCH__
#include "sm_60_atomic_functions.hpp"
#include "sm_61_intrinsics.hpp"
#pragma pop_macro("__CUDA_ARCH__")
#endif
#undef __MATH_FUNCTIONS_HPP__
@ -279,7 +318,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#endif
#endif
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")

View File

@ -20,15 +20,18 @@
*
*===-----------------------------------------------------------------------===
*/
#ifndef _WMMINTRIN_AES_H
#define _WMMINTRIN_AES_H
#include <emmintrin.h>
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_AES_H
#define __WMMINTRIN_AES_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
/// \brief Performs a single round of AES encryption using the Equivalent
/// Performs a single round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
@ -48,7 +51,7 @@ _mm_aesenc_si128(__m128i __V, __m128i __R)
return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs the final round of AES encryption using the Equivalent
/// Performs the final round of AES encryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
@ -68,7 +71,7 @@ _mm_aesenclast_si128(__m128i __V, __m128i __R)
return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs a single round of AES decryption using the Equivalent
/// Performs a single round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
@ -88,7 +91,7 @@ _mm_aesdec_si128(__m128i __V, __m128i __R)
return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
}
/// \brief Performs the final round of AES decryption using the Equivalent
/// Performs the final round of AES decryption using the Equivalent
/// Inverse Cipher, transforming the state value from the first source
/// operand using a 128-bit round key value contained in the second source
/// operand, and writes the result to the destination.
@ -108,7 +111,7 @@ _mm_aesdeclast_si128(__m128i __V, __m128i __R)
return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
}
/// \brief Applies the AES InvMixColumns() transformation to an expanded key
/// Applies the AES InvMixColumns() transformation to an expanded key
/// contained in the source operand, and writes the result to the
/// destination.
///
@ -125,7 +128,7 @@ _mm_aesimc_si128(__m128i __V)
return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
}
/// \brief Generates a round key for AES encyption, operating on 128-bit data
/// Generates a round key for AES encryption, operating on 128-bit data
/// specified in the first source operand and using an 8-bit round constant
/// specified by the second source operand, and writes the result to the
/// destination.
@ -148,4 +151,4 @@ _mm_aesimc_si128(__m128i __V)
#undef __DEFAULT_FN_ATTRS
#endif /* _WMMINTRIN_AES_H */
#endif /* __WMMINTRIN_AES_H */

View File

@ -20,10 +20,15 @@
*
*===-----------------------------------------------------------------------===
*/
#ifndef _WMMINTRIN_PCLMUL_H
#define _WMMINTRIN_PCLMUL_H
/// \brief Multiplies two 64-bit integer values, which are selected from source
#ifndef __WMMINTRIN_H
#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
#endif
#ifndef __WMMINTRIN_PCLMUL_H
#define __WMMINTRIN_PCLMUL_H
/// Multiplies two 64-bit integer values, which are selected from source
/// operands using the immediate-value operand. The multiplication is a
/// carry-less multiplication, and the 128-bit integer product is stored in
/// the destination.
@ -50,8 +55,8 @@
/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
/// \returns The 128-bit integer vector containing the result of the carry-less
/// multiplication of the selected 64-bit values.
#define _mm_clmulepi64_si128(__X, __Y, __I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \
(__v2di)(__m128i)(__Y), (char)(__I)))
#define _mm_clmulepi64_si128(X, Y, I) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (char)(I)))
#endif /* _WMMINTRIN_PCLMUL_H */
#endif /* __WMMINTRIN_PCLMUL_H */

View File

@ -27,9 +27,9 @@
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index \a idx and of the length \a len.
///
/// \headerfile <x86intrin.h>
@ -57,7 +57,7 @@
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
(char)(len), (char)(idx)))
/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit
/// Extracts the specified bits from the lower 64 bits of the 128-bit
/// integer vector operand at the index and of the length specified by
/// \a __y.
///
@ -82,7 +82,7 @@ _mm_extract_si64(__m128i __x, __m128i __y)
return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
}
/// \brief Inserts bits of a specified length from the source integer vector
/// Inserts bits of a specified length from the source integer vector
/// \a y into the lower 64 bits of the destination integer vector \a x at
/// the index \a idx and of the length \a len.
///
@ -120,7 +120,7 @@ _mm_extract_si64(__m128i __x, __m128i __y)
(__v2di)(__m128i)(y), \
(char)(len), (char)(idx)))
/// \brief Inserts bits of a specified length from the source integer vector
/// Inserts bits of a specified length from the source integer vector
/// \a __y into the lower 64 bits of the destination integer vector \a __x
/// at the index and of the length specified by \a __y.
///
@ -152,7 +152,7 @@ _mm_insert_si64(__m128i __x, __m128i __y)
return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
}
/// \brief Stores a 64-bit double-precision value in a 64-bit memory location.
/// Stores a 64-bit double-precision value in a 64-bit memory location.
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
@ -170,7 +170,7 @@ _mm_stream_sd(double *__p, __m128d __a)
__builtin_ia32_movntsd(__p, (__v2df)__a);
}
/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit
/// Stores a 32-bit single-precision floating-point value in a 32-bit
/// memory location. To minimize caching, the data is flagged as
/// non-temporal (unlikely to be used again soon).
///

1499
c_headers/arm_fp16.h Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -29,7 +29,7 @@
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_popcnt_epi16(__m512i __A)
@ -48,7 +48,7 @@ _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(),
return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
__U,
__B);
}
@ -70,7 +70,7 @@ _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
{
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(),
return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
__U,
__B);
}

File diff suppressed because it is too large Load Diff

View File

@ -29,7 +29,7 @@
#define __AVX512CDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_conflict_epi64 (__m512i __A)
@ -82,49 +82,45 @@ _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi32 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) -1);
return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) __W,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
(__v16si) _mm512_setzero_si512 (),
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
(__v16si)_mm512_lzcnt_epi32(__A),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_lzcnt_epi64 (__m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) -1);
return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) __W,
(__mmask8) __U);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
{
return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
(__v8di) _mm512_setzero_si512 (),
(__mmask8) __U);
return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
(__v8di)_mm512_lzcnt_epi64(__A),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS

File diff suppressed because it is too large Load Diff

View File

@ -27,21 +27,21 @@
#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H
// exp2a23
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); })
(int)(R))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -52,20 +52,20 @@
#define _mm512_maskz_exp2a23_pd(M, A) \
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
#define _mm512_exp2a23_round_ps(A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); })
(__mmask16)-1, (int)(R))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); })
(int)(R))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)); })
(__mmask16)(M), (int)(R))
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -76,21 +76,21 @@
#define _mm512_maskz_exp2a23_ps(M, A) \
_mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
// rsqrt28
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); })
(int)(R))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -101,20 +101,20 @@
#define _mm512_maskz_rsqrt28_pd(M, A) \
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
#define _mm512_rsqrt28_round_ps(A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); })
(__mmask16)-1, (int)(R))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); })
(int)(R))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)); })
(__mmask16)(M), (int)(R))
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -125,23 +125,23 @@
#define _mm512_maskz_rsqrt28_ps(M, A) \
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
#define _mm_rsqrt28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -152,23 +152,23 @@
#define _mm_maskz_rsqrt28_ss(M, A, B) \
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
#define _mm_rsqrt28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -179,21 +179,21 @@
#define _mm_maskz_rsqrt28_sd(M, A, B) \
_mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
// rcp28
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R)); })
(int)(R))
#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@ -204,20 +204,20 @@
#define _mm512_maskz_rcp28_pd(M, A) \
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
#define _mm512_rcp28_round_ps(A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R)); })
(__mmask16)-1, (int)(R))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R)); })
(int)(R))
#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R)); })
(__mmask16)(M), (int)(R))
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@ -228,23 +228,23 @@
#define _mm512_maskz_rcp28_ps(M, A) \
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
#define _mm_rcp28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -255,23 +255,23 @@
#define _mm_maskz_rcp28_ss(M, A, B) \
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
#define _mm_rcp28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R)); })
(__mmask8)-1, (int)(R))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R)); })
(__mmask8)(M), (int)(R))
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@ -282,4 +282,4 @@
#define _mm_maskz_rcp28_sd(M, A, B) \
_mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#endif // __AVX512ERINTRIN_H
#endif /* __AVX512ERINTRIN_H */

File diff suppressed because it is too large Load Diff

View File

@ -29,62 +29,52 @@
#define __IFMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) -1);
return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
__m512i __Y)
_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
(__v8di) __X,
(__v8di) __Y,
(__mmask8) __M);
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) __M);
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) -1);
return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
(__v8di) __Z);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
__m512i __Y)
_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
(__v8di) __X,
(__v8di) __Y,
(__mmask8) __M);
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
(__v8di)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
{
return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
(__v8di) __Y,
(__v8di) __Z,
(__mmask8) __M);
return (__m512i)__builtin_ia32_selectq_512(__M,
(__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
(__v8di)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS

View File

@ -29,121 +29,105 @@
#define __IFMAVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) -1);
return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
(__v2di) __Z);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
(__v2di) __X,
(__v2di) __Y,
(__mmask8) __M);
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) __M);
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) -1);
return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
(__v4di) __X,
(__v4di) __Y,
(__mmask8) __M);
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) __M);
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) -1);
return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
(__v2di)__Z);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
(__v2di) __X,
(__v2di) __Y,
(__mmask8) __M);
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
{
return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
(__v2di) __Y,
(__v2di) __Z,
(__mmask8) __M);
return (__m128i)__builtin_ia32_selectq_128(__M,
(__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) -1);
return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
(__v4di)__Z);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
(__v4di) __X,
(__v4di) __Y,
(__mmask8) __M);
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
{
return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
(__v4di) __Y,
(__v4di) __Z,
(__mmask8) __M);
return (__m256i)__builtin_ia32_selectq_256(__M,
(__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -1,4 +1,4 @@
/*===------------- avx512pfintrin.h - PF intrinsics ------------------===
/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@ -31,80 +31,80 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\
#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\
(int)(hint))
#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
(int)(hint))
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\
#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (int const *)(addr), \
(int)(scale), (int)(hint)); })
(int)(scale), (int)(hint))
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\
#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfdps((__mmask16) -1, \
(__v16si)(__m512i)(index), (int const *)(addr), \
(int)(scale), (int)(hint)); })
(int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\
#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
(int)(hint))
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\
#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
__builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
(long long const *)(addr), (int)(scale), \
(int)(hint)); })
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\
(int)(hint))
#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(int const *)(addr), (int)(scale), (int)(hint)); })
(int const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\
#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
__builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
(int const *)(addr), (int)(scale), (int)(hint)); })
(int const *)(addr), (int)(scale), (int)(hint))
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\
#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
(int)(hint))
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
(int)(hint))
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\
#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); })
(int *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfdps((__mmask16)(mask), \
(__v16si)(__m512i)(index), (int *)(addr), \
(int)(scale), (int)(hint)); })
(int)(scale), (int)(hint))
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\
#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
(int)(hint))
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\
#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
(long long *)(addr), (int)(scale), \
(int)(hint)); })
(int)(hint))
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\
#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); })
(int *)(addr), (int)(scale), (int)(hint))
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\
#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
__builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
(int *)(addr), (int)(scale), (int)(hint)); })
(int *)(addr), (int)(scale), (int)(hint))
#undef __DEFAULT_FN_ATTRS

View File

@ -29,7 +29,7 @@
#define __AVX512VBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
@ -44,7 +44,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_hi(),
(__v32hi) _mm512_setzero_si512(),
__U);
}
@ -60,7 +60,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_qi(),
(__v64qi) _mm512_setzero_si512(),
__U);
}
@ -90,7 +90,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
(__v32hi) _mm512_setzero_hi(),
(__v32hi) _mm512_setzero_si512(),
__U);
}
@ -106,7 +106,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
{
return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
(__v64qi) _mm512_setzero_qi(),
(__v64qi) _mm512_setzero_si512(),
__U);
}
@ -122,7 +122,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
(__v32hi) _mm512_setzero_hi(),
(__v32hi) _mm512_setzero_si512(),
__U);
}
@ -138,87 +138,93 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
{
return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
(__v64qi) _mm512_setzero_qi(),
(__v64qi) _mm512_setzero_si512(),
__U);
}
#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \
(__v8di)(B), \
(int)(I), \
(__v8di)(S), \
(__mmask8)(U)); })
#define _mm512_shldi_epi64(A, B, I) \
(__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I))
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
_mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_shldi_epi64(A, B, I) \
_mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \
(__v16si)(B), \
(int)(I), \
(__v16si)(S), \
(__mmask16)(U)); })
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
_mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512())
#define _mm512_shldi_epi32(A, B, I) \
_mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
(__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \
(__v32hi)(B), \
(int)(I), \
(__v32hi)(S), \
(__mmask32)(U)); })
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
_mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512())
#define _mm512_shldi_epi16(A, B, I) \
_mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
(__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \
(__v8di)(B), \
(int)(I), \
(__v8di)(S), \
(__mmask8)(U)); })
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512())
#define _mm512_shrdi_epi64(A, B, I) \
_mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I))
(__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \
(__v16si)(B), \
(int)(I), \
(__v16si)(S), \
(__mmask16)(U)); })
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512())
#define _mm512_shrdi_epi32(A, B, I) \
_mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I))
(__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \
(__v32hi)(B), \
(int)(I), \
(__v32hi)(S), \
(__mmask32)(U)); })
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512())
#define _mm512_shrdi_epi16(A, B, I) \
_mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I))
(__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512())
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B)

View File

@ -29,79 +29,65 @@
#define __VBMIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
__mmask64 __U, __m512i __B)
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
{
return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
(__v64qi) __I
/* idx */ ,
(__v64qi) __B,
(__mmask64) __U);
return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
__m512i __B)
{
return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
(__mmask64) -1);
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
__m512i __I, __m512i __B)
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
__m512i __B)
{
return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
(__mmask64) __U);
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)__I);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
__m512i __I, __m512i __B)
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
__m512i __B)
{
return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
/* idx */ ,
(__v64qi) __A,
(__v64qi) __B,
(__mmask64) __U);
return (__m512i)__builtin_ia32_selectb_512(__U,
(__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi) _mm512_undefined_epi32 (),
(__mmask64) -1);
return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi) _mm512_setzero_si512(),
(__mmask64) __M);
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B)
{
return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
(__v64qi) __A,
(__v64qi) __W,
(__mmask64) __M);
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
(__v64qi)_mm512_permutexvar_epi8(__A, __B),
(__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS

View File

@ -29,161 +29,127 @@
#define __VBMIVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
{
return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
(__v16qi) __I
/* idx */ ,
(__v16qi) __B,
(__mmask16)
__U);
return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
(__v16qi)__I,
(__v16qi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
__mmask32 __U, __m256i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B)
{
return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
(__v32qi) __I
/* idx */ ,
(__v32qi) __B,
(__mmask32)
__U);
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B)
{
return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
(__mmask16) -
1);
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)__I);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B)
{
return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
(__mmask16)
__U);
return (__m128i)__builtin_ia32_selectb_128(__U,
(__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
{
return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
/* idx */ ,
(__v16qi) __A,
(__v16qi) __B,
(__mmask16)
__U);
return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
(__v32qi)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
__m256i __B)
{
return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
(__mmask32) -
1);
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
__m256i __I, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
__m256i __B)
{
return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
(__mmask32)
__U);
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)__I);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
__m256i __I, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
__m256i __B)
{
return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
/* idx */ ,
(__v32qi) __A,
(__v32qi) __B,
(__mmask32)
__U);
return (__m256i)__builtin_ia32_selectb_256(__U,
(__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
(__v16qi) __A,
(__v16qi) _mm_undefined_si128 (),
(__mmask16) -1);
return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
(__v16qi) __A,
(__v16qi) _mm_setzero_si128 (),
(__mmask16) __M);
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
__m128i __B)
{
return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
(__v16qi) __A,
(__v16qi) __W,
(__mmask16) __M);
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
(__v16qi)_mm_permutexvar_epi8(__A, __B),
(__v16qi)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
(__v32qi) __A,
(__v32qi) _mm256_undefined_si256 (),
(__mmask32) -1);
return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
(__v32qi) __A,
(__v32qi) _mm256_setzero_si256 (),
(__mmask32) __M);
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
__m256i __B)
{
return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
(__v32qi) __A,
(__v32qi) __W,
(__mmask32) __M);
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
(__v32qi)_mm256_permutexvar_epi8(__A, __B),
(__v32qi)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@ -192,7 +158,7 @@ _mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i
(__mmask16) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@ -202,7 +168,7 @@ _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
(__mmask16) __M);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
@ -212,7 +178,7 @@ _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
(__mmask16) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@ -221,7 +187,7 @@ _mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m2
(__mmask32) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@ -231,7 +197,7 @@ _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
(__mmask32) __M);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
{
return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
@ -242,6 +208,7 @@ _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -1,4 +1,4 @@
/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------===
/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@ -29,15 +29,16 @@
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
@ -45,7 +46,7 @@ _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
(__v16hi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
@ -53,35 +54,35 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_popcnt_epi16(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi16(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
(__v8hi) _mm128_popcnt_epi16(__B),
(__v8hi) _mm_popcnt_epi16(__B),
(__v8hi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
{
return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A)
{
return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
@ -89,7 +90,7 @@ _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
(__v32qi) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
{
return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
@ -97,61 +98,62 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_popcnt_epi8(__m128i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi8(__m128i __A)
{
return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
(__v16qi) _mm128_popcnt_epi8(__B),
(__v16qi) _mm_popcnt_epi8(__B),
(__v16qi) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
{
return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
__U,
__B);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B)
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
{
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
(__v32qi) __B,
__U);
}
static __inline__ __mmask32 __DEFAULT_FN_ATTRS
_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B)
static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
{
return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1,
return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
__A,
__B);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B)
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
{
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
(__v16qi) __B,
__U);
}
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B)
static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
{
return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1,
return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
__A,
__B);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------===
/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -28,35 +28,36 @@
#define __AVX512VLCDINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmb_epi64 (__mmask8 __A)
{
{
return (__m128i) _mm_set1_epi64x((long long) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmb_epi64 (__mmask8 __A)
{
return (__m256i) _mm256_set1_epi64x((long long)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_broadcastmw_epi32 (__mmask16 __A)
{
return (__m128i) _mm_set1_epi32((int)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastmw_epi32 (__mmask16 __A)
{
return (__m256i) _mm256_set1_epi32((int)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
@ -64,7 +65,7 @@ _mm_conflict_epi64 (__m128i __A)
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
@ -72,16 +73,16 @@ _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
_mm_setzero_si128 (),
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@ -89,7 +90,7 @@ _mm256_conflict_epi64 (__m256i __A)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@ -97,7 +98,7 @@ _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
@ -105,7 +106,7 @@ _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_conflict_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@ -113,7 +114,7 @@ _mm_conflict_epi32 (__m128i __A)
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@ -121,7 +122,7 @@ _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
@ -129,7 +130,7 @@ _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_conflict_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@ -137,7 +138,7 @@ _mm256_conflict_epi32 (__m256i __A)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@ -145,7 +146,7 @@ _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
@ -154,110 +155,95 @@ _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
(__mmask8) __U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi32 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
(__v4si)
_mm_setzero_si128 (),
(__mmask8) -1);
return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
(__v4si) __W,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
(__v4si)
_mm_setzero_si128 (),
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
(__v4si)_mm_lzcnt_epi32(__A),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi32 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
(__mmask8) -1);
return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
(__v8si) __W,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
(__v8si)
_mm256_setzero_si256 (),
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
(__v8si)_mm256_lzcnt_epi32(__A),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_lzcnt_epi64 (__m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
(__mmask8) -1);
return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
(__v2di) __W,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
{
return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
(__v2di)
_mm_setzero_di (),
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
(__v2di)_mm_lzcnt_epi64(__A),
(__v2di)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_lzcnt_epi64 (__m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
(__v4di)
_mm256_setzero_si256 (),
(__mmask8) -1);
return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
(__v4di) __W,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
{
return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
(__v4di)
_mm256_setzero_si256 (),
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
(__v4di)_mm256_lzcnt_epi64(__A),
(__v4di)_mm256_setzero_si256());
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __AVX512VLCDINTRIN_H */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -29,130 +29,120 @@
#define __AVX512VLVBMI2INTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
static __inline __m128i __DEFAULT_FN_ATTRS
_mm128_setzero_hi(void) {
return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 };
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
(__v8hi) _mm128_setzero_hi(),
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
(__v16qi) _mm128_setzero_hi(),
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
{
__builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
{
__builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
(__v8hi) _mm128_setzero_hi(),
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
{
return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
(__v16qi) _mm128_setzero_hi(),
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
(__v8hi) _mm128_setzero_hi(),
(__v8hi) _mm_setzero_si128(),
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) __S,
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
{
return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
(__v16qi) _mm128_setzero_hi(),
(__v16qi) _mm_setzero_si128(),
__U);
}
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_setzero_hi(void) {
return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0 };
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
@ -160,15 +150,15 @@ _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_hi(),
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
@ -176,29 +166,29 @@ _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_hi(),
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
{
__builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
__U);
}
static __inline__ void __DEFAULT_FN_ATTRS
static __inline__ void __DEFAULT_FN_ATTRS256
_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
{
__builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
@ -206,15 +196,15 @@ _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
(__v16hi) _mm256_setzero_hi(),
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
@ -222,15 +212,15 @@ _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
{
return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
(__v32qi) _mm256_setzero_hi(),
(__v32qi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
@ -238,15 +228,15 @@ _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
(__v16hi) _mm256_setzero_hi(),
(__v16hi) _mm256_setzero_si256(),
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
@ -254,171 +244,183 @@ _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
{
return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
(__v32qi) _mm256_setzero_hi(),
(__v32qi) _mm256_setzero_si256(),
__U);
}
#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \
(__v4di)(B), \
(int)(I), \
(__v4di)(S), \
(__mmask8)(U)); })
#define _mm256_shldi_epi64(A, B, I) \
(__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I))
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
_mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256())
#define _mm256_shldi_epi64(A, B, I) \
_mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
#define _mm_shldi_epi64(A, B, I) \
(__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I))
#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \
(__v2di)(B), \
(int)(I), \
(__v2di)(S), \
(__mmask8)(U)); })
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S))
#define _mm128_maskz_shldi_epi64(U, A, B, I) \
_mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm128_shldi_epi64(A, B, I) \
_mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \
(__v8si)(B), \
(int)(I), \
(__v8si)(S), \
(__mmask8)(U)); })
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
_mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128())
#define _mm256_shldi_epi32(A, B, I) \
_mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
(__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I))
#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \
(__v4si)(B), \
(int)(I), \
(__v4si)(S), \
(__mmask8)(U)); })
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S))
#define _mm128_maskz_shldi_epi32(U, A, B, I) \
_mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256())
#define _mm128_shldi_epi32(A, B, I) \
_mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm_shldi_epi32(A, B, I) \
(__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \
(__v16hi)(B), \
(int)(I), \
(__v16hi)(S), \
(__mmask16)(U)); })
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
_mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128())
#define _mm256_shldi_epi16(A, B, I) \
_mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
(__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I))
#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \
(__v8hi)(B), \
(int)(I), \
(__v8hi)(S), \
(__mmask8)(U)); })
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S))
#define _mm128_maskz_shldi_epi16(U, A, B, I) \
_mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256())
#define _mm128_shldi_epi16(A, B, I) \
_mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm_shldi_epi16(A, B, I) \
(__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \
(__v4di)(B), \
(int)(I), \
(__v4di)(S), \
(__mmask8)(U)); })
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128())
#define _mm256_shrdi_epi64(A, B, I) \
_mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
(__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I))
#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \
(__v2di)(B), \
(int)(I), \
(__v2di)(S), \
(__mmask8)(U)); })
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S))
#define _mm128_maskz_shrdi_epi64(U, A, B, I) \
_mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256())
#define _mm128_shrdi_epi64(A, B, I) \
_mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm_shrdi_epi64(A, B, I) \
(__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \
(__v8si)(B), \
(int)(I), \
(__v8si)(S), \
(__mmask8)(U)); })
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128())
#define _mm256_shrdi_epi32(A, B, I) \
_mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
(__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I))
#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \
(__v4si)(B), \
(int)(I), \
(__v4si)(S), \
(__mmask8)(U)); })
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S))
#define _mm128_maskz_shrdi_epi32(U, A, B, I) \
_mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256())
#define _mm128_shrdi_epi32(A, B, I) \
_mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm_shrdi_epi32(A, B, I) \
(__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \
(__v16hi)(B), \
(int)(I), \
(__v16hi)(S), \
(__mmask16)(U)); })
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128())
#define _mm256_shrdi_epi16(A, B, I) \
_mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I))
(__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I))
#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \
(__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \
(__v8hi)(B), \
(int)(I), \
(__v8hi)(S), \
(__mmask8)(U)); })
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S))
#define _mm128_maskz_shrdi_epi16(U, A, B, I) \
_mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256())
#define _mm128_shrdi_epi16(A, B, I) \
_mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I))
#define _mm_shrdi_epi16(A, B, I) \
(__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I))
static __inline__ __m256i __DEFAULT_FN_ATTRS
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128())
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
@ -427,7 +429,7 @@ _mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S,
@ -436,7 +438,7 @@ _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S,
@ -445,8 +447,8 @@ _mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B)
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
(__v2di) __A,
@ -454,8 +456,8 @@ _mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S,
(__v2di) __A,
@ -463,8 +465,8 @@ _mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S,
(__v2di) __A,
@ -472,7 +474,7 @@ _mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
@ -481,7 +483,7 @@ _mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S,
@ -490,7 +492,7 @@ _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S,
@ -499,8 +501,8 @@ _mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B)
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
(__v4si) __A,
@ -508,8 +510,8 @@ _mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S,
(__v4si) __A,
@ -517,8 +519,8 @@ _mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S,
(__v4si) __A,
@ -526,7 +528,7 @@ _mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
@ -535,7 +537,7 @@ _mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S,
@ -544,7 +546,7 @@ _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S,
@ -553,8 +555,8 @@ _mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B)
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
(__v8hi) __A,
@ -562,8 +564,8 @@ _mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S,
(__v8hi) __A,
@ -571,8 +573,8 @@ _mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S,
(__v8hi) __A,
@ -580,7 +582,7 @@ _mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
@ -589,7 +591,7 @@ _mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S,
@ -598,7 +600,7 @@ _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S,
@ -607,8 +609,8 @@ _mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B)
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
(__v2di) __A,
@ -616,8 +618,8 @@ _mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S,
(__v2di) __A,
@ -625,8 +627,8 @@ _mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S,
(__v2di) __A,
@ -634,7 +636,7 @@ _mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
@ -643,7 +645,7 @@ _mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S,
@ -652,7 +654,7 @@ _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S,
@ -661,8 +663,8 @@ _mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B)
(__mmask8) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
(__v4si) __A,
@ -670,8 +672,8 @@ _mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S,
(__v4si) __A,
@ -679,8 +681,8 @@ _mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S,
(__v4si) __A,
@ -688,7 +690,7 @@ _mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B)
(__mmask8) -1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
@ -697,7 +699,7 @@ _mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S,
@ -706,7 +708,7 @@ _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B)
__U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S,
@ -715,8 +717,8 @@ _mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B)
(__mmask16) -1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
(__v8hi) __A,
@ -724,8 +726,8 @@ _mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S,
(__v8hi) __A,
@ -733,8 +735,8 @@ _mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
__U);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S,
(__v8hi) __A,
@ -743,6 +745,7 @@ _mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B)
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -29,226 +29,195 @@
#define __AVX512VLVNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
(__v8si) __A,
(__v8si) __B,
(__mmask8) -1);
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m256i)__builtin_ia32_selectd_256(__U,
(__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpbusds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)__S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) __U);
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssd_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
(__v4si) __A,
(__v4si) __B,
(__mmask8) -1);
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)__S);
}
#undef __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_selectd_128(__U,
(__v4si)_mm_dpwssds_epi32(__S, __A, __B),
(__v4si)_mm_setzero_si128());
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

View File

@ -29,117 +29,100 @@
#define __AVX512VNNIINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) __U);
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S,
(__v16si) __A,
(__v16si) __B,
(__mmask16) -1);
return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
(__v16si)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)__S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_selectd_512(__U,
(__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
(__v16si)_mm512_setzero_si512());
}
#undef __DEFAULT_FN_ATTRS

View File

@ -1,5 +1,4 @@
/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
*------------------===
/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@ -32,8 +31,7 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd" \
"q")))
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);

View File

@ -1,5 +1,4 @@
/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
*------------------===
/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@ -31,69 +30,76 @@
#define __AVX512VPOPCNTDQVLINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl")))
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) {
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectq_128(
(__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) {
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_popcnt_epi32(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_selectd_128(
(__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) {
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi64(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectq_256(
(__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) {
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi32(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_selectd_256(
(__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif

File diff suppressed because it is too large Load Diff

View File

@ -49,7 +49,7 @@
to use it as a potentially faster version of BSF. */
#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
/// \brief Counts the number of trailing zero bits in the operand.
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -65,7 +65,7 @@ __tzcnt_u16(unsigned short __X)
return __X ? __builtin_ctzs(__X) : 16;
}
/// \brief Performs a bitwise AND of the second operand with the one's
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
@ -85,7 +85,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
}
/* AMD-specified, double-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
@ -100,6 +100,7 @@ __andn_u32(unsigned int __X, unsigned int __Y)
/// number of bits to be extracted.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see _bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__bextr_u32(unsigned int __X, unsigned int __Y)
{
@ -107,7 +108,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
@ -124,13 +125,14 @@ __bextr_u32(unsigned int __X, unsigned int __Y)
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/// \brief Clears all bits in the source except for the least significant bit
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
@ -147,7 +149,7 @@ __blsi_u32(unsigned int __X)
return __X & -__X;
}
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
@ -164,7 +166,7 @@ __blsmsk_u32(unsigned int __X)
return __X ^ (__X - 1);
}
/// \brief Clears the least significant bit that is set to 1 in the source
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
@ -181,7 +183,7 @@ __blsr_u32(unsigned int __X)
return __X & (__X - 1);
}
/// \brief Counts the number of trailing zero bits in the operand.
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -197,7 +199,7 @@ __tzcnt_u32(unsigned int __X)
return __X ? __builtin_ctz(__X) : 32;
}
/// \brief Counts the number of trailing zero bits in the operand.
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -226,7 +228,7 @@ _mm_tzcnt_32(unsigned int __X)
#define _tzcnt_u64(a) (__tzcnt_u64((a)))
/// \brief Performs a bitwise AND of the second operand with the one's
/// Performs a bitwise AND of the second operand with the one's
/// complement of the first operand.
///
/// \headerfile <x86intrin.h>
@ -246,7 +248,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
}
/* AMD-specified, double-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
@ -261,6 +263,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y)
/// the number of bits to be extracted.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see _bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__bextr_u64(unsigned long long __X, unsigned long long __Y)
{
@ -268,7 +271,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
}
/* Intel-specified, single-leading-underscore version of BEXTR */
/// \brief Extracts the specified bits from the first operand and returns them
/// Extracts the specified bits from the first operand and returns them
/// in the least significant bits of the result.
///
/// \headerfile <x86intrin.h>
@ -285,13 +288,14 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y)
/// Bits [7:0] specify the number of bits.
/// \returns An unsigned 64-bit integer whose least significant bits contain the
/// extracted bits.
/// \see __bextr_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
{
return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
/// \brief Clears all bits in the source except for the least significant bit
/// Clears all bits in the source except for the least significant bit
/// containing a value of 1 and returns the result.
///
/// \headerfile <x86intrin.h>
@ -308,7 +312,7 @@ __blsi_u64(unsigned long long __X)
return __X & -__X;
}
/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
/// Creates a mask whose bits are set to 1, using bit 0 up to and
/// including the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
@ -325,7 +329,7 @@ __blsmsk_u64(unsigned long long __X)
return __X ^ (__X - 1);
}
/// \brief Clears the least significant bit that is set to 1 in the source
/// Clears the least significant bit that is set to 1 in the source
/// operand and returns the result.
///
/// \headerfile <x86intrin.h>
@ -342,7 +346,7 @@ __blsr_u64(unsigned long long __X)
return __X & (__X - 1);
}
/// \brief Counts the number of trailing zero bits in the operand.
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -358,7 +362,7 @@ __tzcnt_u64(unsigned long long __X)
return __X ? __builtin_ctzll(__X) : 64;
}
/// \brief Counts the number of trailing zero bits in the operand.
/// Counts the number of trailing zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///

View File

@ -1,4 +1,4 @@
/*===---- cetintrin.h - CET intrinsic ------------------------------------===
/*===---- cetintrin.h - CET intrinsic --------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -42,6 +42,16 @@ static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
}
#endif /* __x86_64__ */
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspq(__a);
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
__builtin_ia32_incsspd((int)__a);
}
#endif /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
@ -52,6 +62,16 @@ static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long lo
}
#endif /* __x86_64__ */
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
return __builtin_ia32_rdsspq(0);
}
#else /* __x86_64__ */
static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
return __builtin_ia32_rdsspd(0);
}
#endif /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
__builtin_ia32_saveprevssp();
}

View File

@ -0,0 +1,42 @@
/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __CLDEMOTEINTRIN_H
#define __CLDEMOTEINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("cldemote")))
static __inline__ void __DEFAULT_FN_ATTRS
_cldemote(const void * __P) {
__builtin_ia32_cldemote(__P);
}
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1,4 +1,4 @@
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------===
/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal

View File

@ -31,7 +31,7 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))
/// \brief Writes back to memory the cache line (if modified) that contains the
/// Writes back to memory the cache line (if modified) that contains the
/// linear address specified in \a __p from any level of the cache hierarchy in
/// the cache coherence domain
///

View File

@ -20,18 +20,18 @@
*
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _CLZEROINTRIN_H
#define _CLZEROINTRIN_H
#ifndef __CLZEROINTRIN_H
#define __CLZEROINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("clzero")))
/// \brief Loads the cache line address and zero's out the cacheline
/// Loads the cache line address and zero's out the cacheline
///
/// \headerfile <clzerointrin.h>
///
@ -45,6 +45,6 @@ _mm_clzero (void * __line)
__builtin_ia32_clzero ((void *)__line);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS
#endif /* _CLZEROINTRIN_H */
#endif /* __CLZEROINTRIN_H */

View File

@ -156,6 +156,7 @@
#define bit_SMEP 0x00000080
#define bit_BMI2 0x00000100
#define bit_ENH_MOVSB 0x00000200
#define bit_INVPCID 0x00000400
#define bit_RTM 0x00000800
#define bit_MPX 0x00004000
#define bit_AVX512F 0x00010000
@ -166,7 +167,7 @@
#define bit_CLFLUSHOPT 0x00800000
#define bit_CLWB 0x01000000
#define bit_AVX512PF 0x04000000
#define bit_AVX51SER 0x08000000
#define bit_AVX512ER 0x08000000
#define bit_AVX512CD 0x10000000
#define bit_SHA 0x20000000
#define bit_AVX512BW 0x40000000
@ -177,6 +178,7 @@
#define bit_AVX512VBMI 0x00000002
#define bit_PKU 0x00000004
#define bit_OSPKE 0x00000010
#define bit_WAITPKG 0x00000020
#define bit_AVX512VBMI2 0x00000040
#define bit_SHSTK 0x00000080
#define bit_GFNI 0x00000100
@ -186,10 +188,14 @@
#define bit_AVX512BITALG 0x00001000
#define bit_AVX512VPOPCNTDQ 0x00004000
#define bit_RDPID 0x00400000
#define bit_CLDEMOTE 0x02000000
#define bit_MOVDIRI 0x08000000
#define bit_MOVDIR64B 0x10000000
/* Features in %edx for leaf 7 sub-leaf 0 */
#define bit_AVX5124VNNIW 0x00000004
#define bit_AVX5124FMAPS 0x00000008
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
/* Features in %eax for leaf 13 sub-leaf 1 */
@ -197,6 +203,9 @@
#define bit_XSAVEC 0x00000002
#define bit_XSAVES 0x00000008
/* Features in %eax for leaf 0x14 sub-leaf 0 */
#define bit_PTWRITE 0x00000010
/* Features in %ecx for leaf 0x80000001 */
#define bit_LAHF_LM 0x00000001
#define bit_ABM 0x00000020
@ -215,8 +224,9 @@
#define bit_3DNOWP 0x40000000
#define bit_3DNOW 0x80000000
/* Features in %ebx for leaf 0x80000001 */
/* Features in %ebx for leaf 0x80000008 */
#define bit_CLZERO 0x00000001
#define bit_WBNOINVD 0x00000200
#if __i386__

View File

@ -24,28 +24,36 @@
#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
#define __CLANG_CUDA_WRAPPERS_ALGORITHM
// This header defines __device__ overloads of std::min/max, but only if we're
// <= C++11. In C++14, these functions are constexpr, and so are implicitly
// __host__ __device__.
// This header defines __device__ overloads of std::min/max.
//
// We don't support the initializer_list overloads because
// initializer_list::begin() and end() are not __host__ __device__ functions.
// Ideally we'd declare these functions only if we're <= C++11. In C++14,
// these functions are constexpr, and so are implicitly __host__ __device__.
//
// When compiling in C++14 mode, we could force std::min/max to have different
// implementations for host and device, by declaring the device overloads
// before the constexpr overloads appear. We choose not to do this because
// a) why write our own implementation when we can use one from the standard
// library? and
// b) libstdc++ is evil and declares min/max inside a header that is included
// *before* we include <algorithm>. So we'd have to unconditionally
// declare our __device__ overloads of min/max, but that would pollute
// things for people who choose not to include <algorithm>.
// However, the compiler being in C++14 mode does not imply that the standard
// library supports C++14. There is no macro we can test to check that the
// stdlib has constexpr std::min/max. Thus we have to unconditionally define
// our device overloads.
//
// A host+device function cannot be overloaded, and a constexpr function
// implicitly become host device if there's no explicitly host or device
// overload preceding it. So the simple thing to do would be to declare our
// device min/max overloads, and then #include_next <algorithm>. This way our
// device overloads would come first, and so if we have a C++14 stdlib, its
// min/max won't become host+device and conflict with our device overloads.
//
// But that also doesn't work. libstdc++ is evil and declares std::min/max in
// an internal header that is included *before* <algorithm>. Thus by the time
// we're inside of this file, std::min/max may already have been declared, and
// thus we can't prevent them from becoming host+device if they're constexpr.
//
// Therefore we perpetrate the following hack: We mark our __device__ overloads
// with __attribute__((enable_if(true, ""))). This causes the signature of the
// function to change without changing anything else about it. (Except that
// overload resolution will prefer it over the __host__ __device__ version
// rather than considering them equally good).
#include_next <algorithm>
#if __cplusplus <= 201103L
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
@ -59,30 +67,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
#pragma push_macro("_CPP14_CONSTEXPR")
#if __cplusplus >= 201402L
#define _CPP14_CONSTEXPR constexpr
#else
#define _CPP14_CONSTEXPR
#endif
template <class __T, class __Cmp>
inline __device__ const __T &
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__a, __b) ? __b : __a;
}
template <class __T>
inline __device__ const __T &
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
}
template <class __T, class __Cmp>
inline __device__ const __T &
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__b, __a) ? __b : __a;
}
template <class __T>
inline __device__ const __T &
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b) {
return __a < __b ? __a : __b;
}
#pragma pop_macro("_CPP14_CONSTEXPR")
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
@ -92,5 +113,4 @@ _GLIBCXX_END_NAMESPACE_VERSION
} // namespace std
#endif
#endif // __cplusplus <= 201103L
#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM

File diff suppressed because it is too large Load Diff

View File

@ -21,18 +21,25 @@
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead."
#if !defined __IMMINTRIN_H
#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __F16CINTRIN_H
#define __F16CINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("f16c")))
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
__attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
/// \brief Converts a 16-bit half-precision float value into a 32-bit float
/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
* but that's because icc can emulate these without f16c using a library call.
* Since we don't do that let's leave these in f16cintrin.h.
*/
/// Converts a 16-bit half-precision float value into a 32-bit float
/// value.
///
/// \headerfile <x86intrin.h>
@ -42,7 +49,7 @@
/// \param __a
/// A 16-bit half-precision float value.
/// \returns The converted 32-bit float value.
static __inline float __DEFAULT_FN_ATTRS
static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
@ -50,7 +57,7 @@ _cvtsh_ss(unsigned short __a)
return r[0];
}
/// \brief Converts a 32-bit single-precision float value to a 16-bit
/// Converts a 32-bit single-precision float value to a 16-bit
/// half-precision float value.
///
/// \headerfile <x86intrin.h>
@ -72,11 +79,11 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) __extension__ ({ \
#define _cvtss_sh(a, imm) \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]); })
(imm)))[0])
/// \brief Converts a 128-bit vector containing 32-bit float values into a
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
@ -99,10 +106,10 @@ _cvtsh_ss(unsigned short __a)
/// \returns A 128-bit vector containing converted 16-bit half-precision float
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) __extension__ ({ \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
#define _mm_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
///
/// \headerfile <x86intrin.h>
@ -113,12 +120,57 @@ _cvtsh_ss(unsigned short __a)
/// A 128-bit vector containing 16-bit half-precision float values. The lower
/// 64 bits are used in the conversion.
/// \returns A 128-bit vector of [4 x float] containing converted float values.
static __inline __m128 __DEFAULT_FN_ATTRS
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
}
#undef __DEFAULT_FN_ATTRS
/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
/// containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 256-bit vector containing 32-bit single-precision float values to be
/// converted to 16-bit half-precision float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values to be
/// converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
/// single-precision float values.
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __F16CINTRIN_H */

View File

@ -31,200 +31,202 @@
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __FMA4INTRIN_H */

View File

@ -1,4 +1,4 @@
/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -29,200 +29,202 @@
#define __FMAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma")))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
}
static __inline__ __m128 __DEFAULT_FN_ATTRS
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
{
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
}
static __inline__ __m128d __DEFAULT_FN_ATTRS
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
{
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
{
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
{
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif /* __FMAINTRIN_H */

View File

@ -30,7 +30,7 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
@ -43,10 +43,10 @@
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave(void *__p)
{
return __builtin_ia32_fxsave(__p);
__builtin_ia32_fxsave(__p);
}
/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
@ -61,11 +61,11 @@ _fxsave(void *__p)
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor(void *__p)
{
return __builtin_ia32_fxrstor(__p);
__builtin_ia32_fxrstor(__p);
}
#ifdef __x86_64__
/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
/// memory region pointed to by the input parameter \a __p.
///
/// \headerfile <x86intrin.h>
@ -78,10 +78,10 @@ _fxrstor(void *__p)
static __inline__ void __DEFAULT_FN_ATTRS
_fxsave64(void *__p)
{
return __builtin_ia32_fxsave64(__p);
__builtin_ia32_fxsave64(__p);
}
/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
/// memory region pointed to by the input parameter \a __p. The contents of
/// this memory region should have been written to by a previous \c _fxsave
/// or \c _fxsave64 intrinsic.
@ -96,7 +96,7 @@ _fxsave64(void *__p)
static __inline__ void __DEFAULT_FN_ATTRS
_fxrstor64(void *__p)
{
return __builtin_ia32_fxrstor64(__p);
__builtin_ia32_fxrstor64(__p);
}
#endif

View File

@ -29,104 +29,108 @@
#define __GFNIINTRIN_H
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)); })
(char)(I))
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); })
(__v16qi)(__m128i)(S))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); })
U, A, B, I)
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)); })
(char)(I))
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); })
(__v32qi)(__m256i)(S))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); })
U, A, B, I)
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)); })
(char)(I))
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); })
(__v64qi)(__m512i)(S))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \
U, A, B, I); })
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I)); })
(char)(I))
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S)); })
(__v16qi)(__m128i)(S))
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I); })
U, A, B, I)
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I)); })
(char)(I))
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S)); })
(__v32qi)(__m256i)(S))
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I); })
U, A, B, I)
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I)); })
(char)(I))
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S)); })
(__v64qi)(__m512i)(S))
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \
U, A, B, I); })
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
/* Default attributes for simple form (no masking). */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni")))
#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
/* Default attributes for VLX forms. */
#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni")))
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
@ -135,7 +139,7 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
(__v16qi) __B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
{
return (__m128i) __builtin_ia32_selectb_128(__U,
@ -143,21 +147,21 @@ _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
(__v16qi) __S);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
__U, __A, __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
(__v32qi) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
{
return (__m256i) __builtin_ia32_selectb_256(__U,
@ -165,21 +169,21 @@ _mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
(__v32qi) __S);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL
static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
__U, __A, __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
(__v64qi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
@ -187,16 +191,18 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
(__v64qi) __S);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(),
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
__U, __A, __B);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#undef __DEFAULT_FN_ATTRS_VL
#undef __DEFAULT_FN_ATTRS_Y
#undef __DEFAULT_FN_ATTRS_Z
#undef __DEFAULT_FN_ATTRS_VL128
#undef __DEFAULT_FN_ATTRS_VL256
#endif // __GFNIINTRIN_H
#endif /* __GFNIINTRIN_H */

View File

@ -214,7 +214,7 @@ __TM_failure_code(void* const __TM_buff)
/* These intrinsics are being made available for compatibility with
the IBM XL compiler. For documentation please see the "z/OS XL
C/C++ Programming Guide" publically available on the web. */
C/C++ Programming Guide" publicly available on the web. */
static __inline long __attribute__((__always_inline__, __nodebug__))
__TM_simple_begin ()

View File

@ -70,4 +70,9 @@ __rdtscp(unsigned int *__A) {
#define _rdpmc(A) __rdpmc(A)
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_wbinvd(void) {
__builtin_ia32_wbinvd();
}
#endif /* __IA32INTRIN_H */

View File

@ -68,55 +68,11 @@
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__)
#include <avx2intrin.h>
#endif
/* The 256-bit versions of functions in f16cintrin.h.
Intel documents these as being in immintrin.h, and
they depend on typedefs from avxintrin.h. */
/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector
/// containing 16-bit half-precision float values.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
///
/// \param a
/// A 256-bit vector containing 32-bit single-precision float values to be
/// converted to 16-bit half-precision float values.
/// \param imm
/// An immediate value controlling rounding using bits [2:0]: \n
/// 000: Nearest \n
/// 001: Down \n
/// 010: Up \n
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) __extension__ ({ \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); })
/// \brief Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
///
/// \param __a
/// A 128-bit vector containing 16-bit half-precision float values to be
/// converted to 32-bit single-precision float values.
/// \returns A vector of [8 x float] containing the converted 32-bit
/// single-precision float values.
static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c")))
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
}
#endif /* __AVX2__ */
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
#include <f16cintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
@ -134,6 +90,10 @@ _mm256_cvtph_ps(__m128i __a)
#include <lzcntintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
#include <popcntintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__)
#include <fmaintrin.h>
#endif
@ -247,6 +207,18 @@ _mm256_cvtph_ps(__m128i __a)
#include <gfniintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> RDPID </c> instruction.
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
_rdpid_u32(void) {
return __builtin_ia32_rdpid();
}
#endif // __RDPID__
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__)
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
@ -310,25 +282,25 @@ _readgsbase_u64(void)
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u32(unsigned int __V)
{
return __builtin_ia32_wrfsbase32(__V);
__builtin_ia32_wrfsbase32(__V);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writefsbase_u64(unsigned long long __V)
{
return __builtin_ia32_wrfsbase64(__V);
__builtin_ia32_wrfsbase64(__V);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u32(unsigned int __V)
{
return __builtin_ia32_wrgsbase32(__V);
__builtin_ia32_wrgsbase32(__V);
}
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
_writegsbase_u64(unsigned long long __V)
{
return __builtin_ia32_wrgsbase64(__V);
__builtin_ia32_wrgsbase64(__V);
}
#endif
@ -371,4 +343,125 @@ _writegsbase_u64(unsigned long long __V)
* whereas others are also available at all times. */
#include <adxintrin.h>
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
#include <rdseedintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WBNOINVD__)
#include <wbnoinvdintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLDEMOTE__)
#include <cldemoteintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WAITPKG__)
#include <waitpkgintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || \
defined(__MOVDIRI__) || defined(__MOVDIR64B__)
#include <movdirintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PCONFIG__)
#include <pconfigintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SGX__)
#include <sgxintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PTWRITE__)
#include <ptwriteintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
#ifdef _MSC_VER
/* Define the default attributes for these intrinsics */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#ifdef __cplusplus
extern "C" {
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange HLE
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Compare Exchange HLE
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
#endif
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
}
#endif
#ifdef __cplusplus
}
#endif
#undef __DEFAULT_FN_ATTRS
#endif /* _MSC_VER */
#endif /* __IMMINTRIN_H */

View File

@ -38,7 +38,7 @@
#include <armintr.h>
#endif
#if defined(_M_ARM64)
#if defined(__aarch64__)
#include <arm64intr.h>
#endif
@ -83,6 +83,7 @@ void __incfsdword(unsigned long);
void __incfsword(unsigned long);
unsigned long __indword(unsigned short);
void __indwordstring(unsigned short, unsigned long *, unsigned long);
void __int2c(void);
void __invlpg(void *);
unsigned short __inword(unsigned short);
void __inwordstring(unsigned short, unsigned short *, unsigned long);
@ -140,6 +141,7 @@ void __svm_stgi(void);
void __svm_vmload(size_t);
void __svm_vmrun(size_t);
void __svm_vmsave(size_t);
void __ud2(void);
unsigned __int64 __ull_rshift(unsigned __int64, int);
void __vmx_off(void);
void __vmx_vmptrst(unsigned __int64 *);
@ -161,25 +163,15 @@ static __inline__
unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask);
static __inline__
unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask);
static __inline__
unsigned char _bittest(long const *, long);
static __inline__
unsigned char _bittestandcomplement(long *, long);
static __inline__
unsigned char _bittestandreset(long *, long);
static __inline__
unsigned char _bittestandset(long *, long);
void __cdecl _disable(void);
void __cdecl _enable(void);
long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
unsigned char _interlockedbittestandreset(long volatile *, long);
unsigned char _interlockedbittestandset(long volatile *, long);
long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64,
__int64);
__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
__int64);
void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *,
void *);
void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *,
@ -256,24 +248,15 @@ void __writegsbyte(unsigned long, unsigned char);
void __writegsdword(unsigned long, unsigned long);
void __writegsqword(unsigned long, unsigned __int64);
void __writegsword(unsigned long, unsigned short);
static __inline__
unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
static __inline__
unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
static __inline__
unsigned char _bittest64(__int64 const *, __int64);
static __inline__
unsigned char _bittestandcomplement64(__int64 *, __int64);
static __inline__
unsigned char _bittestandreset64(__int64 *, __int64);
static __inline__
unsigned char _bittestandset64(__int64 *, __int64);
long _InterlockedAnd_np(long volatile *_Value, long _Mask);
short _InterlockedAnd16_np(short volatile *_Value, short _Mask);
__int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask);
char _InterlockedAnd8_np(char volatile *_Value, char _Mask);
unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64);
static __inline__
unsigned char _interlockedbittestandset64(__int64 volatile *, __int64);
long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange,
long _Comparand);
@ -287,10 +270,6 @@ unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination,
__int64 *_ComparandResult);
short _InterlockedCompareExchange16_np(short volatile *_Destination,
short _Exchange, short _Comparand);
__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64,
__int64);
__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64,
__int64);
__int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand);
void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination,
@ -320,7 +299,12 @@ unsigned __int64 _umul128(unsigned __int64,
#endif /* __x86_64__ */
#if defined(__x86_64__) || defined(__arm__)
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
static __inline__
unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask);
static __inline__
unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask);
static __inline__
__int64 _InterlockedDecrement64(__int64 volatile *_Addend);
@ -341,78 +325,6 @@ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask);
#endif
/*----------------------------------------------------------------------------*\
|* Bit Counting and Testing
\*----------------------------------------------------------------------------*/
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittest(long const *_BitBase, long _BitPos) {
return (*_BitBase >> _BitPos) & 1;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittestandcomplement(long *_BitBase, long _BitPos) {
unsigned char _Res = (*_BitBase >> _BitPos) & 1;
*_BitBase = *_BitBase ^ (1 << _BitPos);
return _Res;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittestandreset(long *_BitBase, long _BitPos) {
unsigned char _Res = (*_BitBase >> _BitPos) & 1;
*_BitBase = *_BitBase & ~(1 << _BitPos);
return _Res;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittestandset(long *_BitBase, long _BitPos) {
unsigned char _Res = (*_BitBase >> _BitPos) & 1;
*_BitBase = *_BitBase | (1 << _BitPos);
return _Res;
}
#if defined(__arm__) || defined(__aarch64__)
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE);
return (_PrevVal >> _BitPos) & 1;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) {
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED);
return (_PrevVal >> _BitPos) & 1;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) {
long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE);
return (_PrevVal >> _BitPos) & 1;
}
#endif
#ifdef __x86_64__
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittest64(__int64 const *_BitBase, __int64 _BitPos) {
return (*_BitBase >> _BitPos) & 1;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) {
unsigned char _Res = (*_BitBase >> _BitPos) & 1;
*_BitBase = *_BitBase ^ (1ll << _BitPos);
return _Res;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) {
unsigned char _Res = (*_BitBase >> _BitPos) & 1;
*_BitBase = *_BitBase & ~(1ll << _BitPos);
return _Res;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_bittestandset64(__int64 *_BitBase, __int64 _BitPos) {
unsigned char _Res = (*_BitBase >> _BitPos) & 1;
*_BitBase = *_BitBase | (1ll << _BitPos);
return _Res;
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) {
long long _PrevVal =
__atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST);
return (_PrevVal >> _BitPos) & 1;
}
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Exchange Add
\*----------------------------------------------------------------------------*/
@ -602,6 +514,23 @@ _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) {
}
#endif
/*----------------------------------------------------------------------------*\
|* Bit Counting and Testing
\*----------------------------------------------------------------------------*/
#if defined(__arm__) || defined(__aarch64__)
unsigned char _interlockedbittestandset_acq(long volatile *_BitBase,
long _BitPos);
unsigned char _interlockedbittestandset_nf(long volatile *_BitBase,
long _BitPos);
unsigned char _interlockedbittestandset_rel(long volatile *_BitBase,
long _BitPos);
unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase,
long _BitPos);
unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase,
long _BitPos);
unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase,
long _BitPos);
#endif
/*----------------------------------------------------------------------------*\
|* Interlocked Or
\*----------------------------------------------------------------------------*/
#if defined(__arm__) || defined(__aarch64__)
@ -868,33 +797,40 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
#if defined(__i386__) || defined(__x86_64__)
static __inline__ void __DEFAULT_FN_ATTRS
__movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) {
__asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n));
__asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
: : "memory");
}
static __inline__ void __DEFAULT_FN_ATTRS
__movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) {
__asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n));
__asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n)
: : "memory");
}
static __inline__ void __DEFAULT_FN_ATTRS
__movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) {
__asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n));
__asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n)
: : "memory");
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosd(unsigned long *__dst, unsigned long __x, size_t __n) {
__asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n));
__asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x)
: "memory");
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosw(unsigned short *__dst, unsigned short __x, size_t __n) {
__asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n));
__asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x)
: "memory");
}
#endif
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
__movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) {
__asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n));
__asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n)
: : "memory");
}
static __inline__ void __DEFAULT_FN_ATTRS
__stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) {
__asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n));
__asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x)
: "memory");
}
#endif
@ -927,6 +863,20 @@ __nop(void) {
__asm__ volatile ("nop");
}
#endif
#if defined(__x86_64__)
static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
__shiftleft128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) {
unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l;
unsigned __int128 __res = __val << (__d & 63);
return (unsigned __int64)(__res >> 64);
}
static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
__shiftright128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) {
unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l;
unsigned __int128 __res = __val >> (__d & 63);
return (unsigned __int64)__res;
}
#endif
/*----------------------------------------------------------------------------*\
|* Privileged intrinsics

37
c_headers/invpcidintrin.h Normal file
View File

@ -0,0 +1,37 @@
/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __INVPCIDINTRIN_H
#define __INVPCIDINTRIN_H
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("invpcid")))
_invpcid(unsigned int __type, void *__descriptor) {
__builtin_ia32_invpcid(__type, __descriptor);
}
#endif /* __INVPCIDINTRIN_H */

View File

@ -31,7 +31,7 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
/// \brief Parses the LWPCB at the specified address and enables
/// Parses the LWPCB at the specified address and enables
/// profiling if valid.
///
/// \headerfile <x86intrin.h>
@ -48,7 +48,7 @@ __llwpcb (void *__addr)
__builtin_ia32_llwpcb(__addr);
}
/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
/// Flushes the LWP state to memory and returns the address of the LWPCB.
///
/// \headerfile <x86intrin.h>
///
@ -58,12 +58,12 @@ __llwpcb (void *__addr)
/// Address to the current Lightweight Profiling Control Block (LWPCB).
/// If LWP is not currently enabled, returns NULL.
static __inline__ void* __DEFAULT_FN_ATTRS
__slwpcb ()
__slwpcb (void)
{
return __builtin_ia32_slwpcb();
}
/// \brief Inserts programmed event record into the LWP event ring buffer
/// Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
@ -84,7 +84,7 @@ __slwpcb ()
(__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
/// \brief Decrements the LWP programmed value sample event counter. If the result is
/// Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///
@ -104,7 +104,7 @@ __slwpcb ()
#ifdef __x86_64__
/// \brief Inserts programmed event record into the LWP event ring buffer
/// Inserts programmed event record into the LWP event ring buffer
/// and advances the ring buffer pointer.
///
/// \headerfile <x86intrin.h>
@ -125,7 +125,7 @@ __slwpcb ()
(__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
(unsigned int) (FLAGS)))
/// \brief Decrements the LWP programmed value sample event counter. If the result is
/// Decrements the LWP programmed value sample event counter. If the result is
/// negative, inserts an event record into the LWP event ring buffer in memory
/// and advances the ring buffer pointer.
///

View File

@ -31,7 +31,7 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
/// \brief Counts the number of leading zero bits in the operand.
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -47,7 +47,7 @@ __lzcnt16(unsigned short __X)
return __X ? __builtin_clzs(__X) : 16;
}
/// \brief Counts the number of leading zero bits in the operand.
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -57,13 +57,14 @@ __lzcnt16(unsigned short __X)
/// An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
/// bits in the operand.
/// \see _lzcnt_u32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
__lzcnt32(unsigned int __X)
{
return __X ? __builtin_clz(__X) : 32;
}
/// \brief Counts the number of leading zero bits in the operand.
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -73,6 +74,7 @@ __lzcnt32(unsigned int __X)
/// An unsigned 32-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of leading zero
/// bits in the operand.
/// \see __lzcnt32
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_lzcnt_u32(unsigned int __X)
{
@ -80,7 +82,7 @@ _lzcnt_u32(unsigned int __X)
}
#ifdef __x86_64__
/// \brief Counts the number of leading zero bits in the operand.
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -90,13 +92,14 @@ _lzcnt_u32(unsigned int __X)
/// An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
/// bits in the operand.
/// \see _lzcnt_u64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__lzcnt64(unsigned long long __X)
{
return __X ? __builtin_clzll(__X) : 64;
}
/// \brief Counts the number of leading zero bits in the operand.
/// Counts the number of leading zero bits in the operand.
///
/// \headerfile <x86intrin.h>
///
@ -106,6 +109,7 @@ __lzcnt64(unsigned long long __X)
/// An unsigned 64-bit integer whose leading zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of leading zero
/// bits in the operand.
/// \see __lzcnt64
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_lzcnt_u64(unsigned long long __X)
{

View File

@ -30,9 +30,9 @@
typedef float __v2sf __attribute__((__vector_size__(8)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))
static __inline__ void __DEFAULT_FN_ATTRS
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
_m_femms(void) {
__builtin_ia32_femms();
}
@ -134,7 +134,7 @@ _m_pmulhrw(__m64 __m1, __m64 __m2) {
/* Handle the 3dnowa instructions here. */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))
static __inline__ __m64 __DEFAULT_FN_ATTRS
_m_pf2iw(__m64 __m) {

View File

@ -32,27 +32,27 @@ typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
/// \brief Clears the MMX state by setting the state of the x87 stack registers
/// Clears the MMX state by setting the state of the x87 stack registers
/// to empty.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void __DEFAULT_FN_ATTRS
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void)
{
__builtin_ia32_emms();
}
/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the
/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
/// value of the 32-bit integer parameter and setting the upper 32 bits to 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
/// This intrinsic corresponds to the <c> MOVD </c> instruction.
///
/// \param __i
/// A 32-bit integer value.
@ -64,12 +64,12 @@ _mm_cvtsi32_si64(int __i)
return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
}
/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
/// signed integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
/// This intrinsic corresponds to the <c> MOVD </c> instruction.
///
/// \param __m
/// A 64-bit integer vector.
@ -81,11 +81,11 @@ _mm_cvtsi64_si32(__m64 __m)
return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
}
/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector.
/// Casts a 64-bit signed integer value into a 64-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
///
/// \param __i
/// A 64-bit signed integer.
@ -97,11 +97,11 @@ _mm_cvtsi64_m64(long long __i)
return (__m64)__i;
}
/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value.
/// Casts a 64-bit integer vector into a 64-bit signed integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction.
/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
///
/// \param __m
/// A 64-bit integer vector.
@ -113,7 +113,7 @@ _mm_cvtm64_si64(__m64 __m)
return (long long)__m;
}
/// \brief Converts 16-bit signed integers from both 64-bit integer vector
/// Converts 16-bit signed integers from both 64-bit integer vector
/// parameters of [4 x i16] into 8-bit signed integer values, and constructs
/// a 64-bit integer vector of [8 x i8] as the result. Positive values
/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
@ -143,7 +143,7 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Converts 32-bit signed integers from both 64-bit integer vector
/// Converts 32-bit signed integers from both 64-bit integer vector
/// parameters of [2 x i32] into 16-bit signed integer values, and constructs
/// a 64-bit integer vector of [4 x i16] as the result. Positive values
/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
@ -173,7 +173,7 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
}
/// \brief Converts 16-bit signed integers from both 64-bit integer vector
/// Converts 16-bit signed integers from both 64-bit integer vector
/// parameters of [4 x i16] into 8-bit unsigned integer values, and
/// constructs a 64-bit integer vector of [8 x i8] as the result. Values
/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
@ -203,7 +203,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
/// and interleaves them into a 64-bit integer vector of [8 x i8].
///
/// \headerfile <x86intrin.h>
@ -230,7 +230,7 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
@ -253,7 +253,7 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of
/// Unpacks the upper 32 bits from two 64-bit integer vectors of
/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
///
/// \headerfile <x86intrin.h>
@ -274,7 +274,7 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
}
/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
/// and interleaves them into a 64-bit integer vector of [8 x i8].
///
/// \headerfile <x86intrin.h>
@ -301,7 +301,7 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
///
/// \headerfile <x86intrin.h>
@ -324,7 +324,7 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of
/// Unpacks the lower 32 bits from two 64-bit integer vectors of
/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
///
/// \headerfile <x86intrin.h>
@ -345,7 +345,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
}
/// \brief Adds each 8-bit integer element of the first 64-bit integer vector
/// Adds each 8-bit integer element of the first 64-bit integer vector
/// of [8 x i8] to the corresponding 8-bit integer element of the second
/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
/// packed into a 64-bit integer vector of [8 x i8].
@ -366,7 +366,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Adds each 16-bit integer element of the first 64-bit integer vector
/// Adds each 16-bit integer element of the first 64-bit integer vector
/// of [4 x i16] to the corresponding 16-bit integer element of the second
/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
/// packed into a 64-bit integer vector of [4 x i16].
@ -387,7 +387,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Adds each 32-bit integer element of the first 64-bit integer vector
/// Adds each 32-bit integer element of the first 64-bit integer vector
/// of [2 x i32] to the corresponding 32-bit integer element of the second
/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
/// packed into a 64-bit integer vector of [2 x i32].
@ -408,7 +408,7 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
}
/// \brief Adds each 8-bit signed integer element of the first 64-bit integer
/// Adds each 8-bit signed integer element of the first 64-bit integer
/// vector of [8 x i8] to the corresponding 8-bit signed integer element of
/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than
/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
@ -430,7 +430,7 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Adds each 16-bit signed integer element of the first 64-bit integer
/// Adds each 16-bit signed integer element of the first 64-bit integer
/// vector of [4 x i16] to the corresponding 16-bit signed integer element of
/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than
/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
@ -453,7 +453,7 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer
/// Adds each 8-bit unsigned integer element of the first 64-bit integer
/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
/// saturated to 0xFF. The results are packed into a 64-bit integer vector of
@ -475,7 +475,7 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer
/// Adds each 16-bit unsigned integer element of the first 64-bit integer
/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element
/// of the second 64-bit integer vector of [4 x i16]. Sums greater than
/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
@ -497,7 +497,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Subtracts each 8-bit integer element of the second 64-bit integer
/// Subtracts each 8-bit integer element of the second 64-bit integer
/// vector of [8 x i8] from the corresponding 8-bit integer element of the
/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
/// are packed into a 64-bit integer vector of [8 x i8].
@ -518,7 +518,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Subtracts each 16-bit integer element of the second 64-bit integer
/// Subtracts each 16-bit integer element of the second 64-bit integer
/// vector of [4 x i16] from the corresponding 16-bit integer element of the
/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
/// results are packed into a 64-bit integer vector of [4 x i16].
@ -539,7 +539,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Subtracts each 32-bit integer element of the second 64-bit integer
/// Subtracts each 32-bit integer element of the second 64-bit integer
/// vector of [2 x i32] from the corresponding 32-bit integer element of the
/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
/// results are packed into a 64-bit integer vector of [2 x i32].
@ -560,7 +560,7 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
}
/// \brief Subtracts each 8-bit signed integer element of the second 64-bit
/// Subtracts each 8-bit signed integer element of the second 64-bit
/// integer vector of [8 x i8] from the corresponding 8-bit signed integer
/// element of the first 64-bit integer vector of [8 x i8]. Positive results
/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
@ -583,7 +583,7 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Subtracts each 16-bit signed integer element of the second 64-bit
/// Subtracts each 16-bit signed integer element of the second 64-bit
/// integer vector of [4 x i16] from the corresponding 16-bit signed integer
/// element of the first 64-bit integer vector of [4 x i16]. Positive results
/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
@ -606,7 +606,7 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
/// Subtracts each 8-bit unsigned integer element of the second 64-bit
/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
/// element of the first 64-bit integer vector of [8 x i8].
///
@ -630,7 +630,7 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
/// Subtracts each 16-bit unsigned integer element of the second 64-bit
/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
/// integer element of the first 64-bit integer vector of [4 x i16].
///
@ -654,7 +654,7 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
/// Multiplies each 16-bit signed integer element of the first 64-bit
/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
/// element of the second 64-bit integer vector of [4 x i16] and get four
/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
@ -681,7 +681,7 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
/// Multiplies each 16-bit signed integer element of the first 64-bit
/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper
/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
@ -702,7 +702,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Multiplies each 16-bit signed integer element of the first 64-bit
/// Multiplies each 16-bit signed integer element of the first 64-bit
/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower
/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
@ -723,7 +723,7 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Left-shifts each 16-bit signed integer element of the first
/// Left-shifts each 16-bit signed integer element of the first
/// parameter, which is a 64-bit integer vector of [4 x i16], by the number
/// of bits specified by the second parameter, which is a 64-bit integer. The
/// lower 16 bits of the results are packed into a 64-bit integer vector of
@ -746,7 +746,7 @@ _mm_sll_pi16(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
}
/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer
/// Left-shifts each 16-bit signed integer element of a 64-bit integer
/// vector of [4 x i16] by the number of bits specified by a 32-bit integer.
/// The lower 16 bits of the results are packed into a 64-bit integer vector
/// of [4 x i16].
@ -768,7 +768,7 @@ _mm_slli_pi16(__m64 __m, int __count)
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
}
/// \brief Left-shifts each 32-bit signed integer element of the first
/// Left-shifts each 32-bit signed integer element of the first
/// parameter, which is a 64-bit integer vector of [2 x i32], by the number
/// of bits specified by the second parameter, which is a 64-bit integer. The
/// lower 32 bits of the results are packed into a 64-bit integer vector of
@ -791,7 +791,7 @@ _mm_sll_pi32(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
}
/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer
/// Left-shifts each 32-bit signed integer element of a 64-bit integer
/// vector of [2 x i32] by the number of bits specified by a 32-bit integer.
/// The lower 32 bits of the results are packed into a 64-bit integer vector
/// of [2 x i32].
@ -813,7 +813,7 @@ _mm_slli_pi32(__m64 __m, int __count)
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
}
/// \brief Left-shifts the first 64-bit integer parameter by the number of bits
/// Left-shifts the first 64-bit integer parameter by the number of bits
/// specified by the second 64-bit integer parameter. The lower 64 bits of
/// result are returned.
///
@ -833,7 +833,7 @@ _mm_sll_si64(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
}
/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the
/// Left-shifts the first parameter, which is a 64-bit integer, by the
/// number of bits specified by the second parameter, which is a 32-bit
/// integer. The lower 64 bits of result are returned.
///
@ -853,7 +853,7 @@ _mm_slli_si64(__m64 __m, int __count)
return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
}
/// \brief Right-shifts each 16-bit integer element of the first parameter,
/// Right-shifts each 16-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [4 x i16], by the number of bits
/// specified by the second parameter, which is a 64-bit integer.
///
@ -877,7 +877,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
}
/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
/// Right-shifts each 16-bit integer element of a 64-bit integer vector
/// of [4 x i16] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are filled with the sign bit of the initial value of each
@ -900,7 +900,7 @@ _mm_srai_pi16(__m64 __m, int __count)
return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
}
/// \brief Right-shifts each 32-bit integer element of the first parameter,
/// Right-shifts each 32-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [2 x i32], by the number of bits
/// specified by the second parameter, which is a 64-bit integer.
///
@ -924,7 +924,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
}
/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
/// Right-shifts each 32-bit integer element of a 64-bit integer vector
/// of [2 x i32] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are filled with the sign bit of the initial value of each
@ -947,7 +947,7 @@ _mm_srai_pi32(__m64 __m, int __count)
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
}
/// \brief Right-shifts each 16-bit integer element of the first parameter,
/// Right-shifts each 16-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [4 x i16], by the number of bits
/// specified by the second parameter, which is a 64-bit integer.
///
@ -970,7 +970,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
}
/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
/// Right-shifts each 16-bit integer element of a 64-bit integer vector
/// of [4 x i16] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
@ -992,7 +992,7 @@ _mm_srli_pi16(__m64 __m, int __count)
return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
}
/// \brief Right-shifts each 32-bit integer element of the first parameter,
/// Right-shifts each 32-bit integer element of the first parameter,
/// which is a 64-bit integer vector of [2 x i32], by the number of bits
/// specified by the second parameter, which is a 64-bit integer.
///
@ -1015,7 +1015,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
}
/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
/// Right-shifts each 32-bit integer element of a 64-bit integer vector
/// of [2 x i32] by the number of bits specified by a 32-bit integer.
///
/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
@ -1037,7 +1037,7 @@ _mm_srli_pi32(__m64 __m, int __count)
return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
}
/// \brief Right-shifts the first 64-bit integer parameter by the number of bits
/// Right-shifts the first 64-bit integer parameter by the number of bits
/// specified by the second 64-bit integer parameter.
///
/// High-order bits are cleared.
@ -1057,7 +1057,7 @@ _mm_srl_si64(__m64 __m, __m64 __count)
return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
}
/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
/// Right-shifts the first parameter, which is a 64-bit integer, by the
/// number of bits specified by the second parameter, which is a 32-bit
/// integer.
///
@ -1078,7 +1078,7 @@ _mm_srli_si64(__m64 __m, int __count)
return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
}
/// \brief Performs a bitwise AND of two 64-bit integer vectors.
/// Performs a bitwise AND of two 64-bit integer vectors.
///
/// \headerfile <x86intrin.h>
///
@ -1096,7 +1096,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2)
return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
}
/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then
/// Performs a bitwise NOT of the first 64-bit integer vector, and then
/// performs a bitwise AND of the intermediate result and the second 64-bit
/// integer vector.
///
@ -1117,7 +1117,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2)
return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
}
/// \brief Performs a bitwise OR of two 64-bit integer vectors.
/// Performs a bitwise OR of two 64-bit integer vectors.
///
/// \headerfile <x86intrin.h>
///
@ -1135,7 +1135,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2)
return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
}
/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors.
/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
///
/// \headerfile <x86intrin.h>
///
@ -1153,7 +1153,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
}
/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
/// [8 x i8] to determine if the element of the first vector is equal to the
/// corresponding element of the second vector.
///
@ -1175,7 +1175,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
/// [4 x i16] to determine if the element of the first vector is equal to the
/// corresponding element of the second vector.
///
@ -1197,7 +1197,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
/// Compares the 32-bit integer elements of two 64-bit integer vectors of
/// [2 x i32] to determine if the element of the first vector is equal to the
/// corresponding element of the second vector.
///
@ -1219,7 +1219,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
}
/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
/// Compares the 8-bit integer elements of two 64-bit integer vectors of
/// [8 x i8] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector.
///
@ -1241,7 +1241,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
}
/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
/// Compares the 16-bit integer elements of two 64-bit integer vectors of
/// [4 x i16] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector.
///
@ -1263,7 +1263,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
}
/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
/// Compares the 32-bit integer elements of two 64-bit integer vectors of
/// [2 x i32] to determine if the element of the first vector is greater than
/// the corresponding element of the second vector.
///
@ -1285,20 +1285,20 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
}
/// \brief Constructs a 64-bit integer vector initialized to zero.
/// Constructs a 64-bit integer vector initialized to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
/// This intrinsic corresponds to the <c> PXOR </c> instruction.
///
/// \returns An initialized 64-bit integer vector with all elements set to zero.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_setzero_si64(void)
{
return (__m64){ 0LL };
return __extension__ (__m64){ 0LL };
}
/// \brief Constructs a 64-bit integer vector initialized with the specified
/// Constructs a 64-bit integer vector initialized with the specified
/// 32-bit integer values.
///
/// \headerfile <x86intrin.h>
@ -1319,7 +1319,7 @@ _mm_set_pi32(int __i1, int __i0)
return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
}
/// \brief Constructs a 64-bit integer vector initialized with the specified
/// Constructs a 64-bit integer vector initialized with the specified
/// 16-bit integer values.
///
/// \headerfile <x86intrin.h>
@ -1342,7 +1342,7 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
}
/// \brief Constructs a 64-bit integer vector initialized with the specified
/// Constructs a 64-bit integer vector initialized with the specified
/// 8-bit integer values.
///
/// \headerfile <x86intrin.h>
@ -1375,13 +1375,14 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
__b4, __b5, __b6, __b7);
}
/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the
/// Constructs a 64-bit integer vector of [2 x i32], with each of the
/// 32-bit integer vector elements set to the specified 32-bit integer
/// value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __i
/// A 32-bit integer value used to initialize each vector element of the
@ -1393,13 +1394,14 @@ _mm_set1_pi32(int __i)
return _mm_set_pi32(__i, __i);
}
/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the
/// Constructs a 64-bit integer vector of [4 x i16], with each of the
/// 16-bit integer vector elements set to the specified 16-bit integer
/// value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __w
/// A 16-bit integer value used to initialize each vector element of the
@ -1411,13 +1413,13 @@ _mm_set1_pi16(short __w)
return _mm_set_pi16(__w, __w, __w, __w);
}
/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the
/// Constructs a 64-bit integer vector of [8 x i8], with each of the
/// 8-bit integer vector elements set to the specified 8-bit integer value.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW +
/// PSHUFLW </c> instruction.
/// This intrinsic is a utility function and does not correspond to a specific
/// instruction.
///
/// \param __b
/// An 8-bit integer value used to initialize each vector element of the
@ -1429,7 +1431,7 @@ _mm_set1_pi8(char __b)
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
}
/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
/// Constructs a 64-bit integer vector, initialized in reverse order with
/// the specified 32-bit integer values.
///
/// \headerfile <x86intrin.h>
@ -1450,7 +1452,7 @@ _mm_setr_pi32(int __i0, int __i1)
return _mm_set_pi32(__i1, __i0);
}
/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
/// Constructs a 64-bit integer vector, initialized in reverse order with
/// the specified 16-bit integer values.
///
/// \headerfile <x86intrin.h>
@ -1473,7 +1475,7 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
return _mm_set_pi16(__w3, __w2, __w1, __w0);
}
/// \brief Constructs a 64-bit integer vector, initialized in reverse order with
/// Constructs a 64-bit integer vector, initialized in reverse order with
/// the specified 8-bit integer values.
///
/// \headerfile <x86intrin.h>

View File

@ -38,6 +38,7 @@ module _Builtin_intrinsics [system] [extern_c] {
explicit module neon {
requires neon
header "arm_neon.h"
header "arm_fp16.h"
export *
}
}
@ -62,6 +63,17 @@ module _Builtin_intrinsics [system] [extern_c] {
textual header "fma4intrin.h"
textual header "mwaitxintrin.h"
textual header "clzerointrin.h"
textual header "wbnoinvdintrin.h"
textual header "cldemoteintrin.h"
textual header "waitpkgintrin.h"
textual header "movdirintrin.h"
textual header "pconfigintrin.h"
textual header "sgxintrin.h"
textual header "ptwriteintrin.h"
textual header "invpcidintrin.h"
textual header "__wmmintrin_aes.h"
textual header "__wmmintrin_pclmul.h"
explicit module mm_malloc {
requires !freestanding
@ -128,14 +140,6 @@ module _Builtin_intrinsics [system] [extern_c] {
export aes
export pclmul
}
explicit module aes {
header "__wmmintrin_aes.h"
}
explicit module pclmul {
header "__wmmintrin_pclmul.h"
}
}
explicit module systemz {

63
c_headers/movdirintrin.h Normal file
View File

@ -0,0 +1,63 @@
/*===------------------------- movdirintrin.h ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _MOVDIRINTRIN_H
#define _MOVDIRINTRIN_H
/* Move doubleword as direct store */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
_directstoreu_u32 (void *__dst, unsigned int __value)
{
__builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
}
#ifdef __x86_64__
/* Move quadword as direct store */
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
_directstoreu_u64 (void *__dst, unsigned long __value)
{
__builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
}
#endif /* __x86_64__ */
/*
* movdir64b - Move 64 bytes as direct store.
* The destination must be 64 byte aligned, and the store is atomic.
* The source address has no alignment requirement, and the load from
* the source address is not atomic.
*/
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("movdir64b")))
_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
{
__builtin_ia32_movdir64b(__dst, __src);
}
#endif /* _MOVDIRINTRIN_H */

View File

@ -25,8 +25,8 @@
#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _MWAITXINTRIN_H
#define _MWAITXINTRIN_H
#ifndef __MWAITXINTRIN_H
#define __MWAITXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
@ -44,4 +44,4 @@ _mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
#undef __DEFAULT_FN_ATTRS
#endif /* _MWAITXINTRIN_H */
#endif /* __MWAITXINTRIN_H */

View File

@ -21,10 +21,10 @@
*===-----------------------------------------------------------------------===
*/
#ifndef _NMMINTRIN_H
#define _NMMINTRIN_H
#ifndef __NMMINTRIN_H
#define __NMMINTRIN_H
/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
just include it now then. */
#include <smmintrin.h>
#endif /* _NMMINTRIN_H */
#endif /* __NMMINTRIN_H */

View File

@ -11540,7 +11540,7 @@ half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c);
*
* vstoren write sizeof (gentypen) bytes given by data to address (p + (offset * n)).
*
* The address computed as (p + (offset * n)) must be
* The address computed as (p + (offset * n)) must be
* 8-bit aligned if gentype is char, uchar;
* 16-bit aligned if gentype is short, ushort, half;
* 32-bit aligned if gentype is int, uint, float;
@ -12862,7 +12862,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
* Read memory barrier that orders only
* loads.
* The flags argument specifies the memory
* address space and can be set to to a
* address space and can be set to a
* combination of the following literal
* values:
* CLK_LOCAL_MEM_FENCE
@ -12874,7 +12874,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
* Write memory barrier that orders only
* stores.
* The flags argument specifies the memory
* address space and can be set to to a
* address space and can be set to a
* combination of the following literal
* values:
* CLK_LOCAL_MEM_FENCE
@ -12888,7 +12888,7 @@ void __ovld write_mem_fence(cl_mem_fence_flags flags);
cl_mem_fence_flags __ovld get_fence(const void *ptr);
cl_mem_fence_flags __ovld get_fence(void *ptr);
/**
/**
* Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions
* and checked in Sema since they should be declared as
* addr gentype* to_addr (gentype*);
@ -13773,7 +13773,7 @@ ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long opera
// add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t.
// or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t.
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand);
uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order);
uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope);
@ -14571,7 +14571,7 @@ int printf(__constant const char* st, ...);
* only. The filter_mode specified in sampler
* must be set to CLK_FILTER_NEAREST; otherwise
* the values returned are undefined.
* The read_image{f|i|ui} calls that take
* integer coordinates must use a sampler with
* normalized coordinates set to
@ -15421,8 +15421,8 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept
#define CLK_DEPTH_STENCIL 0x10BE
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_sRGB 0x10BF
#define CLK_sRGBA 0x10C1
#define CLK_sRGBx 0x10C0
#define CLK_sRGBA 0x10C1
#define CLK_sBGRA 0x10C2
#define CLK_ABGR 0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0

50
c_headers/pconfigintrin.h Normal file
View File

@ -0,0 +1,50 @@
/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __PCONFIGINTRIN_H
#define __PCONFIGINTRIN_H
#define __PCONFIG_KEY_PROGRAM 0x00000001
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("pconfig")))
static __inline unsigned int __DEFAULT_FN_ATTRS
_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("pconfig"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -1,4 +1,4 @@
/*===------------- pkuintrin.h - PKU intrinsics ------------------===
/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
@ -40,7 +40,7 @@ _rdpkru_u32(void)
static __inline__ void __DEFAULT_FN_ATTRS
_wrpkru(unsigned int __val)
{
return __builtin_ia32_wrpkru(__val);
__builtin_ia32_wrpkru(__val);
}
#undef __DEFAULT_FN_ATTRS

View File

@ -28,9 +28,9 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse3")))
__attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
/// \brief Loads data from an unaligned memory location to elements in a 128-bit
/// Loads data from an unaligned memory location to elements in a 128-bit
/// vector.
///
/// If the address of the data is not 16-byte aligned, the instruction may
@ -50,7 +50,7 @@ _mm_lddqu_si128(__m128i const *__p)
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}
/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
/// Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
@ -69,7 +69,7 @@ _mm_addsub_ps(__m128 __a, __m128 __b)
return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in two
/// Horizontally adds the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
@ -92,7 +92,7 @@ _mm_hadd_ps(__m128 __a, __m128 __b)
return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in two
/// Horizontally subtracts the adjacent pairs of values contained in two
/// 128-bit vectors of [4 x float].
///
/// \headerfile <x86intrin.h>
@ -115,7 +115,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
}
/// \brief Moves and duplicates odd-indexed values from a 128-bit vector
/// Moves and duplicates odd-indexed values from a 128-bit vector
/// of [4 x float] to float values stored in a 128-bit vector of
/// [4 x float].
///
@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a)
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
}
/// \brief Duplicates even-indexed values from a 128-bit vector of
/// Duplicates even-indexed values from a 128-bit vector of
/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
@ -158,7 +158,7 @@ _mm_moveldup_ps(__m128 __a)
return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
}
/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
/// Adds the even-indexed values and subtracts the odd-indexed values of
/// two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
@ -177,7 +177,7 @@ _mm_addsub_pd(__m128d __a, __m128d __b)
return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
}
/// \brief Horizontally adds the pairs of values contained in two 128-bit
/// Horizontally adds the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
@ -200,7 +200,7 @@ _mm_hadd_pd(__m128d __a, __m128d __b)
return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
}
/// \brief Horizontally subtracts the pairs of values contained in two 128-bit
/// Horizontally subtracts the pairs of values contained in two 128-bit
/// vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
@ -223,13 +223,13 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
}
/// \brief Moves and duplicates one double-precision value to double-precision
/// Moves and duplicates one double-precision value to double-precision
/// values stored in a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_loaddup_pd(double const * dp);
/// __m128d _mm_loaddup_pd(double const *dp);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
@ -240,7 +240,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b)
/// duplicated values.
#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
/// \brief Moves and duplicates the double-precision value in the lower bits of
/// Moves and duplicates the double-precision value in the lower bits of
/// a 128-bit vector of [2 x double] to double-precision values stored in a
/// 128-bit vector of [2 x double].
///
@ -259,7 +259,7 @@ _mm_movedup_pd(__m128d __a)
return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
}
/// \brief Establishes a linear address memory range to be monitored and puts
/// Establishes a linear address memory range to be monitored and puts
/// the processor in the monitor event pending state. Data stored in the
/// monitored address range causes the processor to exit the pending state.
///
@ -280,7 +280,7 @@ _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
__builtin_ia32_monitor((void *)__p, __extensions, __hints);
}
/// \brief Used with the MONITOR instruction to wait while the processor is in
/// Used with the MONITOR instruction to wait while the processor is in
/// the monitor event pending state. Data stored in the monitored address
/// range causes the processor to exit the pending state.
///

View File

@ -21,13 +21,13 @@
*===-----------------------------------------------------------------------===
*/
#ifndef _POPCNTINTRIN_H
#define _POPCNTINTRIN_H
#ifndef __POPCNTINTRIN_H
#define __POPCNTINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
/// \brief Counts the number of bits in the source operand having a value of 1.
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
@ -43,7 +43,7 @@ _mm_popcnt_u32(unsigned int __A)
return __builtin_popcount(__A);
}
/// \brief Counts the number of bits in the source operand having a value of 1.
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
@ -60,7 +60,7 @@ _popcnt32(int __A)
}
#ifdef __x86_64__
/// \brief Counts the number of bits in the source operand having a value of 1.
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
@ -76,7 +76,7 @@ _mm_popcnt_u64(unsigned long long __A)
return __builtin_popcountll(__A);
}
/// \brief Counts the number of bits in the source operand having a value of 1.
/// Counts the number of bits in the source operand having a value of 1.
///
/// \headerfile <x86intrin.h>
///
@ -95,4 +95,4 @@ _popcnt64(long long __A)
#undef __DEFAULT_FN_ATTRS
#endif /* _POPCNTINTRIN_H */
#endif /* __POPCNTINTRIN_H */

View File

@ -28,8 +28,7 @@
#ifndef __PRFCHWINTRIN_H
#define __PRFCHWINTRIN_H
#if defined(__PRFCHW__) || defined(__3dNOW__)
/// \brief Loads a memory sequence containing the specified memory address into
/// Loads a memory sequence containing the specified memory address into
/// all data cache levels. The cache-coherency state is set to exclusive.
/// Data can be read from and written to the cache line without additional
/// delay.
@ -46,7 +45,7 @@ _m_prefetch(void *__P)
__builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
}
/// \brief Loads a memory sequence containing the specified memory address into
/// Loads a memory sequence containing the specified memory address into
/// the L1 data cache and sets the cache-coherency to modified. This
/// provides a hint to the processor that the cache line will be modified.
/// It is intended for use when the cache line will be written to shortly
@ -66,6 +65,5 @@ _m_prefetchw(void *__P)
{
__builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
}
#endif
#endif /* __PRFCHWINTRIN_H */

51
c_headers/ptwriteintrin.h Normal file
View File

@ -0,0 +1,51 @@
/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __PTWRITEINTRIN_H
#define __PTWRITEINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("ptwrite")))
static __inline__ void __DEFAULT_FN_ATTRS
_ptwrite32(unsigned int __value) {
__builtin_ia32_ptwrite32(__value);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_ptwrite64(unsigned long long __value) {
__builtin_ia32_ptwrite64(__value);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __PTWRITEINTRIN_H */

View File

@ -21,7 +21,7 @@
*===-----------------------------------------------------------------------===
*/
#ifndef __X86INTRIN_H
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
#endif

70
c_headers/sgxintrin.h Normal file
View File

@ -0,0 +1,70 @@
/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __SGXINTRIN_H
#define __SGXINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sgx")))
static __inline unsigned int __DEFAULT_FN_ATTRS
_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("enclu"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
static __inline unsigned int __DEFAULT_FN_ATTRS
_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("encls"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
static __inline unsigned int __DEFAULT_FN_ATTRS
_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
{
unsigned int __result;
__asm__ ("enclv"
: "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
: "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
: "cc");
return __result;
}
#undef __DEFAULT_FN_ATTRS
#endif

View File

@ -29,10 +29,10 @@
#define __SHAINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))
#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); })
#define _mm_sha1rnds4_epu32(V1, V2, M) \
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)

View File

@ -21,13 +21,13 @@
*===-----------------------------------------------------------------------===
*/
#ifndef _SMMINTRIN_H
#define _SMMINTRIN_H
#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
/* SSE4 Rounding macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
@ -46,7 +46,7 @@
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
/// Rounds up each element of the 128-bit vector of [4 x float] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [4 x float].
///
@ -63,7 +63,7 @@
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
/// Rounds up each element of the 128-bit vector of [2 x double] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [2 x double].
///
@ -80,7 +80,7 @@
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
/// \brief Copies three upper elements of the first 128-bit vector operand to
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
/// operand to an integer and copies it to the lowest element of the 128-bit
@ -105,7 +105,7 @@
/// values.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
/// \brief Copies the upper element of the first 128-bit vector operand to the
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
/// Rounds up the lower element of the second 128-bit vector operand to an
/// integer and copies it to the lower element of the 128-bit result vector
@ -130,7 +130,7 @@
/// values.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
/// Rounds down each element of the 128-bit vector of [4 x float] to an
/// an integer and returns the rounded values in a 128-bit vector of
/// [4 x float].
///
@ -147,7 +147,7 @@
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
/// Rounds down each element of the 128-bit vector of [2 x double] to an
/// integer and returns the rounded values in a 128-bit vector of
/// [2 x double].
///
@ -164,7 +164,7 @@
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
/// \brief Copies three upper elements of the first 128-bit vector operand to
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
/// operand to an integer and copies it to the lowest element of the 128-bit
@ -189,7 +189,7 @@
/// values.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
/// \brief Copies the upper element of the first 128-bit vector operand to the
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
/// Rounds down the lower element of the second 128-bit vector operand to an
/// integer and copies it to the lower element of the 128-bit result vector
@ -214,7 +214,7 @@
/// values.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
/// Rounds each element of the 128-bit vector of [4 x float] to an
/// integer value according to the rounding control specified by the second
/// argument and returns the rounded values in a 128-bit vector of
/// [4 x float].
@ -244,10 +244,10 @@
/// 10: Upward (toward positive infinity) \n
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) __extension__ ({ \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
#define _mm_round_ps(X, M) \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
/// \brief Copies three upper elements of the first 128-bit vector operand to
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
/// [4 x float]. Rounds the lowest element of the second 128-bit vector
/// operand to an integer value according to the rounding control specified
@ -285,11 +285,11 @@
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_round_ss(X, Y, M) __extension__ ({ \
#define _mm_round_ss(X, Y, M) \
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); })
(__v4sf)(__m128)(Y), (M))
/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
/// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second
/// argument and returns the rounded values in a 128-bit vector of
/// [2 x double].
@ -319,10 +319,10 @@
/// 10: Upward (toward positive infinity) \n
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) __extension__ ({ \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
#define _mm_round_pd(X, M) \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
/// \brief Copies the upper element of the first 128-bit vector operand to the
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
/// Rounds the lower element of the second 128-bit vector operand to an
/// integer value according to the rounding control specified by the third
@ -360,12 +360,12 @@
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_round_sd(X, Y, M) __extension__ ({ \
#define _mm_round_sd(X, Y, M) \
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); })
(__v2df)(__m128d)(Y), (M))
/* SSE4 Packed Blending Intrinsics. */
/// \brief Returns a 128-bit vector of [2 x double] where the values are
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
/// third operand, the control mask.
///
@ -389,13 +389,11 @@
/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
(__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), \
(((M) & 0x01) ? 2 : 0), \
(((M) & 0x02) ? 3 : 1)); })
#define _mm_blend_pd(V1, V2, M) \
(__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
/// operand, the control mask.
///
@ -419,14 +417,11 @@
/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
(__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
(((M) & 0x01) ? 4 : 0), \
(((M) & 0x02) ? 5 : 1), \
(((M) & 0x04) ? 6 : 2), \
(((M) & 0x08) ? 7 : 3)); })
#define _mm_blend_ps(V1, V2, M) \
(__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
/// \brief Returns a 128-bit vector of [2 x double] where the values are
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
/// third operand, the control mask.
///
@ -453,7 +448,7 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
(__v2df)__M);
}
/// \brief Returns a 128-bit vector of [4 x float] where the values are
/// Returns a 128-bit vector of [4 x float] where the values are
/// selected from either the first or second operand as specified by the
/// third operand, the control mask.
///
@ -480,7 +475,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
(__v4sf)__M);
}
/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
/// Returns a 128-bit vector of [16 x i8] where the values are selected
/// from either of the first or second operand as specified by the third
/// operand, the control mask.
///
@ -493,7 +488,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
/// \param __V2
/// A 128-bit vector of [16 x i8].
/// \param __M
/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
/// how the values are to be copied. The position of the mask bit corresponds
/// to the most significant bit of a copied value. When a mask bit is 0, the
/// corresponding 8-bit element in operand \a __V1 is copied to the same
@ -507,7 +502,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
(__v16qi)__M);
}
/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
/// Returns a 128-bit vector of [8 x i16] where the values are selected
/// from either of the first or second operand as specified by the third
/// operand, the control mask.
///
@ -531,20 +526,12 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
(__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), \
(((M) & 0x01) ? 8 : 0), \
(((M) & 0x02) ? 9 : 1), \
(((M) & 0x04) ? 10 : 2), \
(((M) & 0x08) ? 11 : 3), \
(((M) & 0x10) ? 12 : 4), \
(((M) & 0x20) ? 13 : 5), \
(((M) & 0x40) ? 14 : 6), \
(((M) & 0x80) ? 15 : 7)); })
#define _mm_blend_epi16(V1, V2, M) \
(__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M))
/* SSE4 Dword Multiply Instructions. */
/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32]
/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
/// and returns the lower 32 bits of the each product in a 128-bit vector of
/// [4 x i32].
///
@ -563,7 +550,7 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
}
/// \brief Multiplies corresponding even-indexed elements of two 128-bit
/// Multiplies corresponding even-indexed elements of two 128-bit
/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
/// containing the products.
///
@ -584,7 +571,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
}
/* SSE4 Floating Point Dot Product Instructions. */
/// \brief Computes the dot product of the two 128-bit vectors of [4 x float]
/// Computes the dot product of the two 128-bit vectors of [4 x float]
/// and returns it in the elements of the 128-bit result vector of
/// [4 x float].
///
@ -616,11 +603,11 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// each [4 x float] subvector. If a bit is set, the dot product is returned
/// in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) __extension__ ({ \
#define _mm_dp_ps(X, Y, M) \
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)); })
(__v4sf)(__m128)(Y), (M))
/// \brief Computes the dot product of the two 128-bit vectors of [2 x double]
/// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of
/// [2 x double].
///
@ -651,12 +638,12 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// to the lowest element and bit [1] corresponding to the highest element of
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) __extension__ ({\
#define _mm_dp_pd(X, Y, M) \
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)); })
(__v2df)(__m128d)(Y), (M))
/* SSE4 Streaming Load Hint Instruction. */
/// \brief Loads integer values from a 128-bit aligned memory location to a
/// Loads integer values from a 128-bit aligned memory location to a
/// 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
@ -675,7 +662,7 @@ _mm_stream_load_si128 (__m128i const *__V)
}
/* SSE4 Packed Integer Min/Max Instructions. */
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
/// of the two values.
///
@ -694,7 +681,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
/// greater value of the two.
///
@ -713,7 +700,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
/// value of the two.
///
@ -732,7 +719,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
/// greater value of the two.
///
@ -751,7 +738,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
/// value of the two.
///
@ -770,7 +757,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
/// greater value of the two.
///
@ -789,7 +776,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
/// value of the two.
///
@ -808,7 +795,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
}
/// \brief Compares the corresponding elements of two 128-bit vectors of
/// Compares the corresponding elements of two 128-bit vectors of
/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
/// greater value of the two.
///
@ -828,7 +815,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
}
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
/// \brief Takes the first argument \a X and inserts an element from the second
/// Takes the first argument \a X and inserts an element from the second
/// argument \a Y as selected by the third argument \a N. That result then
/// has elements zeroed out also as selected by the third argument \a N. The
/// resulting 128-bit vector of [4 x float] is then returned.
@ -870,7 +857,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// single-precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
/// returns it, using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
@ -893,15 +880,14 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
__v4sf __a = (__v4sf)(__m128)(X); \
__t.__f = __a[(N) & 3]; \
__t.__i;}))
({ union { int __i; float __f; } __t; \
__t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
__t.__i;}))
/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
(D) = __a[N]; }))
#define _MM_EXTRACT_FLOAT(D, X, N) \
{ (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
@ -912,7 +898,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
/* Insert int into packed integer array at index. */
/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
/// of an integer parameter \a I into an offset specified by the immediate
/// value parameter \a N.
@ -952,12 +938,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1110: Bits [119:112] of the result are used for insertion. \n
/// 1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) (__extension__ \
({ __v16qi __a = (__v16qi)(__m128i)(X); \
__a[(N) & 15] = (I); \
(__m128i)__a;}))
#define _mm_insert_epi8(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N))
/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the 32-bit
/// integer parameter \a I at the offset specified by the immediate value
/// parameter \a N.
@ -985,13 +970,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 10: Bits [95:64] of the result are used for insertion. \n
/// 11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) (__extension__ \
({ __v4si __a = (__v4si)(__m128i)(X); \
__a[(N) & 3] = (I); \
(__m128i)__a;}))
#define _mm_insert_epi32(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N))
#ifdef __x86_64__
/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the 64-bit
/// integer parameter \a I, using the immediate value parameter \a N as an
/// insertion location selector.
@ -1017,16 +1001,15 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 0: Bits [63:0] of the result are used for insertion. \n
/// 1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) (__extension__ \
({ __v2di __a = (__v2di)(__m128i)(X); \
__a[(N) & 1] = (I); \
(__m128i)__a;}))
#define _mm_insert_epi64(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
* as a zero extended value, so it is unsigned.
*/
/// \brief Extracts an 8-bit element from the 128-bit integer vector of
/// Extracts an 8-bit element from the 128-bit integer vector of
/// [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
@ -1061,11 +1044,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \returns An unsigned integer, whose lower 8 bits are selected from the
/// 128-bit integer vector parameter and the remaining bits are assigned
/// zeros.
#define _mm_extract_epi8(X, N) (__extension__ \
({ __v16qi __a = (__v16qi)(__m128i)(X); \
(int)(unsigned char) __a[(N) & 15];}))
#define _mm_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N))
/// \brief Extracts a 32-bit element from the 128-bit integer vector of
/// Extracts a 32-bit element from the 128-bit integer vector of
/// [4 x i32], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
@ -1087,12 +1070,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Bits [127:96] of the parameter \a X are exracted.
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) (__extension__ \
({ __v4si __a = (__v4si)(__m128i)(X); \
(int)__a[(N) & 3];}))
#define _mm_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
#ifdef __x86_64__
/// \brief Extracts a 64-bit element from the 128-bit integer vector of
/// Extracts a 64-bit element from the 128-bit integer vector of
/// [2 x i64], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
@ -1111,13 +1093,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 0: Bits [63:0] are returned. \n
/// 1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) (__extension__ \
({ __v2di __a = (__v2di)(__m128i)(X); \
(long long)__a[(N) & 1];}))
#define _mm_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
#endif /* __x86_64 */
/* SSE4 128-bit Packed Integer Comparisons. */
/// \brief Tests whether the specified bits in a 128-bit integer vector are all
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// \headerfile <x86intrin.h>
@ -1135,7 +1116,7 @@ _mm_testz_si128(__m128i __M, __m128i __V)
return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}
/// \brief Tests whether the specified bits in a 128-bit integer vector are all
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// \headerfile <x86intrin.h>
@ -1153,7 +1134,7 @@ _mm_testc_si128(__m128i __M, __m128i __V)
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}
/// \brief Tests whether the specified bits in a 128-bit integer vector are
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
@ -1172,7 +1153,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}
/// \brief Tests whether the specified bits in a 128-bit integer vector are all
/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// \headerfile <x86intrin.h>
@ -1189,7 +1170,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
/// otherwise.
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
/// \brief Tests whether the specified bits in a 128-bit integer vector are
/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// \headerfile <x86intrin.h>
@ -1208,7 +1189,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
/// FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
/// \brief Tests whether the specified bits in a 128-bit integer vector are all
/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// \headerfile <x86intrin.h>
@ -1227,7 +1208,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
/* SSE4 64-bit Packed Integer Comparisons. */
/// \brief Compares each of the corresponding 64-bit values of the 128-bit
/// Compares each of the corresponding 64-bit values of the 128-bit
/// integer vectors for equality.
///
/// \headerfile <x86intrin.h>
@ -1246,7 +1227,7 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
}
/* SSE4 Packed Integer Sign-Extension. */
/// \brief Sign-extends each of the lower eight 8-bit integer elements of a
/// Sign-extends each of the lower eight 8-bit integer elements of a
/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
/// are unused.
@ -1267,7 +1248,7 @@ _mm_cvtepi8_epi16(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
/// \brief Sign-extends each of the lower four 8-bit integer elements of a
/// Sign-extends each of the lower four 8-bit integer elements of a
/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
/// vector are unused.
@ -1277,8 +1258,8 @@ _mm_cvtepi8_epi16(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
///
/// \param __V
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
/// extended to 32-bit values.
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
/// sign-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi32(__m128i __V)
@ -1288,7 +1269,7 @@ _mm_cvtepi8_epi32(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
}
/// \brief Sign-extends each of the lower two 8-bit integer elements of a
/// Sign-extends each of the lower two 8-bit integer elements of a
/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
/// vector are unused.
@ -1298,8 +1279,8 @@ _mm_cvtepi8_epi32(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
///
/// \param __V
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
/// extended to 64-bit values.
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
/// sign-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi8_epi64(__m128i __V)
@ -1309,7 +1290,7 @@ _mm_cvtepi8_epi64(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
}
/// \brief Sign-extends each of the lower four 16-bit integer elements of a
/// Sign-extends each of the lower four 16-bit integer elements of a
/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
/// a 128-bit vector of [4 x i32]. The upper four elements of the input
/// vector are unused.
@ -1319,8 +1300,8 @@ _mm_cvtepi8_epi64(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
///
/// \param __V
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
/// extended to 32-bit values.
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
/// sign-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi32(__m128i __V)
@ -1328,7 +1309,7 @@ _mm_cvtepi16_epi32(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
}
/// \brief Sign-extends each of the lower two 16-bit integer elements of a
/// Sign-extends each of the lower two 16-bit integer elements of a
/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
/// a 128-bit vector of [2 x i64]. The upper six elements of the input
/// vector are unused.
@ -1338,8 +1319,8 @@ _mm_cvtepi16_epi32(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
///
/// \param __V
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
/// extended to 64-bit values.
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
/// sign-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi16_epi64(__m128i __V)
@ -1347,7 +1328,7 @@ _mm_cvtepi16_epi64(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
}
/// \brief Sign-extends each of the lower two 32-bit integer elements of a
/// Sign-extends each of the lower two 32-bit integer elements of a
/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
/// are unused.
@ -1357,8 +1338,8 @@ _mm_cvtepi16_epi64(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
///
/// \param __V
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
/// extended to 64-bit values.
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
/// sign-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepi32_epi64(__m128i __V)
@ -1367,7 +1348,7 @@ _mm_cvtepi32_epi64(__m128i __V)
}
/* SSE4 Packed Integer Zero-Extension. */
/// \brief Zero-extends each of the lower eight 8-bit integer elements of a
/// Zero-extends each of the lower eight 8-bit integer elements of a
/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
/// are unused.
@ -1377,8 +1358,8 @@ _mm_cvtepi32_epi64(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
///
/// \param __V
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
/// extended to 16-bit values.
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
/// zero-extended to 16-bit values.
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi16(__m128i __V)
@ -1386,7 +1367,7 @@ _mm_cvtepu8_epi16(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
}
/// \brief Zero-extends each of the lower four 8-bit integer elements of a
/// Zero-extends each of the lower four 8-bit integer elements of a
/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
/// vector are unused.
@ -1396,8 +1377,8 @@ _mm_cvtepu8_epi16(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
///
/// \param __V
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
/// extended to 32-bit values.
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
/// zero-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi32(__m128i __V)
@ -1405,7 +1386,7 @@ _mm_cvtepu8_epi32(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
}
/// \brief Zero-extends each of the lower two 8-bit integer elements of a
/// Zero-extends each of the lower two 8-bit integer elements of a
/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
/// vector are unused.
@ -1415,8 +1396,8 @@ _mm_cvtepu8_epi32(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
///
/// \param __V
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
/// extended to 64-bit values.
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
/// zero-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu8_epi64(__m128i __V)
@ -1424,7 +1405,7 @@ _mm_cvtepu8_epi64(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
}
/// \brief Zero-extends each of the lower four 16-bit integer elements of a
/// Zero-extends each of the lower four 16-bit integer elements of a
/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
/// a 128-bit vector of [4 x i32]. The upper four elements of the input
/// vector are unused.
@ -1434,8 +1415,8 @@ _mm_cvtepu8_epi64(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
///
/// \param __V
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
/// extended to 32-bit values.
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
/// zero-extended to 32-bit values.
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi32(__m128i __V)
@ -1443,7 +1424,7 @@ _mm_cvtepu16_epi32(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
}
/// \brief Zero-extends each of the lower two 16-bit integer elements of a
/// Zero-extends each of the lower two 16-bit integer elements of a
/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
/// are unused.
@ -1453,8 +1434,8 @@ _mm_cvtepu16_epi32(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
///
/// \param __V
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
/// extended to 64-bit values.
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
/// zero-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu16_epi64(__m128i __V)
@ -1462,7 +1443,7 @@ _mm_cvtepu16_epi64(__m128i __V)
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
}
/// \brief Zero-extends each of the lower two 32-bit integer elements of a
/// Zero-extends each of the lower two 32-bit integer elements of a
/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
/// are unused.
@ -1472,8 +1453,8 @@ _mm_cvtepu16_epi64(__m128i __V)
/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
///
/// \param __V
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
/// extended to 64-bit values.
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
/// zero-extended to 64-bit values.
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtepu32_epi64(__m128i __V)
@ -1482,7 +1463,7 @@ _mm_cvtepu32_epi64(__m128i __V)
}
/* SSE4 Pack with Unsigned Saturation. */
/// \brief Converts 32-bit signed integers from both 128-bit integer vector
/// Converts 32-bit signed integers from both 128-bit integer vector
/// operands into 16-bit unsigned integers, and returns the packed result.
/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
/// 0x0000 are saturated to 0x0000.
@ -1511,7 +1492,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
}
/* SSE4 Multiple Packed Sums of Absolute Difference. */
/// \brief Subtracts 8-bit unsigned integer values and computes the absolute
/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences to the corresponding bits in the destination.
/// Then sums of the absolute differences are returned according to the bit
/// fields in the immediate operand.
@ -1534,23 +1515,23 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \code
/// // M2 represents bit 2 of the immediate operand
/// // M10 represents bits [1:0] of the immediate operand
/// i = M2 * 4
/// j = M10 * 4
/// i = M2 * 4;
/// j = M10 * 4;
/// for (k = 0; k < 8; k = k + 1) {
/// d0 = abs(X[i + k + 0] - Y[j + 0])
/// d1 = abs(X[i + k + 1] - Y[j + 1])
/// d2 = abs(X[i + k + 2] - Y[j + 2])
/// d3 = abs(X[i + k + 3] - Y[j + 3])
/// r[k] = d0 + d1 + d2 + d3
/// d0 = abs(X[i + k + 0] - Y[j + 0]);
/// d1 = abs(X[i + k + 1] - Y[j + 1]);
/// d2 = abs(X[i + k + 2] - Y[j + 2]);
/// d3 = abs(X[i + k + 3] - Y[j + 3]);
/// r[k] = d0 + d1 + d2 + d3;
/// }
/// \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
/// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
#define _mm_mpsadbw_epu8(X, Y, M) \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)); })
(__v16qi)(__m128i)(Y), (M))
/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it and along with its index.
///
/// \headerfile <x86intrin.h>
@ -1604,7 +1585,7 @@ _mm_minpos_epu16(__m128i __V)
#define _SIDD_UNIT_MASK 0x40
/* SSE4.2 Packed Comparison Intrinsics. */
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns a 128-bit integer vector representing the result
/// mask of the comparison.
@ -1660,7 +1641,7 @@ _mm_minpos_epu16(__m128i __V)
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns an integer representing the result index of the
/// comparison.
@ -1714,7 +1695,7 @@ _mm_minpos_epu16(__m128i __V)
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns a 128-bit integer vector representing the result
/// mask of the comparison.
@ -1775,7 +1756,7 @@ _mm_minpos_epu16(__m128i __V)
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns an integer representing the result index of the
/// comparison.
@ -1835,7 +1816,7 @@ _mm_minpos_epu16(__m128i __V)
(int)(M))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
/// string in \a B is the maximum, otherwise, returns 0.
@ -1885,7 +1866,7 @@ _mm_minpos_epu16(__m128i __V)
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
/// 0.
@ -1934,7 +1915,7 @@ _mm_minpos_epu16(__m128i __V)
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns bit 0 of the resulting bit mask.
///
@ -1982,7 +1963,7 @@ _mm_minpos_epu16(__m128i __V)
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
/// the maximum, otherwise, returns 0.
@ -2032,7 +2013,7 @@ _mm_minpos_epu16(__m128i __V)
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
/// the maximum, otherwise, returns 0.
@ -2082,7 +2063,7 @@ _mm_minpos_epu16(__m128i __V)
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
/// string in \a B is the maximum, otherwise, returns 0.
@ -2137,7 +2118,7 @@ _mm_minpos_epu16(__m128i __V)
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
/// returns 0.
@ -2191,7 +2172,7 @@ _mm_minpos_epu16(__m128i __V)
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns bit 0 of the resulting bit mask.
///
@ -2244,7 +2225,7 @@ _mm_minpos_epu16(__m128i __V)
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
/// the maximum, otherwise, returns 0.
@ -2299,7 +2280,7 @@ _mm_minpos_epu16(__m128i __V)
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
/// \brief Uses the immediate operand \a M to perform a comparison of string
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
/// the maximum, otherwise, returns 0.
@ -2354,7 +2335,7 @@ _mm_minpos_epu16(__m128i __V)
(int)(M))
/* SSE4.2 Compare Packed Data -- Greater Than. */
/// \brief Compares each of the corresponding 64-bit values of the 128-bit
/// Compares each of the corresponding 64-bit values of the 128-bit
/// integer vectors to determine if the values in the first operand are
/// greater than those in the second operand.
///
@ -2374,7 +2355,7 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
}
/* SSE4.2 Accumulate CRC32. */
/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
@ -2394,7 +2375,7 @@ _mm_crc32_u8(unsigned int __C, unsigned char __D)
return __builtin_ia32_crc32qi(__C, __D);
}
/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
@ -2414,7 +2395,7 @@ _mm_crc32_u16(unsigned int __C, unsigned short __D)
return __builtin_ia32_crc32hi(__C, __D);
}
/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
@ -2435,7 +2416,7 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D)
}
#ifdef __x86_64__
/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
@ -2458,8 +2439,6 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
#undef __DEFAULT_FN_ATTRS
#ifdef __POPCNT__
#include <popcntintrin.h>
#endif
#endif /* _SMMINTRIN_H */
#endif /* __SMMINTRIN_H */

View File

@ -88,7 +88,7 @@
*
* To accommodate targets that are missing types that are exactly 8, 16, 32, or
* 64 bits wide, this implementation takes an approach of cascading
* redefintions, redefining __int_leastN_t to successively smaller exact-width
* redefinitions, redefining __int_leastN_t to successively smaller exact-width
* types. It is therefore important that the types are defined in order of
* descending widths.
*
@ -461,7 +461,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
* As in the type definitions, this section takes an approach of
* successive-shrinking to determine which limits to use for the standard (8,
* 16, 32, 64) bit widths when they don't have exact representations. It is
* therefore important that the defintions be kept in order of decending
* therefore important that the definitions be kept in order of decending
* widths.
*
* Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the

View File

@ -27,9 +27,10 @@
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
/// \brief Computes the absolute value of each of the packed 8-bit signed
/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// results in the destination.
///
@ -41,13 +42,13 @@
/// A 64-bit vector of [8 x i8].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi8(__m64 __a)
{
return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
}
/// \brief Computes the absolute value of each of the packed 8-bit signed
/// Computes the absolute value of each of the packed 8-bit signed
/// integers in the source operand and stores the 8-bit unsigned integer
/// results in the destination.
///
@ -65,7 +66,7 @@ _mm_abs_epi8(__m128i __a)
return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
}
/// \brief Computes the absolute value of each of the packed 16-bit signed
/// Computes the absolute value of each of the packed 16-bit signed
/// integers in the source operand and stores the 16-bit unsigned integer
/// results in the destination.
///
@ -77,13 +78,13 @@ _mm_abs_epi8(__m128i __a)
/// A 64-bit vector of [4 x i16].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi16(__m64 __a)
{
return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
}
/// \brief Computes the absolute value of each of the packed 16-bit signed
/// Computes the absolute value of each of the packed 16-bit signed
/// integers in the source operand and stores the 16-bit unsigned integer
/// results in the destination.
///
@ -101,7 +102,7 @@ _mm_abs_epi16(__m128i __a)
return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
}
/// \brief Computes the absolute value of each of the packed 32-bit signed
/// Computes the absolute value of each of the packed 32-bit signed
/// integers in the source operand and stores the 32-bit unsigned integer
/// results in the destination.
///
@ -113,13 +114,13 @@ _mm_abs_epi16(__m128i __a)
/// A 64-bit vector of [2 x i32].
/// \returns A 64-bit integer vector containing the absolute values of the
/// elements in the operand.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi32(__m64 __a)
{
return (__m64)__builtin_ia32_pabsd((__v2si)__a);
}
/// \brief Computes the absolute value of each of the packed 32-bit signed
/// Computes the absolute value of each of the packed 32-bit signed
/// integers in the source operand and stores the 32-bit unsigned integer
/// results in the destination.
///
@ -137,7 +138,7 @@ _mm_abs_epi32(__m128i __a)
return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
}
/// \brief Concatenates the two 128-bit integer vector operands, and
/// Concatenates the two 128-bit integer vector operands, and
/// right-shifts the result by the number of bytes specified in the immediate
/// operand.
///
@ -157,11 +158,11 @@ _mm_abs_epi32(__m128i __a)
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) __extension__ ({ \
#define _mm_alignr_epi8(a, b, n) \
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)); })
(__v16qi)(__m128i)(b), (n))
/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
///
/// \headerfile <x86intrin.h>
@ -180,10 +181,10 @@ _mm_abs_epi32(__m128i __a)
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) __extension__ ({ \
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); })
#define _mm_alignr_pi8(a, b, n) \
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
///
/// \headerfile <x86intrin.h>
@ -206,7 +207,7 @@ _mm_hadd_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [4 x i32].
///
/// \headerfile <x86intrin.h>
@ -229,7 +230,7 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16].
///
/// \headerfile <x86intrin.h>
@ -246,13 +247,13 @@ _mm_hadd_epi32(__m128i __a, __m128i __b)
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
/// operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadd_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [2 x i32].
///
/// \headerfile <x86intrin.h>
@ -269,15 +270,16 @@ _mm_hadd_pi16(__m64 __a, __m64 __b)
/// destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
/// operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadd_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
/// 0x8000.
///
/// \headerfile <x86intrin.h>
///
@ -299,9 +301,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are
/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h.
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
/// 0x8000.
///
/// \headerfile <x86intrin.h>
///
@ -317,13 +320,13 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
/// destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// sums of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadds_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [8 x i16].
///
/// \headerfile <x86intrin.h>
@ -346,7 +349,7 @@ _mm_hsub_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [4 x i32].
///
/// \headerfile <x86intrin.h>
@ -369,7 +372,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16].
///
/// \headerfile <x86intrin.h>
@ -386,13 +389,13 @@ _mm_hsub_epi32(__m128i __a, __m128i __b)
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
/// of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsub_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [2 x i32].
///
/// \headerfile <x86intrin.h>
@ -409,16 +412,16 @@ _mm_hsub_pi16(__m64 __a, __m64 __b)
/// the destination.
/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
/// of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsub_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
/// saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
@ -440,10 +443,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Horizontally subtracts the adjacent pairs of values contained in 2
/// Horizontally subtracts the adjacent pairs of values contained in 2
/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are
/// saturated to 8000h.
/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
/// saturated to 0x8000.
///
/// \headerfile <x86intrin.h>
///
@ -459,13 +462,13 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
/// the destination.
/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
/// differences of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsubs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
@ -499,7 +502,7 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
}
/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer
/// Multiplies corresponding pairs of packed 8-bit unsigned integer
/// values contained in the first source operand and packed 8-bit signed
/// integer values contained in the second source operand, adds pairs of
/// contiguous products with signed saturation, and writes the 16-bit sums to
@ -523,13 +526,13 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_maddubs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
}
/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
///
@ -549,7 +552,7 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
/// products to the 18 most significant bits by right-shifting, rounds the
/// truncated value by adding 1, and writes bits [16:1] to the destination.
///
@ -563,13 +566,13 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b)
/// A 64-bit vector of [4 x i16] containing one of the source operands.
/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
/// products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mulhrs_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
}
/// \brief Copies the 8-bit integers from a 128-bit integer vector to the
/// Copies the 8-bit integers from a 128-bit integer vector to the
/// destination or clears 8-bit values in the destination, as specified by
/// the second source operand.
///
@ -595,7 +598,7 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
}
/// \brief Copies the 8-bit integers from a 64-bit integer vector to the
/// Copies the 8-bit integers from a 64-bit integer vector to the
/// destination or clears 8-bit values in the destination, as specified by
/// the second source operand.
///
@ -614,13 +617,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
/// destination. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
}
/// \brief For each 8-bit integer in the first source operand, perform one of
/// For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the byte in the second source is negative, calculate the two's
@ -646,7 +649,7 @@ _mm_sign_epi8(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
}
/// \brief For each 16-bit integer in the first source operand, perform one of
/// For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the word in the second source is negative, calculate the two's
@ -672,7 +675,7 @@ _mm_sign_epi16(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
}
/// \brief For each 32-bit integer in the first source operand, perform one of
/// For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the doubleword in the second source is negative, calculate the two's
@ -698,7 +701,7 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
}
/// \brief For each 8-bit integer in the first source operand, perform one of
/// For each 8-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the byte in the second source is negative, calculate the two's
@ -718,13 +721,13 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
/// A 64-bit integer vector containing control bytes corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi8(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
}
/// \brief For each 16-bit integer in the first source operand, perform one of
/// For each 16-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the word in the second source is negative, calculate the two's
@ -744,13 +747,13 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
/// A 64-bit integer vector containing control words corresponding to
/// positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi16(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
}
/// \brief For each 32-bit integer in the first source operand, perform one of
/// For each 32-bit integer in the first source operand, perform one of
/// the following actions as specified by the second source operand.
///
/// If the doubleword in the second source is negative, calculate the two's
@ -770,12 +773,13 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
/// A 64-bit integer vector containing two control doublewords corresponding
/// to positions in the destination.
/// \returns A 64-bit integer vector containing the resultant values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi32(__m64 __a, __m64 __b)
{
return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX
#endif /* __TMMINTRIN_H */

View File

@ -29,10 +29,10 @@
#define __VAESINTRIN_H
/* Default attributes for YMM forms. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))
/* Default attributes for ZMM forms. */
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes")))
#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512)))
static __inline__ __m256i __DEFAULT_FN_ATTRS

View File

@ -28,15 +28,15 @@
#ifndef __VPCLMULQDQINTRIN_H
#define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \
#define _mm256_clmulepi64_epi128(A, B, I) \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(char)(I)); })
(char)(I))
#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \
#define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I)); })
(char)(I))
#endif // __VPCLMULQDQINTRIN_H
#endif /* __VPCLMULQDQINTRIN_H */

56
c_headers/waitpkgintrin.h Normal file
View File

@ -0,0 +1,56 @@
/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __WAITPKGINTRIN_H
#define __WAITPKGINTRIN_H
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("waitpkg")))
static __inline__ void __DEFAULT_FN_ATTRS
_umonitor (void * __address)
{
__builtin_ia32_umonitor (__address);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_umwait (unsigned int __control, unsigned long long __counter)
{
return __builtin_ia32_umwait (__control,
(unsigned int)(__counter >> 32), (unsigned int)__counter);
}
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_tpause (unsigned int __control, unsigned long long __counter)
{
return __builtin_ia32_tpause (__control,
(unsigned int)(__counter >> 32), (unsigned int)__counter);
}
#undef __DEFAULT_FN_ATTRS
#endif /* __WAITPKGINTRIN_H */

View File

@ -0,0 +1,38 @@
/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef __WBNOINVDINTRIN_H
#define __WBNOINVDINTRIN_H
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd")))
_wbnoinvd (void)
{
__builtin_ia32_wbnoinvd ();
}
#endif /* __WBNOINVDINTRIN_H */

View File

@ -21,8 +21,8 @@
*===-----------------------------------------------------------------------===
*/
#ifndef _WMMINTRIN_H
#define _WMMINTRIN_H
#ifndef __WMMINTRIN_H
#define __WMMINTRIN_H
#include <emmintrin.h>
@ -30,4 +30,4 @@
#include <__wmmintrin_pclmul.h>
#endif /* _WMMINTRIN_H */
#endif /* __WMMINTRIN_H */

View File

@ -32,26 +32,6 @@
#include <mm3dnow.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__)
#include <bmiintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__)
#include <bmi2intrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__)
#include <lzcntintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__)
#include <popcntintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__)
#include <rdseedintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__)
#include <prfchwintrin.h>
#endif
@ -76,10 +56,6 @@
#include <lwpintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
#include <f16cintrin.h>
#endif
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__)
#include <mwaitxintrin.h>
#endif
@ -88,4 +64,5 @@
#include <clzerointrin.h>
#endif
#endif /* __X86INTRIN_H */

File diff suppressed because it is too large Load Diff

View File

@ -31,7 +31,8 @@
#include <fma4intrin.h>
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop")))
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
@ -201,7 +202,7 @@ _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
{
return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
@ -237,17 +238,17 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
}
#define _mm_roti_epi8(A, N) __extension__ ({ \
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); })
#define _mm_roti_epi8(A, N) \
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))
#define _mm_roti_epi16(A, N) __extension__ ({ \
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); })
#define _mm_roti_epi16(A, N) \
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))
#define _mm_roti_epi32(A, N) __extension__ ({ \
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); })
#define _mm_roti_epi32(A, N) \
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))
#define _mm_roti_epi64(A, N) __extension__ ({ \
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); })
#define _mm_roti_epi64(A, N) \
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shl_epi8(__m128i __A, __m128i __B)
@ -297,37 +298,37 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
}
#define _mm_com_epu8(A, B, N) __extension__ ({ \
#define _mm_com_epu8(A, B, N) \
(__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)); })
(__v16qi)(__m128i)(B), (N))
#define _mm_com_epu16(A, B, N) __extension__ ({ \
#define _mm_com_epu16(A, B, N) \
(__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)); })
(__v8hi)(__m128i)(B), (N))
#define _mm_com_epu32(A, B, N) __extension__ ({ \
#define _mm_com_epu32(A, B, N) \
(__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)); })
(__v4si)(__m128i)(B), (N))
#define _mm_com_epu64(A, B, N) __extension__ ({ \
#define _mm_com_epu64(A, B, N) \
(__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)); })
(__v2di)(__m128i)(B), (N))
#define _mm_com_epi8(A, B, N) __extension__ ({ \
#define _mm_com_epi8(A, B, N) \
(__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)); })
(__v16qi)(__m128i)(B), (N))
#define _mm_com_epi16(A, B, N) __extension__ ({ \
#define _mm_com_epi16(A, B, N) \
(__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)); })
(__v8hi)(__m128i)(B), (N))
#define _mm_com_epi32(A, B, N) __extension__ ({ \
#define _mm_com_epi32(A, B, N) \
(__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)); })
(__v4si)(__m128i)(B), (N))
#define _mm_com_epi64(A, B, N) __extension__ ({ \
#define _mm_com_epi64(A, B, N) \
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)); })
(__v2di)(__m128i)(B), (N))
#define _MM_PCOMCTRL_LT 0
#define _MM_PCOMCTRL_LE 1
@ -722,24 +723,24 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
}
#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \
#define _mm_permute2_pd(X, Y, C, I) \
(__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), \
(__v2di)(__m128i)(C), (I)); })
(__v2di)(__m128i)(C), (I))
#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \
#define _mm256_permute2_pd(X, Y, C, I) \
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
(__v4df)(__m256d)(Y), \
(__v4di)(__m256i)(C), (I)); })
(__v4di)(__m256i)(C), (I))
#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \
#define _mm_permute2_ps(X, Y, C, I) \
(__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I)); })
(__v4si)(__m128i)(C), (I))
#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \
#define _mm256_permute2_ps(X, Y, C, I) \
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
(__v8sf)(__m256)(Y), \
(__v8si)(__m256i)(C), (I)); })
(__v8si)(__m256i)(C), (I))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_frcz_ss(__m128 __A)
@ -765,18 +766,19 @@ _mm_frcz_pd(__m128d __A)
return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_frcz_ps(__m256 __A)
{
return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_frcz_pd(__m256d __A)
{
return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
}
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS256
#endif /* __XOPINTRIN_H */

View File

@ -1,4 +1,4 @@
/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------===
/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal

View File

@ -1,4 +1,4 @@
/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------===
/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -33,23 +33,23 @@
static __inline__ void __DEFAULT_FN_ATTRS
_xsave(void *__p, unsigned long long __m) {
return __builtin_ia32_xsave(__p, __m);
__builtin_ia32_xsave(__p, __m);
}
static __inline__ void __DEFAULT_FN_ATTRS
_xrstor(void *__p, unsigned long long __m) {
return __builtin_ia32_xrstor(__p, __m);
__builtin_ia32_xrstor(__p, __m);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_xsave64(void *__p, unsigned long long __m) {
return __builtin_ia32_xsave64(__p, __m);
__builtin_ia32_xsave64(__p, __m);
}
static __inline__ void __DEFAULT_FN_ATTRS
_xrstor64(void *__p, unsigned long long __m) {
return __builtin_ia32_xrstor64(__p, __m);
__builtin_ia32_xrstor64(__p, __m);
}
#endif

View File

@ -1,4 +1,4 @@
/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------===
/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
@ -33,13 +33,13 @@
static __inline__ void __DEFAULT_FN_ATTRS
_xsaveopt(void *__p, unsigned long long __m) {
return __builtin_ia32_xsaveopt(__p, __m);
__builtin_ia32_xsaveopt(__p, __m);
}
#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_xsaveopt64(void *__p, unsigned long long __m) {
return __builtin_ia32_xsaveopt64(__p, __m);
__builtin_ia32_xsaveopt64(__p, __m);
}
#endif

View File

@ -1,4 +1,4 @@
/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------===
/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal

View File

@ -1,4 +1,4 @@
/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------===
/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal