Mirror of https://github.com/ziglang/zig.git (synced 2025-12-06 14:23:09 +00:00)
update C headers to LLVM 15
release/15.x 37007475ca1b345b4c5d340e228bcd7a62732d81
This commit is contained in:
parent adb4a95302
commit d3389eadf4
lib/include/__clang_cuda_intrinsics.h (vendored): 12 changed lines

@@ -71,8 +71,8 @@
 } \
 inline __device__ unsigned long long __FnName( \
 unsigned long long __val, __Type __offset, int __width = warpSize) { \
-return static_cast<unsigned long long>(::__FnName( \
-static_cast<unsigned long long>(__val), __offset, __width)); \
+return static_cast<unsigned long long>( \
+::__FnName(static_cast<long long>(__val), __offset, __width)); \
 } \
 inline __device__ double __FnName(double __val, __Type __offset, \
 int __width = warpSize) { \
@@ -139,8 +139,8 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
 inline __device__ unsigned long long __FnName( \
 unsigned int __mask, unsigned long long __val, __Type __offset, \
 int __width = warpSize) { \
-return static_cast<unsigned long long>(::__FnName( \
-__mask, static_cast<unsigned long long>(__val), __offset, __width)); \
+return static_cast<unsigned long long>( \
+::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
 } \
 inline __device__ long __FnName(unsigned int __mask, long __val, \
 __Type __offset, int __width = warpSize) { \
@@ -234,7 +234,7 @@ inline __device__ unsigned int __match32_any_sync(unsigned int mask,
 return __nvvm_match_any_sync_i32(mask, value);
 }

-inline __device__ unsigned long long
+inline __device__ unsigned int
 __match64_any_sync(unsigned int mask, unsigned long long value) {
 return __nvvm_match_any_sync_i64(mask, value);
 }
@@ -244,7 +244,7 @@ __match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
 return __nvvm_match_all_sync_i32p(mask, value, pred);
 }

-inline __device__ unsigned long long
+inline __device__ unsigned int
 __match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
 return __nvvm_match_all_sync_i64p(mask, value, pred);
 }
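Note: the change above makes the unsigned 64-bit shuffle wrapper convert its argument to signed long long before delegating, so the inner call resolves to the signed overload instead of re-entering the unsigned wrapper, and only the result is converted back. A minimal sketch of that convert-call-convert pattern, with hypothetical function names standing in for the macro-generated overloads:

#include <stdio.h>

/* Hypothetical stand-in for the signed 64-bit shuffle primitive
 * (in the real header this is the long long overload of __FnName). */
static long long shuffle_ll(long long val) {
    return val; /* a real implementation would move data between lanes */
}

/* Unsigned wrapper: cast the value to signed for the call, then cast the
 * result back, mirroring the fixed macro body:
 *   static_cast<unsigned long long>(::__FnName(static_cast<long long>(__val), ...)) */
static unsigned long long shuffle_ull(unsigned long long val) {
    return (unsigned long long)shuffle_ll((long long)val);
}

int main(void) {
    printf("%llu\n", shuffle_ull(42ULL));
    return 0;
}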
lib/include/__wmmintrin_pclmul.h (vendored): 20 changed lines

@@ -22,23 +22,23 @@
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// __m128i _mm_clmulepi64_si128(__m128i X, __m128i Y, const int I);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
 ///
-/// \param __X
+/// \param X
 /// A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __Y
+/// \param Y
 /// A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __I
+/// \param I
 /// An immediate value specifying which 64-bit values to select from the
-/// operands. Bit 0 is used to select a value from operand \a __X, and bit
-/// 4 is used to select a value from operand \a __Y: \n
-/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
-/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
-/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
-/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// operands. Bit 0 is used to select a value from operand \a X, and bit
+/// 4 is used to select a value from operand \a Y: \n
+/// Bit[0]=0 indicates that bits[63:0] of operand \a X are used. \n
+/// Bit[0]=1 indicates that bits[127:64] of operand \a X are used. \n
+/// Bit[4]=0 indicates that bits[63:0] of operand \a Y are used. \n
+/// Bit[4]=1 indicates that bits[127:64] of operand \a Y are used.
 /// \returns The 128-bit integer vector containing the result of the carry-less
 /// multiplication of the selected 64-bit values.
 #define _mm_clmulepi64_si128(X, Y, I) \
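Note: the documentation now spells the parameters as X, Y, and I, matching the actual macro definition. As a usage sketch of the documented signature (assumes an x86-64 compiler with PCLMUL support, e.g. clang -mpclmul):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    /* Two [2 x i64] vectors; I = 0x00 selects the low 64-bit lane of each. */
    __m128i x = _mm_set_epi64x(0, 0x3); /* low qword: 0b11  */
    __m128i y = _mm_set_epi64x(0, 0x5); /* low qword: 0b101 */
    __m128i r = _mm_clmulepi64_si128(x, y, 0x00);

    /* Carry-less 0b11 * 0b101 = 0b1111. */
    printf("0x%llx\n", (unsigned long long)_mm_cvtsi128_si64(r));
    return 0;
}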
lib/include/altivec.h (vendored): 653 changed lines (file diff suppressed because it is too large)
lib/include/amxintrin.h (vendored): 2 changed lines

@@ -439,8 +439,6 @@ static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
 ///
 /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
 ///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
 /// \param base
 /// A pointer to base address.
 /// \param stride
lib/include/arm_sve.h (vendored): 122 changed lines

@@ -2407,15 +2407,15 @@ svuint64_t svcnt_s64_z(svbool_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnt_s16_z)))
 svuint16_t svcnt_s16_z(svbool_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb)))
-uint64_t svcntb();
+uint64_t svcntb(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntb_pat)))
 uint64_t svcntb_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd)))
-uint64_t svcntd();
+uint64_t svcntd(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntd_pat)))
 uint64_t svcntd_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth)))
-uint64_t svcnth();
+uint64_t svcnth(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcnth_pat)))
 uint64_t svcnth_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b8)))
@@ -2427,7 +2427,7 @@ uint64_t svcntp_b64(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntp_b16)))
 uint64_t svcntp_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw)))
-uint64_t svcntw();
+uint64_t svcntw(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcntw_pat)))
 uint64_t svcntw_pat(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svcompact_u32)))
@@ -6521,7 +6521,7 @@ int64_t svorv_s64(svbool_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
 int16_t svorv_s16(svbool_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
-svbool_t svpfalse_b();
+svbool_t svpfalse_b(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
 svbool_t svpfirst_b(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpnext_b8)))
@@ -6627,13 +6627,13 @@ svbool_t svptrue_pat_b64(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_pat_b16)))
 svbool_t svptrue_pat_b16(enum svpattern);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b8)))
-svbool_t svptrue_b8();
+svbool_t svptrue_b8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b32)))
-svbool_t svptrue_b32();
+svbool_t svptrue_b32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b64)))
-svbool_t svptrue_b64();
+svbool_t svptrue_b64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svptrue_b16)))
-svbool_t svptrue_b16();
+svbool_t svptrue_b16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s8)))
 svint8_t svqadd_n_s8(svint8_t, int8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svqadd_n_s32)))
@@ -7011,7 +7011,7 @@ svint64_t svrbit_s64_z(svbool_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrbit_s16_z)))
 svint16_t svrbit_s16_z(svbool_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr)))
-svbool_t svrdffr();
+svbool_t svrdffr(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrdffr_z)))
 svbool_t svrdffr_z(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrecpe_f64)))
@@ -7411,7 +7411,7 @@ svint64x4_t svset4_s64(svint64x4_t, uint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset4_s16)))
 svint16x4_t svset4_s16(svint16x4_t, uint64_t, svint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsetffr)))
-void svsetffr();
+void svsetffr(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u8)))
 svuint8_t svsplice_u8(svbool_t, svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svsplice_u32)))
@@ -8285,93 +8285,93 @@ svfloat32_t svtssel_f32(svfloat32_t, svuint32_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtssel_f16)))
 svfloat16_t svtssel_f16(svfloat16_t, svuint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u8)))
-svuint8x2_t svundef2_u8();
+svuint8x2_t svundef2_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u32)))
-svuint32x2_t svundef2_u32();
+svuint32x2_t svundef2_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u64)))
-svuint64x2_t svundef2_u64();
+svuint64x2_t svundef2_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_u16)))
-svuint16x2_t svundef2_u16();
+svuint16x2_t svundef2_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s8)))
-svint8x2_t svundef2_s8();
+svint8x2_t svundef2_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f64)))
-svfloat64x2_t svundef2_f64();
+svfloat64x2_t svundef2_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f32)))
-svfloat32x2_t svundef2_f32();
+svfloat32x2_t svundef2_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_f16)))
-svfloat16x2_t svundef2_f16();
+svfloat16x2_t svundef2_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s32)))
-svint32x2_t svundef2_s32();
+svint32x2_t svundef2_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s64)))
-svint64x2_t svundef2_s64();
+svint64x2_t svundef2_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_s16)))
-svint16x2_t svundef2_s16();
+svint16x2_t svundef2_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u8)))
-svuint8x3_t svundef3_u8();
+svuint8x3_t svundef3_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u32)))
-svuint32x3_t svundef3_u32();
+svuint32x3_t svundef3_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u64)))
-svuint64x3_t svundef3_u64();
+svuint64x3_t svundef3_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_u16)))
-svuint16x3_t svundef3_u16();
+svuint16x3_t svundef3_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s8)))
-svint8x3_t svundef3_s8();
+svint8x3_t svundef3_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f64)))
-svfloat64x3_t svundef3_f64();
+svfloat64x3_t svundef3_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f32)))
-svfloat32x3_t svundef3_f32();
+svfloat32x3_t svundef3_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_f16)))
-svfloat16x3_t svundef3_f16();
+svfloat16x3_t svundef3_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s32)))
-svint32x3_t svundef3_s32();
+svint32x3_t svundef3_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s64)))
-svint64x3_t svundef3_s64();
+svint64x3_t svundef3_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_s16)))
-svint16x3_t svundef3_s16();
+svint16x3_t svundef3_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u8)))
-svuint8x4_t svundef4_u8();
+svuint8x4_t svundef4_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u32)))
-svuint32x4_t svundef4_u32();
+svuint32x4_t svundef4_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u64)))
-svuint64x4_t svundef4_u64();
+svuint64x4_t svundef4_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_u16)))
-svuint16x4_t svundef4_u16();
+svuint16x4_t svundef4_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s8)))
-svint8x4_t svundef4_s8();
+svint8x4_t svundef4_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f64)))
-svfloat64x4_t svundef4_f64();
+svfloat64x4_t svundef4_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f32)))
-svfloat32x4_t svundef4_f32();
+svfloat32x4_t svundef4_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_f16)))
-svfloat16x4_t svundef4_f16();
+svfloat16x4_t svundef4_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s32)))
-svint32x4_t svundef4_s32();
+svint32x4_t svundef4_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s64)))
-svint64x4_t svundef4_s64();
+svint64x4_t svundef4_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_s16)))
-svint16x4_t svundef4_s16();
+svint16x4_t svundef4_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u8)))
-svuint8_t svundef_u8();
+svuint8_t svundef_u8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u32)))
-svuint32_t svundef_u32();
+svuint32_t svundef_u32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u64)))
-svuint64_t svundef_u64();
+svuint64_t svundef_u64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_u16)))
-svuint16_t svundef_u16();
+svuint16_t svundef_u16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s8)))
-svint8_t svundef_s8();
+svint8_t svundef_s8(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f64)))
-svfloat64_t svundef_f64();
+svfloat64_t svundef_f64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f32)))
-svfloat32_t svundef_f32();
+svfloat32_t svundef_f32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_f16)))
-svfloat16_t svundef_f16();
+svfloat16_t svundef_f16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s32)))
-svint32_t svundef_s32();
+svint32_t svundef_s32(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s64)))
-svint64_t svundef_s64();
+svint64_t svundef_s64(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_s16)))
-svint16_t svundef_s16();
+svint16_t svundef_s16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_b)))
 svbool_t svunpkhi_b(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svunpkhi_s32)))
@@ -13830,8 +13830,8 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s64)))
 int64_t svorv(svbool_t, svint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svorv_s16)))
 int16_t svorv(svbool_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
-svbool_t svpfalse();
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfalse_b)))
+svbool_t svpfalse(void);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svpfirst_b)))
 svbool_t svpfirst(svbool_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svprfb_gather_u32base)))
@@ -23456,13 +23456,13 @@ svbfloat16_t svtrn1_bf16(svbfloat16_t, svbfloat16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_bf16)))
 svbfloat16_t svtrn2_bf16(svbfloat16_t, svbfloat16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef2_bf16)))
-svbfloat16x2_t svundef2_bf16();
+svbfloat16x2_t svundef2_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef3_bf16)))
-svbfloat16x3_t svundef3_bf16();
+svbfloat16x3_t svundef3_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef4_bf16)))
-svbfloat16x4_t svundef4_bf16();
+svbfloat16x4_t svundef4_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svundef_bf16)))
-svbfloat16_t svundef_bf16();
+svbfloat16_t svundef_bf16(void);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_bf16)))
 svbfloat16_t svuzp1_bf16(svbfloat16_t, svbfloat16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_bf16)))
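Note: every niladic SVE declaration above gains an explicit (void). In C before C23, an empty parameter list declares a function with unspecified parameters rather than a true prototype. A small standalone illustration of the difference, using hypothetical declarations rather than the SVE header itself:

#include <stdint.h>
#include <stdio.h>

/* Old style: uint64_t f(); is not a prototype in C (before C23); call sites
 * are not argument-checked and -Wstrict-prototypes warns about it. */
uint64_t count_bytes_old();

/* New style: an explicit (void) declares a real prototype with no parameters. */
uint64_t count_bytes_new(void);

uint64_t count_bytes_old() { return 16; }
uint64_t count_bytes_new(void) { return 16; }

int main(void) {
    /* count_bytes_old(123) would compile silently with the old declaration;
     * count_bytes_new(123) is rejected because the prototype takes no arguments. */
    printf("%llu\n", (unsigned long long)count_bytes_new());
    return 0;
}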
lib/include/avx2intrin.h (vendored): 16 changed lines

@@ -92,25 +92,25 @@ _mm256_add_epi64(__m256i __a, __m256i __b)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epi8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epi16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epu8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_adds_epu16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
 }

 #define _mm256_alignr_epi8(a, b, n) \
@@ -628,25 +628,25 @@ _mm256_sub_epi64(__m256i __a, __m256i __b)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epi8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epi16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epu8(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_subs_epu16(__m256i __a, __m256i __b)
 {
-return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
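Note: the AVX2 saturating add/subtract intrinsics above now lower through the generic __builtin_elementwise_add_sat / __builtin_elementwise_sub_sat builtins instead of target-specific ia32 builtins; the observable behavior is unchanged. For reference, a scalar sketch of what signed 8-bit saturating addition means (my own illustration, not taken from the header):

#include <stdio.h>

/* Signed 8-bit add that clamps to [-128, 127] instead of wrapping,
 * i.e. the per-lane operation behind _mm256_adds_epi8. */
static signed char add_sat_i8(signed char a, signed char b) {
    int sum = (int)a + (int)b;
    if (sum > 127)  return 127;
    if (sum < -128) return -128;
    return (signed char)sum;
}

int main(void) {
    printf("%d\n", add_sat_i8(100, 100));   /* 127, not -56 */
    printf("%d\n", add_sat_i8(-100, -100)); /* -128, not 56 */
    return 0;
}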
lib/include/avx512bwintrin.h (vendored): 24 changed lines

@@ -617,7 +617,7 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epi8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
+return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -639,7 +639,7 @@ _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epi16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
+return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -661,7 +661,7 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epu8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
+return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -683,7 +683,7 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_adds_epu16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
+return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -950,7 +950,7 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epi8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
+return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -972,7 +972,7 @@ _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epi16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
+return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -994,7 +994,7 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epu8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
+return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1016,7 +1016,7 @@ _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_subs_epu16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
+return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1506,7 +1506,7 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi16(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1598,7 +1598,7 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srai_epi16(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1643,7 +1643,7 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srli_epi16(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -1659,7 +1659,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
 {
 return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
-(__v32hi)_mm512_srli_epi16(__A, __B),
+(__v32hi)_mm512_srli_epi16(__A, (unsigned int)__B),
 (__v32hi)_mm512_setzero_si512());
 }

lib/include/avx512fintrin.h (vendored): 127 changed lines

@@ -1780,7 +1780,7 @@ _mm512_floor_ps(__m512 __A)
 {
 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
 _MM_FROUND_FLOOR,
-(__v16sf) __A, -1,
+(__v16sf) __A, (unsigned short)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

@@ -1798,7 +1798,7 @@ _mm512_floor_pd(__m512d __A)
 {
 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
 _MM_FROUND_FLOOR,
-(__v8df) __A, -1,
+(__v8df) __A, (unsigned char)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

@@ -1825,7 +1825,7 @@ _mm512_ceil_ps(__m512 __A)
 {
 return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
 _MM_FROUND_CEIL,
-(__v16sf) __A, -1,
+(__v16sf) __A, (unsigned short)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

@@ -1834,7 +1834,7 @@ _mm512_ceil_pd(__m512d __A)
 {
 return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
 _MM_FROUND_CEIL,
-(__v8df) __A, -1,
+(__v8df) __A, (unsigned char)-1,
 _MM_FROUND_CUR_DIRECTION);
 }

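Note: in the floor/ceil helpers above, the all-lanes mask argument -1 is now written with an explicit cast to the mask's width ((unsigned short)-1 for 16 float lanes, (unsigned char)-1 for 8 double lanes), which keeps the all-ones value while avoiding an implicit signed-to-unsigned conversion. A tiny standalone sketch, not header code, of why -1 with a narrowing cast still selects every lane:

#include <stdio.h>

int main(void) {
    /* Converting -1 to an unsigned 16-bit type yields 0xFFFF: one bit set
     * per lane of a 16-element vector, i.e. "apply the operation everywhere". */
    unsigned short all_lanes16 = (unsigned short)-1;
    unsigned char  all_lanes8  = (unsigned char)-1;
    printf("0x%04x 0x%02x\n", all_lanes16, all_lanes8); /* 0xffff 0xff */
    return 0;
}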
@@ -5117,7 +5117,7 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi32(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5139,7 +5139,7 @@ _mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_slli_epi64(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5161,7 +5161,7 @@ _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srli_epi32(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5183,7 +5183,7 @@ _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srli_epi64(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -5929,41 +5929,44 @@ _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
 (__v8di)_mm512_setzero_si512());
 }

+/// \enum _MM_TERNLOG_ENUM
+/// A helper to represent the ternary logic operations among vector \a A,
+/// \a B and \a C. The representation is passed to \a imm.
+typedef enum {
+_MM_TERNLOG_A = 0xF0,
+_MM_TERNLOG_B = 0xCC,
+_MM_TERNLOG_C = 0xAA
+} _MM_TERNLOG_ENUM;
+
 #define _mm512_ternarylogic_epi32(A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
-(__v16si)(__m512i)(B), \
-(__v16si)(__m512i)(C), (int)(imm), \
-(__mmask16)-1))
+((__m512i)__builtin_ia32_pternlogd512_mask( \
+(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+(unsigned char)(imm), (__mmask16)-1))

 #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
-(__v16si)(__m512i)(B), \
-(__v16si)(__m512i)(C), (int)(imm), \
-(__mmask16)(U)))
+((__m512i)__builtin_ia32_pternlogd512_mask( \
+(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+(unsigned char)(imm), (__mmask16)(U)))

 #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
-(__v16si)(__m512i)(B), \
-(__v16si)(__m512i)(C), \
-(int)(imm), (__mmask16)(U)))
+((__m512i)__builtin_ia32_pternlogd512_maskz( \
+(__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C), \
+(unsigned char)(imm), (__mmask16)(U)))

 #define _mm512_ternarylogic_epi64(A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
-(__v8di)(__m512i)(B), \
-(__v8di)(__m512i)(C), (int)(imm), \
-(__mmask8)-1))
+((__m512i)__builtin_ia32_pternlogq512_mask( \
+(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+(unsigned char)(imm), (__mmask8)-1))

 #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
-(__v8di)(__m512i)(B), \
-(__v8di)(__m512i)(C), (int)(imm), \
-(__mmask8)(U)))
+((__m512i)__builtin_ia32_pternlogq512_mask( \
+(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+(unsigned char)(imm), (__mmask8)(U)))

 #define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
-((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
-(__v8di)(__m512i)(B), \
-(__v8di)(__m512i)(C), (int)(imm), \
-(__mmask8)(U)))
+((__m512i)__builtin_ia32_pternlogq512_maskz( \
+(__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C), \
+(unsigned char)(imm), (__mmask8)(U)))

 #ifdef __x86_64__
 #define _mm_cvt_roundsd_i64(A, R) \
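Note: the new _MM_TERNLOG_ENUM constants encode each input's truth-table column (A = 0xF0, B = 0xCC, C = 0xAA), so an imm8 for the ternarylogic intrinsics can be written as a bitwise expression over them. A usage sketch; the macro names below are my own stand-ins with the same values, and the constant folding shown is plain integer arithmetic:

#include <stdio.h>

/* Same values the header now defines in _MM_TERNLOG_ENUM. */
#define TERNLOG_A 0xF0
#define TERNLOG_B 0xCC
#define TERNLOG_C 0xAA

int main(void) {
    /* imm8 for the operation (A & B) | C, usable as
     * _mm512_ternarylogic_epi32(a, b, c, (TERNLOG_A & TERNLOG_B) | TERNLOG_C). */
    unsigned imm = (TERNLOG_A & TERNLOG_B) | TERNLOG_C;
    printf("0x%02x\n", imm); /* 0xea */
    return 0;
}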
@@ -6603,7 +6606,7 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srai_epi32(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -6626,7 +6629,7 @@ _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srai_epi64(__m512i __A, unsigned int __B)
 {
-return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
 }

 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -9316,11 +9319,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
 */

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
-return __builtin_ia32_reduce_add_q512(__W);
+return __builtin_reduce_add((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
-return __builtin_ia32_reduce_mul_q512(__W);
+return __builtin_reduce_mul((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
@@ -9334,18 +9337,18 @@ static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i
 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
 __W = _mm512_maskz_mov_epi64(__M, __W);
-return __builtin_ia32_reduce_add_q512(__W);
+return __builtin_reduce_add((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
 __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
-return __builtin_ia32_reduce_mul_q512(__W);
+return __builtin_reduce_mul((__v8di)__W);
 }

 static __inline__ long long __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
-__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
+__W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
 return __builtin_reduce_and((__v8di)__W);
 }

@@ -9380,12 +9383,12 @@ _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_add_epi32(__m512i __W) {
-return __builtin_ia32_reduce_add_d512((__v16si)__W);
+return __builtin_reduce_add((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_reduce_mul_epi32(__m512i __W) {
-return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+return __builtin_reduce_mul((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
@@ -9401,18 +9404,18 @@ _mm512_reduce_or_epi32(__m512i __W) {
 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
 __W = _mm512_maskz_mov_epi32(__M, __W);
-return __builtin_ia32_reduce_add_d512((__v16si)__W);
+return __builtin_reduce_add((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
 __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
-return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+return __builtin_reduce_mul((__v16si)__W);
 }

 static __inline__ int __DEFAULT_FN_ATTRS512
 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
-__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
+__W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
 return __builtin_reduce_and((__v16si)__W);
 }

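Note: the reductions above now go through the generic __builtin_reduce_add / __builtin_reduce_mul builtins on the vector directly instead of the ia32 reduce builtins. As the surrounding code shows, the masked variants first replace inactive lanes with the operation's identity (0 for add, 1 for mul, all-ones for and) and then reduce. A scalar sketch of that mask-then-reduce shape (illustration only, not the header code):

#include <stdio.h>

/* Reduce eight 64-bit lanes with addition, treating lanes whose mask bit is
 * clear as the identity element 0, which is the shape of
 * _mm512_mask_reduce_add_epi64. */
static long long mask_reduce_add(const long long lanes[8], unsigned char mask) {
    long long sum = 0;
    for (int i = 0; i < 8; ++i)
        if (mask & (1u << i))
            sum += lanes[i];
    return sum;
}

int main(void) {
    long long v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("%lld\n", mask_reduce_add(v, 0x0F)); /* 1+2+3+4 = 10 */
    return 0;
}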
@ -9484,7 +9487,7 @@ _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
|
|||||||
|
|
||||||
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
|
||||||
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
|
_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
|
||||||
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
|
__V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
|
||||||
return __builtin_reduce_min((__v8du)__V);
|
return __builtin_reduce_min((__v8du)__V);
|
||||||
}
|
}
|
||||||
static __inline__ int __DEFAULT_FN_ATTRS512
|
static __inline__ int __DEFAULT_FN_ATTRS512
|
||||||
@ -9527,7 +9530,7 @@ _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
|
|||||||
|
|
||||||
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
static __inline__ unsigned int __DEFAULT_FN_ATTRS512
|
||||||
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
|
_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
|
||||||
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
|
__V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
|
||||||
return __builtin_reduce_min((__v16su)__V);
|
return __builtin_reduce_min((__v16su)__V);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -9598,7 +9601,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9606,7 +9609,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_i32logather_pd(vindex, base_addr, scale) \
@@ -9618,7 +9621,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9630,7 +9633,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
@@ -9641,7 +9644,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9649,7 +9652,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_i32logather_epi64(vindex, base_addr, scale) \
@@ -9660,7 +9663,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9672,7 +9675,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
@@ -9683,14 +9686,14 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
@@ -9702,7 +9705,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9711,7 +9714,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
@@ -9722,14 +9725,14 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
@@ -9741,7 +9744,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
 /// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -9750,7 +9753,7 @@ _mm512_cvtsi512_si32(__m512i __A) {
-/// \endoperation
+/// \endcode
 #define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
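A minimal usage sketch of the low-index gather macro documented above (illustrative only; assumes an AVX-512F target and a caller-provided `base` array of doubles):

#include <immintrin.h>

/* Gather 8 doubles through the 32-bit indices held in the low 256 bits of
   `vindex`; the scale argument is the byte stride between elements. */
static inline __m512d gather8_doubles(const double *base, __m512i vindex) {
  return _mm512_i32logather_pd(vindex, base, 8);
}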
2 lib/include/avx512vlbf16intrin.h vendored

@@ -417,7 +417,7 @@ static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
   __v4sf __V = {__A, 0, 0, 0};
   __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
       (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-  return __R[0];
+  return (__bfloat16)__R[0];
 }

 /// Convert Packed BF16 Data to Packed float Data.
16 lib/include/avx512vlbwintrin.h vendored

@@ -1942,7 +1942,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_slli_epi16(__A, __B),
+(__v8hi)_mm_slli_epi16(__A, (int)__B),
@@ -1950,7 +1950,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_slli_epi16(__A, __B),
+(__v8hi)_mm_slli_epi16(__A, (int)__B),
@@ -1959,7 +1959,7 @@ _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
-(__v16hi)_mm256_slli_epi16(__A, __B),
+(__v16hi)_mm256_slli_epi16(__A, (int)__B),
@@ -1967,7 +1967,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-(__v16hi)_mm256_slli_epi16(__A, __B),
+(__v16hi)_mm256_slli_epi16(__A, (int)__B),
@@ -2095,7 +2095,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_srai_epi16(__A, __B),
+(__v8hi)_mm_srai_epi16(__A, (int)__B),
@@ -2103,7 +2103,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v8hi)_mm_srai_epi16(__A, __B),
+(__v8hi)_mm_srai_epi16(__A, (int)__B),
@@ -2112,7 +2112,7 @@ _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
-(__v16hi)_mm256_srai_epi16(__A, __B),
+(__v16hi)_mm256_srai_epi16(__A, (int)__B),
@@ -2120,7 +2120,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-(__v16hi)_mm256_srai_epi16(__A, __B),
+(__v16hi)_mm256_srai_epi16(__A, (int)__B),
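A minimal usage sketch of the masked word-shift intrinsics touched above (illustrative only; assumes a target compiled with AVX-512BW and AVX-512VL support):

#include <immintrin.h>

/* Shift the 16-bit lanes selected by mask `k` left by `count`; unselected
   lanes keep the value from `src`. The public shift-count parameter is
   `unsigned int`; the header now casts it to `int` internally. */
static inline __m128i shift_selected_words(__m128i src, __mmask8 k,
                                           __m128i v, unsigned int count) {
  return _mm_mask_slli_epi16(src, k, v, count);
}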
130 lib/include/avx512vlintrin.h vendored

@@ -4525,7 +4525,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_slli_epi32(__A, __B),
+(__v4si)_mm_slli_epi32(__A, (int)__B),
@@ -4533,7 +4533,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_slli_epi32(__A, __B),
+(__v4si)_mm_slli_epi32(__A, (int)__B),
@@ -4541,7 +4541,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_slli_epi32(__A, __B),
+(__v8si)_mm256_slli_epi32(__A, (int)__B),
@@ -4549,7 +4549,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_slli_epi32(__A, __B),
+(__v8si)_mm256_slli_epi32(__A, (int)__B),
@@ -4589,7 +4589,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_slli_epi64(__A, __B),
+(__v2di)_mm_slli_epi64(__A, (int)__B),
@@ -4597,7 +4597,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_slli_epi64(__A, __B),
+(__v2di)_mm_slli_epi64(__A, (int)__B),
@@ -4605,7 +4605,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_slli_epi64(__A, __B),
+(__v4di)_mm256_slli_epi64(__A, (int)__B),
@@ -4613,7 +4613,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_slli_epi64(__A, __B),
+(__v4di)_mm256_slli_epi64(__A, (int)__B),
@@ -4869,7 +4869,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srli_epi32(__A, __B),
+(__v4si)_mm_srli_epi32(__A, (int)__B),
@@ -4877,7 +4877,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srli_epi32(__A, __B),
+(__v4si)_mm_srli_epi32(__A, (int)__B),
@@ -4885,7 +4885,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srli_epi32(__A, __B),
+(__v8si)_mm256_srli_epi32(__A, (int)__B),
@@ -4893,7 +4893,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srli_epi32(__A, __B),
+(__v8si)_mm256_srli_epi32(__A, (int)__B),
@@ -4933,7 +4933,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_srli_epi64(__A, __B),
+(__v2di)_mm_srli_epi64(__A, (int)__B),
@@ -4941,7 +4941,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v2di)_mm_srli_epi64(__A, __B),
+(__v2di)_mm_srli_epi64(__A, (int)__B),
@@ -4949,7 +4949,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_srli_epi64(__A, __B),
+(__v4di)_mm256_srli_epi64(__A, (int)__B),
@@ -4957,7 +4957,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v4di)_mm256_srli_epi64(__A, __B),
+(__v4di)_mm256_srli_epi64(__A, (int)__B),
@@ -6408,7 +6408,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srai_epi32(__A, __B),
+(__v4si)_mm_srai_epi32(__A, (int)__B),
@@ -6416,7 +6416,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-(__v4si)_mm_srai_epi32(__A, __B),
+(__v4si)_mm_srai_epi32(__A, (int)__B),
@@ -6424,7 +6424,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srai_epi32(__A, __B),
+(__v8si)_mm256_srai_epi32(__A, (int)__B),
@@ -6432,7 +6432,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-(__v8si)_mm256_srai_epi32(__A, __B),
+(__v8si)_mm256_srai_epi32(__A, (int)__B),
@@ -6483,7 +6483,7 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
 _mm_srai_epi64(__m128i __A, unsigned int __imm)
-  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+  return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, (int)__imm);
@@ -6505,7 +6505,7 @@ _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
 _mm256_srai_epi64(__m256i __A, unsigned int __imm)
-  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+  return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, (int)__imm);
@@ -6526,78 +6526,64 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
 #define _mm_ternarylogic_epi32(A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogd128_mask( \
-(__v4si)(__m128i)(B), \
+(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
-(__v4si)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogd128_mask( \
-(__v4si)(__m128i)(B), \
+(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
-(__v4si)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogd128_maskz( \
-(__v4si)(__m128i)(B), \
+(__v4si)(__m128i)(A), (__v4si)(__m128i)(B), (__v4si)(__m128i)(C), \
-(__v4si)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_ternarylogic_epi32(A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogd256_mask( \
-(__v8si)(__m256i)(B), \
+(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
-(__v8si)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogd256_mask( \
-(__v8si)(__m256i)(B), \
+(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
-(__v8si)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogd256_maskz( \
-(__v8si)(__m256i)(B), \
+(__v8si)(__m256i)(A), (__v8si)(__m256i)(B), (__v8si)(__m256i)(C), \
-(__v8si)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm_ternarylogic_epi64(A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogq128_mask( \
-(__v2di)(__m128i)(B), \
+(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
-(__v2di)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogq128_mask( \
-(__v2di)(__m128i)(B), \
+(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
-(__v2di)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
-((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
+((__m128i)__builtin_ia32_pternlogq128_maskz( \
-(__v2di)(__m128i)(B), \
+(__v2di)(__m128i)(A), (__v2di)(__m128i)(B), (__v2di)(__m128i)(C), \
-(__v2di)(__m128i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_ternarylogic_epi64(A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogq256_mask( \
-(__v4di)(__m256i)(B), \
+(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
-(__v4di)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)-1))
-(__mmask8)-1))
 #define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogq256_mask( \
-(__v4di)(__m256i)(B), \
+(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
-(__v4di)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
-((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
+((__m256i)__builtin_ia32_pternlogq256_maskz( \
-(__v4di)(__m256i)(B), \
+(__v4di)(__m256i)(A), (__v4di)(__m256i)(B), (__v4di)(__m256i)(C), \
-(__v4di)(__m256i)(C), (int)(imm), \
+(unsigned char)(imm), (__mmask8)(U)))
-(__mmask8)(U)))
 #define _mm256_shuffle_f32x4(A, B, imm) \
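A minimal usage sketch of the reformatted ternary-logic macros (illustrative only; assumes AVX-512F and AVX-512VL support; 0x96 is the 8-bit truth table for a three-way XOR):

#include <immintrin.h>

/* Compute a ^ b ^ c per 32-bit lane; the immediate is a compile-time
   truth table over the three inputs and is now passed as unsigned char. */
static inline __m128i xor3_epi32(__m128i a, __m128i b, __m128i c) {
  return _mm_ternarylogic_epi32(a, b, c, 0x96);
}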
32 lib/include/avx512vlvnniintrin.h vendored

@@ -25,7 +25,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -34,7 +34,7 @@
-/// \endoperation
+/// \endcode
 #define _mm256_dpbusd_epi32(S, A, B) \
@@ -45,7 +45,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -54,7 +54,7 @@
-/// \endoperation
+/// \endcode
 #define _mm256_dpbusds_epi32(S, A, B) \
@@ -65,14 +65,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm256_dpwssd_epi32(S, A, B) \
@@ -83,14 +83,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm256_dpwssds_epi32(S, A, B) \
@@ -101,7 +101,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -110,7 +110,7 @@
-/// \endoperation
+/// \endcode
 #define _mm_dpbusd_epi32(S, A, B) \
@@ -121,7 +121,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -130,7 +130,7 @@
-/// \endoperation
+/// \endcode
 #define _mm_dpbusds_epi32(S, A, B) \
@@ -141,14 +141,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm_dpwssd_epi32(S, A, B) \
@@ -159,14 +159,14 @@
 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 #define _mm_dpwssds_epi32(S, A, B) \
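A minimal usage sketch of the VNNI dot-product accumulation described by the pseudocode above (illustrative only; assumes AVX-512VNNI with AVX-512VL support):

#include <immintrin.h>

/* Each 32-bit lane of the result is the corresponding lane of `acc` plus the
   dot product of four unsigned bytes of `a` with four signed bytes of `b`. */
static inline __m256i dot_accumulate_u8s8(__m256i acc, __m256i a, __m256i b) {
  return _mm256_dpbusd_epi32(acc, a, b);
}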
95 lib/include/avxintrin.h vendored

@@ -1504,7 +1504,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
-/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
+/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
+/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+/// <c>[b6, b4, b2, b0]</c>.
 /// \returns A 256-bit vector of [8 x float] containing the shuffled values.
@@ -1953,12 +1956,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// int _mm256_extract_epi32(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -1971,12 +1978,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// int _mm256_extract_epi16(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -1990,12 +2001,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// int _mm256_extract_epi8(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -2010,12 +2025,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// long long _mm256_extract_epi64(__m256i X, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __imm
+/// \param N
@@ -2030,18 +2049,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -2053,18 +2076,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -2075,18 +2102,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -2098,18 +2129,22 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+/// \code
+/// __m256i _mm256_insert_epi64(__m256i X, int I, const int N);
+/// \endcode
+///
-/// \param __a
+/// \param X
-/// \param __b
+/// \param I
-/// \param __imm
+/// \param N
-/// \returns A copy of vector \a __a, after replacing its element indexed by
+/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a __imm with \a __b.
+/// \a N with \a I.
@@ -3177,7 +3212,7 @@ _mm256_loadu_si256(__m256i_u const *__p)
-_mm256_lddqu_si256(__m256i const *__p)
+_mm256_lddqu_si256(__m256i_u const *__p)
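A minimal usage sketch of the insert/extract prototypes documented above (illustrative only; the lane index must be a compile-time constant):

#include <immintrin.h>

/* Replace lane 3 of an [8 x i32] vector with `value`, then read it back. */
static inline int replace_and_read_lane3(__m256i v, int value) {
  __m256i t = _mm256_insert_epi32(v, value, 3);
  return _mm256_extract_epi32(t, 3);
}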
32 lib/include/avxvnniintrin.h vendored

@@ -50,7 +50,7 @@
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -59,7 +59,7 @@
-/// \endoperation
+/// \endcode
 _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -73,7 +73,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -82,7 +82,7 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \endoperation
+/// \endcode
 _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -96,14 +96,14 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -117,14 +117,14 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
@@ -138,7 +138,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 /// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -147,7 +147,7 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \endoperation
+/// \endcode
 _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
@@ -161,7 +161,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 /// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-/// \operation
+/// \code{.operation}
@@ -170,7 +170,7 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \endoperation
+/// \endcode
 _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
@@ -184,14 +184,14 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 /// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-/// \operation
+/// \code{.operation}
-/// \endoperation
+/// \endcode
 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
@@ -205,14 +205,14 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 ///
|
///
|
||||||
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
|
||||||
///
|
///
|
||||||
/// \operation
|
/// \code{.operation}
|
||||||
/// FOR j := 0 to 3
|
/// FOR j := 0 to 3
|
||||||
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
|
||||||
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
|
||||||
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
|
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
|
||||||
/// ENDFOR
|
/// ENDFOR
|
||||||
/// DST[MAX:128] := 0
|
/// DST[MAX:128] := 0
|
||||||
/// \endoperation
|
/// \endcode
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
static __inline__ __m128i __DEFAULT_FN_ATTRS128
|
||||||
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
|
||||||
{
|
{
|
||||||
|
|||||||
8  lib/include/bmiintrin.h  vendored

@@ -47,6 +47,7 @@ __tzcnt_u16(unsigned short __X)
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 32-bit integer containing the number of trailing zero
/// bits in the operand.
+/// \see _mm_tzcnt_32
static __inline__ unsigned int __RELAXED_FN_ATTRS
__tzcnt_u32(unsigned int __X)
{
@@ -63,10 +64,11 @@ __tzcnt_u32(unsigned int __X)
/// An unsigned 32-bit integer whose trailing zeros are to be counted.
/// \returns An 32-bit integer containing the number of trailing zero bits in
/// the operand.
+/// \see __tzcnt_u32
static __inline__ int __RELAXED_FN_ATTRS
_mm_tzcnt_32(unsigned int __X)
{
-return __builtin_ia32_tzcnt_u32(__X);
+return (int)__builtin_ia32_tzcnt_u32(__X);
}

#define _tzcnt_u32(a) (__tzcnt_u32((a)))
@@ -83,6 +85,7 @@ _mm_tzcnt_32(unsigned int __X)
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An unsigned 64-bit integer containing the number of trailing zero
/// bits in the operand.
+/// \see _mm_tzcnt_64
static __inline__ unsigned long long __RELAXED_FN_ATTRS
__tzcnt_u64(unsigned long long __X)
{
@@ -99,10 +102,11 @@ __tzcnt_u64(unsigned long long __X)
/// An unsigned 64-bit integer whose trailing zeros are to be counted.
/// \returns An 64-bit integer containing the number of trailing zero bits in
/// the operand.
+/// \see __tzcnt_u64
static __inline__ long long __RELAXED_FN_ATTRS
_mm_tzcnt_64(unsigned long long __X)
{
-return __builtin_ia32_tzcnt_u64(__X);
+return (long long)__builtin_ia32_tzcnt_u64(__X);
}

#define _tzcnt_u64(a) (__tzcnt_u64((a)))
18  lib/include/cetintrin.h  vendored

@@ -19,7 +19,7 @@
__attribute__((__always_inline__, __nodebug__, __target__("shstk")))

static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
-__builtin_ia32_incsspd(__a);
+__builtin_ia32_incsspd((unsigned int)__a);
}

#ifdef __x86_64__
@@ -34,7 +34,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
}
#else /* __x86_64__ */
static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
-__builtin_ia32_incsspd((int)__a);
+__builtin_ia32_incsspd(__a);
}
#endif /* __x86_64__ */

@@ -42,9 +42,12 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}

-static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wuninitialized"
unsigned int t;
return __builtin_ia32_rdsspd(t);
+#pragma clang diagnostic pop
}

#ifdef __x86_64__
@@ -52,9 +55,12 @@ static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long lo
return __builtin_ia32_rdsspq(__a);
}

-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wuninitialized"
unsigned long long t;
return __builtin_ia32_rdsspq(t);
+#pragma clang diagnostic pop
}
#endif /* __x86_64__ */

@@ -68,7 +74,7 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
}
#endif /* __x86_64__ */

-static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
+static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp(void) {
__builtin_ia32_saveprevssp();
}

@@ -96,7 +102,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void *
}
#endif /* __x86_64__ */

-static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
+static __inline__ void __DEFAULT_FN_ATTRS _setssbsy(void) {
__builtin_ia32_setssbsy();
}

1286  lib/include/emmintrin.h  vendored
File diff suppressed because it is too large

6  lib/include/f16cintrin.h  vendored

@@ -65,9 +65,9 @@ _cvtsh_ss(unsigned short __a)
/// 011: Truncate \n
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm) \
-((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-(imm)))[0]))
+#define _cvtss_sh(a, imm) __extension__ ({ \
+(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+(imm)))[0]); })

/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
15  lib/include/hlsl.h  vendored  Normal file

@@ -0,0 +1,15 @@
+//===----- hlsl.h - HLSL definitions --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HLSL_H_
+#define _HLSL_H_
+
+#include "hlsl/hlsl_basic_types.h"
+#include "hlsl/hlsl_intrinsics.h"
+
+#endif //_HLSL_H_

64  lib/include/hlsl_basic_types.h  vendored  Normal file

@@ -0,0 +1,64 @@
+//===----- hlsl_basic_types.h - HLSL definitions for basic types ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HLSL_HLSL_BASIC_TYPES_H_
+#define _HLSL_HLSL_BASIC_TYPES_H_
+
+// built-in scalar data types:
+
+#ifdef __HLSL_ENABLE_16_BIT
+// 16-bit integer.
+typedef unsigned short uint16_t;
+typedef short int16_t;
+#endif
+
+// unsigned 32-bit integer.
+typedef unsigned int uint;
+
+// 64-bit integer.
+typedef unsigned long uint64_t;
+typedef long int64_t;
+
+// built-in vector data types:
+
+#ifdef __HLSL_ENABLE_16_BIT
+typedef vector<int16_t, 2> int16_t2;
+typedef vector<int16_t, 3> int16_t3;
+typedef vector<int16_t, 4> int16_t4;
+typedef vector<uint16_t, 2> uint16_t2;
+typedef vector<uint16_t, 3> uint16_t3;
+typedef vector<uint16_t, 4> uint16_t4;
+#endif
+
+typedef vector<int, 2> int2;
+typedef vector<int, 3> int3;
+typedef vector<int, 4> int4;
+typedef vector<uint, 2> uint2;
+typedef vector<uint, 3> uint3;
+typedef vector<uint, 4> uint4;
+typedef vector<int64_t, 2> int64_t2;
+typedef vector<int64_t, 3> int64_t3;
+typedef vector<int64_t, 4> int64_t4;
+typedef vector<uint64_t, 2> uint64_t2;
+typedef vector<uint64_t, 3> uint64_t3;
+typedef vector<uint64_t, 4> uint64_t4;
+
+#ifdef __HLSL_ENABLE_16_BIT
+typedef vector<half, 2> half2;
+typedef vector<half, 3> half3;
+typedef vector<half, 4> half4;
+#endif
+
+typedef vector<float, 2> float2;
+typedef vector<float, 3> float3;
+typedef vector<float, 4> float4;
+typedef vector<double, 2> double2;
+typedef vector<double, 3> double3;
+typedef vector<double, 4> double4;
+
+#endif //_HLSL_HLSL_BASIC_TYPES_H_

15  lib/include/hlsl_intrinsics.h  vendored  Normal file

@@ -0,0 +1,15 @@
+//===----- hlsl_intrinsics.h - HLSL definitions for intrinsics ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _HLSL_HLSL_INTRINSICS_H_
+#define _HLSL_HLSL_INTRINSICS_H_
+
+__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
+WaveActiveCountBits(bool bBit);
+
+#endif //_HLSL_HLSL_INTRINSICS_H_
4  lib/include/hresetintrin.h  vendored

@@ -25,7 +25,7 @@
///
/// This intrinsic corresponds to the <c> HRESET </c> instruction.
///
-/// \operation
+/// \code{.operation}
/// IF __eax == 0
/// // nop
/// ELSE
@@ -35,7 +35,7 @@
/// FI
/// ENDFOR
/// FI
-/// \endoperation
+/// \endcode
static __inline void __DEFAULT_FN_ATTRS
_hreset(int __eax)
{
22  lib/include/ia32intrin.h  vendored

@@ -40,7 +40,7 @@
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfd(int __A) {
-return __builtin_ctz(__A);
+return __builtin_ctz((unsigned int)__A);
}

/** Find the first set bit starting from the msb. Result is undefined if
@@ -57,7 +57,7 @@ __bsfd(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrd(int __A) {
-return 31 - __builtin_clz(__A);
+return 31 - __builtin_clz((unsigned int)__A);
}

/** Swaps the bytes in the input. Converting little endian to big endian or
@@ -73,12 +73,12 @@ __bsrd(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapd(int __A) {
-return __builtin_bswap32(__A);
+return (int)__builtin_bswap32((unsigned int)__A);
}

static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
_bswap(int __A) {
-return __builtin_bswap32(__A);
+return (int)__builtin_bswap32((unsigned int)__A);
}

#define _bit_scan_forward(A) __bsfd((A))
@@ -99,7 +99,7 @@ _bswap(int __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsfq(long long __A) {
-return __builtin_ctzll(__A);
+return (long long)__builtin_ctzll((unsigned long long)__A);
}

/** Find the first set bit starting from the msb. Result is undefined if
@@ -116,7 +116,7 @@ __bsfq(long long __A) {
*/
static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
__bsrq(long long __A) {
-return 63 - __builtin_clzll(__A);
+return 63 - __builtin_clzll((unsigned long long)__A);
}

/** Swaps the bytes in the input. Converting little endian to big endian or
@@ -132,7 +132,7 @@ __bsrq(long long __A) {
*/
static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
__bswapq(long long __A) {
-return __builtin_bswap64(__A);
+return (long long)__builtin_bswap64((unsigned long long)__A);
}

#define _bswap64(A) __bswapq((A))
@@ -395,23 +395,23 @@ __rorw(unsigned short __X, int __C) {

static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rold(unsigned int __X, int __C) {
-return __builtin_rotateleft32(__X, __C);
+return __builtin_rotateleft32(__X, (unsigned int)__C);
}

static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
__rord(unsigned int __X, int __C) {
-return __builtin_rotateright32(__X, __C);
+return __builtin_rotateright32(__X, (unsigned int)__C);
}

#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rolq(unsigned long long __X, int __C) {
-return __builtin_rotateleft64(__X, __C);
+return __builtin_rotateleft64(__X, (unsigned long long)__C);
}

static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
__rorq(unsigned long long __X, int __C) {
-return __builtin_rotateright64(__X, __C);
+return __builtin_rotateright64(__X, (unsigned long long)__C);
}
#endif /* __x86_64__ */

30  lib/include/immintrin.h  vendored

@@ -276,20 +276,20 @@ _rdpid_u32(void) {
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand16_step(unsigned short *__p)
{
-return __builtin_ia32_rdrand16_step(__p);
+return (int)__builtin_ia32_rdrand16_step(__p);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand32_step(unsigned int *__p)
{
-return __builtin_ia32_rdrand32_step(__p);
+return (int)__builtin_ia32_rdrand32_step(__p);
}

#ifdef __x86_64__
static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
_rdrand64_step(unsigned long long *__p)
{
-return __builtin_ia32_rdrand64_step(__p);
+return (int)__builtin_ia32_rdrand64_step(__p);
}
#endif
#endif /* __RDRND__ */
@@ -360,50 +360,50 @@ _writegsbase_u64(unsigned long long __V)
static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i16(void const * __P) {
struct __loadu_i16 {
-short __v;
+unsigned short __v;
} __attribute__((__packed__, __may_alias__));
-return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
+return (short)__builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i16(void * __P, short __D) {
struct __storeu_i16 {
-short __v;
+unsigned short __v;
} __attribute__((__packed__, __may_alias__));
-((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D);
+((struct __storeu_i16*)__P)->__v = __builtin_bswap16((unsigned short)__D);
}

static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i32(void const * __P) {
struct __loadu_i32 {
-int __v;
+unsigned int __v;
} __attribute__((__packed__, __may_alias__));
-return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
+return (int)__builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i32(void * __P, int __D) {
struct __storeu_i32 {
-int __v;
+unsigned int __v;
} __attribute__((__packed__, __may_alias__));
-((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D);
+((struct __storeu_i32*)__P)->__v = __builtin_bswap32((unsigned int)__D);
}

#ifdef __x86_64__
static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_loadbe_i64(void const * __P) {
struct __loadu_i64 {
-long long __v;
+unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
-return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
+return (long long)__builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
}

static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
_storebe_i64(void * __P, long long __D) {
struct __storeu_i64 {
-long long __v;
+unsigned long long __v;
} __attribute__((__packed__, __may_alias__));
-((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D);
+((struct __storeu_i64*)__P)->__v = __builtin_bswap64((unsigned long long)__D);
}
#endif
#endif /* __MOVBE */
33  lib/include/intrin.h  vendored

@@ -534,27 +534,6 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|* Misc
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
-#if defined(__i386__)
-#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
-__asm("cpuid" \
-: "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx) \
-: "0"(__leaf), "2"(__count))
-#else
-/* x86-64 uses %rbx as the base register, so preserve it. */
-#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
-__asm("xchg{q} {%%rbx, %q1|%q1, rbx}\n" \
-"cpuid\n" \
-"xchg{q} {%%rbx, %q1|%q1, rbx}" \
-: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
-: "0"(__leaf), "2"(__count))
-#endif
-static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
-__cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
-}
-static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
-int __ecx) {
-__cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
-}
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");
}
@@ -581,6 +560,18 @@ unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val);

__int64 __mulh(__int64 __a, __int64 __b);
unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b);
+
+void __break(int);
+
+void __writex18byte(unsigned long offset, unsigned char data);
+void __writex18word(unsigned long offset, unsigned short data);
+void __writex18dword(unsigned long offset, unsigned long data);
+void __writex18qword(unsigned long offset, unsigned __int64 data);
+
+unsigned char __readx18byte(unsigned long offset);
+unsigned short __readx18word(unsigned long offset);
+unsigned long __readx18dword(unsigned long offset);
+unsigned __int64 __readx18qword(unsigned long offset);
#endif

/*----------------------------------------------------------------------------*\
44  lib/include/keylockerintrin.h  vendored

@@ -46,7 +46,7 @@
///
/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode)
/// GP (0)
/// FI
@@ -91,7 +91,7 @@
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ void __DEFAULT_FN_ATTRS
_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
__m128i __enkey_lo, __m128i __enkey_hi) {
@@ -106,7 +106,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
///
/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// InputKey[127:0] := __key[127:0]
/// KeyMetadata[2:0] := __htype[2:0]
/// KeyMetadata[23:3] := 0 // Reserved for future usage
@@ -126,7 +126,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
@@ -141,7 +141,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
///
/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// InputKey[127:0] := __key_lo[127:0]
/// InputKey[255:128] := __key_hi[255:128]
/// KeyMetadata[2:0] := __htype[2:0]
@@ -163,7 +163,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
void *__h) {
@@ -179,7 +179,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
///
/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -202,7 +202,7 @@ _mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -216,7 +216,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -241,7 +241,7 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -255,7 +255,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -280,7 +280,7 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -294,7 +294,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -319,7 +319,7 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
@@ -346,7 +346,7 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
///
/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -377,7 +377,7 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
@@ -392,7 +392,7 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -423,7 +423,7 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
@@ -438,7 +438,7 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[383:0] := MEM[__h+383:__h]
/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -469,7 +469,7 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
@@ -484,7 +484,7 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
///
/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
///
-/// \operation
+/// \code{.operation}
/// Handle[511:0] := MEM[__h+511:__h]
/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) ||
/// (Handle[127:0] AND (CPL > 0)) ||
@@ -515,7 +515,7 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// AF := 0
/// PF := 0
/// CF := 0
-/// \endoperation
+/// \endcode
static __inline__ unsigned char __DEFAULT_FN_ATTRS
_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
6  lib/include/mm_malloc.h  vendored

@@ -28,9 +28,9 @@ extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size

#if !(defined(_WIN32) && defined(_mm_malloc))
static __inline__ void *__attribute__((__always_inline__, __nodebug__,
-__malloc__))
-_mm_malloc(size_t __size, size_t __align)
-{
+__malloc__, __alloc_size__(1),
+__alloc_align__(2)))
+_mm_malloc(size_t __size, size_t __align) {
if (__align == 1) {
return malloc(__size);
}
10  lib/include/opencl-c-base.h  vendored

@@ -21,6 +21,7 @@
#define cl_khr_subgroup_shuffle 1
#define cl_khr_subgroup_shuffle_relative 1
#define cl_khr_subgroup_clustered_reduce 1
+#define cl_khr_subgroup_rotate 1
#define cl_khr_extended_bit_ops 1
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
@@ -67,6 +68,7 @@
#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
// For the SPIR and SPIR-V target all features are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
+#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
@@ -80,6 +82,11 @@
#define __opencl_c_named_address_space_builtins 1
#endif // !defined(__opencl_c_generic_address_space)

+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
+// Internal feature macro to provide subgroup builtins.
+#define __opencl_subgroup_builtins 1
+#endif
+
// built-in scalar data types:

/**
@@ -197,6 +204,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif

+// An internal alias for half, for use by OpenCLBuiltins.td.
+#define __half half
+
#if defined(__OPENCL_CPP_VERSION__)
#define NULL nullptr
#elif defined(__OPENCL_C_VERSION__)

13494  lib/include/opencl-c.h  vendored
File diff suppressed because it is too large
2  lib/include/pmmintrin.h  vendored

@@ -35,7 +35,7 @@
/// A pointer to a 128-bit integer vector containing integer values.
/// \returns A 128-bit vector containing the moved values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_lddqu_si128(__m128i const *__p)
+_mm_lddqu_si128(__m128i_u const *__p)
{
return (__m128i)__builtin_ia32_lddqu((char const *)__p);
}
134  lib/include/ppc_wrappers/bmi2intrin.h  vendored  Normal file

@@ -0,0 +1,134 @@
+/*===---- bmiintrin.h - Implementation of BMI2 intrinsics on PowerPC -------===
+*
+* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+* See https://llvm.org/LICENSE.txt for license information.
+* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+*
+*===-----------------------------------------------------------------------===
+*/
+
+#if !defined X86GPRINTRIN_H_
+#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef BMI2INTRIN_H_
+#define BMI2INTRIN_H_
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32(unsigned int __X, unsigned int __Y) {
+return ((__X << (32 - __Y)) >> (32 - __Y));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
+unsigned long long __res = (unsigned long long)__X * __Y;
+*__P = (unsigned int)(__res >> 32);
+return (unsigned int)__res;
+}
+
+#ifdef __PPC64__
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
+return ((__X << (64 - __Y)) >> (64 - __Y));
+}
+
+/* __int128 requires base 64-bit. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64(unsigned long long __X, unsigned long long __Y,
+unsigned long long *__P) {
+unsigned __int128 __res = (unsigned __int128)__X * __Y;
+*__P = (unsigned long long)(__res >> 64);
+return (unsigned long long)__res;
+}
+
+#ifdef _ARCH_PWR7
+/* popcount and bpermd require power7 minimum. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64(unsigned long long __X, unsigned long long __M) {
+unsigned long __result = 0x0UL;
+const unsigned long __mask = 0x8000000000000000UL;
+unsigned long __m = __M;
+unsigned long __c, __t;
+unsigned long __p;
+
+/* The pop-count of the mask gives the number of the bits from
+source to process. This is also needed to shift bits from the
+source into the correct position for the result. */
+__p = 64 - __builtin_popcountl(__M);
+
+/* The loop is for the number of '1' bits in the mask and clearing
+each mask bit as it is processed. */
+while (__m != 0) {
+__c = __builtin_clzl(__m);
+__t = __X << (__p - __c);
+__m ^= (__mask >> __c);
+__result |= (__t & (__mask >> __c));
+__p++;
+}
+return __result;
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64(unsigned long long __X, unsigned long long __M) {
+unsigned long __p = 0x4040404040404040UL; // initial bit permute control
+const unsigned long __mask = 0x8000000000000000UL;
+unsigned long __m = __M;
+unsigned long __c;
+unsigned long __result;
+
+/* if the mask is constant and selects 8 bits or less we can use
+the Power8 Bit permute instruction. */
+if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
+/* Also if the pext mask is constant, then the popcount is
+constant, we can evaluate the following loop at compile
+time and use a constant bit permute vector. */
+long __i;
+for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
+__c = __builtin_clzl(__m);
+__p = (__p << 8) | __c;
+__m ^= (__mask >> __c);
+}
+__result = __builtin_bpermd(__p, __X);
+} else {
+__p = 64 - __builtin_popcountl(__M);
+__result = 0;
+/* We could a use a for loop here, but that combined with
+-funroll-loops can expand to a lot of code. The while
+loop avoids unrolling and the compiler commons the xor
+from clearing the mask bit with the (m != 0) test. The
+result is a more compact loop setup and body. */
+while (__m != 0) {
+unsigned long __t;
+__c = __builtin_clzl(__m);
+__t = (__X & (__mask >> __c)) >> (__p - __c);
+__m ^= (__mask >> __c);
+__result |= (__t);
+__p++;
+}
+}
+return __result;
+}
+
+/* these 32-bit implementations depend on 64-bit pdep/pext
+which depend on _ARCH_PWR7. */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32(unsigned int __X, unsigned int __Y) {
+return _pdep_u64(__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32(unsigned int __X, unsigned int __Y) {
+return _pext_u64(__X, __Y);
+}
+#endif /* _ARCH_PWR7 */
+#endif /* __PPC64__ */
+
+#endif /* BMI2INTRIN_H_ */
165
lib/include/ppc_wrappers/bmiintrin.h
vendored
Normal file
165
lib/include/ppc_wrappers/bmiintrin.h
vendored
Normal file
@ -0,0 +1,165 @@
/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined X86GPRINTRIN_H_
#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef BMIINTRIN_H_
#define BMIINTRIN_H_

extern __inline unsigned short
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u16(unsigned short __X) {
  return __builtin_ctz(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u32(unsigned int __X, unsigned int __Y) {
  return (~__X & __Y);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
  return ((__X << (32 - (__L + __P))) >> (32 - __L));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u32(unsigned int __X, unsigned int __Y) {
  unsigned int __P, __L;
  __P = __Y & 0xFF;
  __L = (__Y >> 8) & 0xFF;
  return (_bextr_u32(__X, __P, __L));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u32(unsigned int __X) {
  return (__X & -__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u32(unsigned int __X) {
  return __blsi_u32(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u32(unsigned int __X) {
  return (__X ^ (__X - 1));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u32(unsigned int __X) {
  return __blsmsk_u32(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u32(unsigned int __X) {
  return (__X & (__X - 1));
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u32(unsigned int __X) {
  return __blsr_u32(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u32(unsigned int __X) {
  return __builtin_ctz(__X);
}

extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u32(unsigned int __X) {
  return __builtin_ctz(__X);
}

/* use the 64-bit shift, rotate, and count leading zeros instructions
   for long long. */
#ifdef __PPC64__
extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __andn_u64(unsigned long long __X, unsigned long long __Y) {
  return (~__X & __Y);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
  return ((__X << (64 - (__L + __P))) >> (64 - __L));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __bextr_u64(unsigned long long __X, unsigned long long __Y) {
  unsigned int __P, __L;
  __P = __Y & 0xFF;
  __L = (__Y & 0xFF00) >> 8;
  return (_bextr_u64(__X, __P, __L));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsi_u64(unsigned long long __X) {
  return __X & -__X;
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsi_u64(unsigned long long __X) {
  return __blsi_u64(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsmsk_u64(unsigned long long __X) {
  return (__X ^ (__X - 1));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsmsk_u64(unsigned long long __X) {
  return __blsmsk_u64(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __blsr_u64(unsigned long long __X) {
  return (__X & (__X - 1));
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _blsr_u64(unsigned long long __X) {
  return __blsr_u64(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    __tzcnt_u64(unsigned long long __X) {
  return __builtin_ctzll(__X);
}

extern __inline unsigned long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _tzcnt_u64(unsigned long long __X) {
  return __builtin_ctzll(__X);
}
#endif /* __PPC64__ */

#endif /* BMIINTRIN_H_ */
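As a usage note (an illustrative sketch, not from the header; test values are assumptions): _bextr_u32(__X, __P, __L) extracts the __L-bit field of __X starting at bit __P, which matches the plain shift-and-mask form when 0 < __P + __L <= 32.

/* Illustrative check of the bit-field extraction formula above:
   both expressions extract 5 bits starting at bit 8 and yield 0x16 here. */
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint32_t x = 0x12345678u;
  uint32_t p = 8, l = 5;
  uint32_t via_shift = ((x << (32 - (l + p))) >> (32 - l)); /* header formula */
  uint32_t via_mask = (x >> p) & ((1u << l) - 1);           /* equivalent form */
  assert(via_shift == via_mask);
  return 0;
}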
2841
lib/include/ppc_wrappers/emmintrin.h
vendored
2841
lib/include/ppc_wrappers/emmintrin.h
vendored
File diff suppressed because it is too large
27
lib/include/ppc_wrappers/immintrin.h
vendored
Normal file
27
lib/include/ppc_wrappers/immintrin.h
vendored
Normal file
@ -0,0 +1,27 @@
/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef IMMINTRIN_H_
#define IMMINTRIN_H_

#include <x86gprintrin.h>

#include <mmintrin.h>

#include <xmmintrin.h>

#include <emmintrin.h>

#include <pmmintrin.h>

#include <tmmintrin.h>

#include <smmintrin.h>

#endif /* IMMINTRIN_H_ */
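A hedged usage sketch (file name and compile invocation are assumptions, not from this commit): on a powerpc64le target these wrapper headers are reached through the usual x86 umbrella include, with the compatibility warning silenced by defining NO_WARN_X86_INTRINSICS, for example clang --target=powerpc64le-linux-gnu -DNO_WARN_X86_INTRINSICS -O2 demo.c.

/* demo.c -- illustrative only; exercises a few wrapped SSE intrinsics. */
#include <immintrin.h>

int main(void) {
  __m128i a = _mm_set1_epi32(7);
  __m128i b = _mm_add_epi32(a, a);
  return _mm_extract_epi32(b, 0) == 14 ? 0 : 1;
}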
29
lib/include/ppc_wrappers/mm_malloc.h
vendored
29
lib/include/ppc_wrappers/mm_malloc.h
vendored
@ -10,38 +10,33 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#if defined(__ppc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <stdlib.h>

/* We can't depend on <stdlib.h> since the prototype of posix_memalign
   may not be visible. */
#ifndef __cplusplus
extern int posix_memalign (void **, size_t, size_t);
extern int posix_memalign(void **, size_t, size_t);
#else
extern "C" int posix_memalign (void **, size_t, size_t);
extern "C" int posix_memalign(void **, size_t, size_t);
#endif

static __inline void *
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
_mm_malloc (size_t size, size_t alignment)
{
  /* PowerPC64 ELF V2 ABI requires quadword alignment. */
  size_t vec_align = sizeof (__vector float);
  size_t __vec_align = sizeof(__vector float);
  void *ptr;
  void *__ptr;

  if (alignment < vec_align)
  if (__alignment < __vec_align)
    alignment = vec_align;
    __alignment = __vec_align;
  if (posix_memalign (&ptr, alignment, size) == 0)
  if (posix_memalign(&__ptr, __alignment, __size) == 0)
    return ptr;
    return __ptr;
  else
    return NULL;
}

static __inline void
static __inline void _mm_free(void *__ptr) { free(__ptr); }
_mm_free (void * ptr)
{
  free (ptr);
}

#else
#include_next <mm_malloc.h>
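A short usage sketch of the allocator pair above (buffer size and alignment are illustrative): the requested alignment is rounded up to at least quadword alignment, and memory obtained from _mm_malloc must be released with _mm_free.

/* Illustrative use of _mm_malloc/_mm_free: 64 floats, 16-byte aligned. */
#include <mm_malloc.h>
#include <stdint.h>

int main(void) {
  float *buf = (float *)_mm_malloc(64 * sizeof(float), 16);
  if (!buf)
    return 1;
  int misaligned = ((uintptr_t)buf % 16) != 0; /* expect 0 */
  _mm_free(buf);
  return misaligned;
}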
774
lib/include/ppc_wrappers/mmintrin.h
vendored
774
lib/include/ppc_wrappers/mmintrin.h
vendored
File diff suppressed because it is too large
162
lib/include/ppc_wrappers/pmmintrin.h
vendored
162
lib/include/ppc_wrappers/pmmintrin.h
vendored
@ -32,120 +32,114 @@
|
|||||||
In the specific case of the monitor and mwait instructions there are
|
In the specific case of the monitor and mwait instructions there are
|
||||||
no direct equivalent in the PowerISA at this time. So those
|
no direct equivalent in the PowerISA at this time. So those
|
||||||
intrinsics are not implemented. */
|
intrinsics are not implemented. */
|
||||||
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
|
#error \
|
||||||
|
"Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef PMMINTRIN_H_
|
#ifndef PMMINTRIN_H_
|
||||||
#define PMMINTRIN_H_
|
#define PMMINTRIN_H_
|
||||||
|
|
||||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
#if defined(__ppc64__) && \
|
||||||
|
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
|
||||||
|
|
||||||
/* We need definitions from the SSE2 and SSE header files*/
|
/* We need definitions from the SSE2 and SSE header files*/
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128
|
||||||
_mm_addsub_ps (__m128 __X, __m128 __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_addsub_ps(__m128 __X, __m128 __Y) {
|
||||||
const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
|
const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
|
||||||
__v4sf even_neg_Y = vec_xor(__Y, even_n0);
|
__v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
|
||||||
return (__m128) vec_add (__X, even_neg_Y);
|
return (__m128)vec_add(__X, __even_neg_Y);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_addsub_pd (__m128d __X, __m128d __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_addsub_pd(__m128d __X, __m128d __Y) {
|
||||||
const __v2df even_n0 = {-0.0, 0.0};
|
const __v2df __even_n0 = {-0.0, 0.0};
|
||||||
__v2df even_neg_Y = vec_xor(__Y, even_n0);
|
__v2df __even_neg_Y = vec_xor(__Y, __even_n0);
|
||||||
return (__m128d) vec_add (__X, even_neg_Y);
|
return (__m128d)vec_add(__X, __even_neg_Y);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128
|
||||||
_mm_hadd_ps (__m128 __X, __m128 __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hadd_ps(__m128 __X, __m128 __Y) {
|
||||||
__vector unsigned char xform2 = {
|
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
|
||||||
0x00, 0x01, 0x02, 0x03,
|
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
|
||||||
0x08, 0x09, 0x0A, 0x0B,
|
0x18, 0x19, 0x1A, 0x1B};
|
||||||
0x10, 0x11, 0x12, 0x13,
|
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
|
||||||
0x18, 0x19, 0x1A, 0x1B
|
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
|
||||||
};
|
0x1C, 0x1D, 0x1E, 0x1F};
|
||||||
__vector unsigned char xform1 = {
|
return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
|
||||||
0x04, 0x05, 0x06, 0x07,
|
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
|
||||||
0x0C, 0x0D, 0x0E, 0x0F,
|
|
||||||
0x14, 0x15, 0x16, 0x17,
|
|
||||||
0x1C, 0x1D, 0x1E, 0x1F
|
|
||||||
};
|
|
||||||
return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
|
|
||||||
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128
|
||||||
_mm_hsub_ps (__m128 __X, __m128 __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hsub_ps(__m128 __X, __m128 __Y) {
|
||||||
__vector unsigned char xform2 = {
|
__vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
|
||||||
0x00, 0x01, 0x02, 0x03,
|
0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
|
||||||
0x08, 0x09, 0x0A, 0x0B,
|
0x18, 0x19, 0x1A, 0x1B};
|
||||||
0x10, 0x11, 0x12, 0x13,
|
__vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
|
||||||
0x18, 0x19, 0x1A, 0x1B
|
0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
|
||||||
};
|
0x1C, 0x1D, 0x1E, 0x1F};
|
||||||
__vector unsigned char xform1 = {
|
return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
|
||||||
0x04, 0x05, 0x06, 0x07,
|
vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
|
||||||
0x0C, 0x0D, 0x0E, 0x0F,
|
|
||||||
0x14, 0x15, 0x16, 0x17,
|
|
||||||
0x1C, 0x1D, 0x1E, 0x1F
|
|
||||||
};
|
|
||||||
return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
|
|
||||||
vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_hadd_pd (__m128d __X, __m128d __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hadd_pd(__m128d __X, __m128d __Y) {
|
||||||
return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y),
|
return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
|
||||||
vec_mergel ((__v2df) __X, (__v2df)__Y));
|
vec_mergel((__v2df)__X, (__v2df)__Y));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_hsub_pd (__m128d __X, __m128d __Y)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_hsub_pd(__m128d __X, __m128d __Y) {
|
||||||
return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y),
|
return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
|
||||||
vec_mergel ((__v2df) __X, (__v2df)__Y));
|
vec_mergel((__v2df)__X, (__v2df)__Y));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
#ifdef _ARCH_PWR8
|
||||||
_mm_movehdup_ps (__m128 __X)
|
extern __inline __m128
|
||||||
{
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
|
_mm_movehdup_ps(__m128 __X) {
|
||||||
|
return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_moveldup_ps(__m128 __X) {
|
||||||
|
return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_loaddup_pd(double const *__P) {
|
||||||
|
return (__m128d)vec_splats(*__P);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128d
|
||||||
_mm_moveldup_ps (__m128 __X)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_movedup_pd(__m128d __X) {
|
||||||
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
|
return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
extern __inline __m128i
|
||||||
_mm_loaddup_pd (double const *__P)
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
{
|
_mm_lddqu_si128(__m128i const *__P) {
|
||||||
return (__m128d) vec_splats (*__P);
|
return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
|
||||||
}
|
|
||||||
|
|
||||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
||||||
_mm_movedup_pd (__m128d __X)
|
|
||||||
{
|
|
||||||
return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
|
|
||||||
}
|
|
||||||
|
|
||||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|
||||||
_mm_lddqu_si128 (__m128i const *__P)
|
|
||||||
{
|
|
||||||
return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
|
/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#include_next <pmmintrin.h>
|
#include_next <pmmintrin.h>
|
||||||
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
#endif /* defined(__ppc64__) &&
|
||||||
*/
|
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
|
||||||
|
|
||||||
#endif /* PMMINTRIN_H_ */
|
#endif /* PMMINTRIN_H_ */
|
||||||
|
|||||||
587
lib/include/ppc_wrappers/smmintrin.h
vendored
587
lib/include/ppc_wrappers/smmintrin.h
vendored
@ -29,11 +29,254 @@
|
|||||||
#ifndef SMMINTRIN_H_
|
#ifndef SMMINTRIN_H_
|
||||||
#define SMMINTRIN_H_
|
#define SMMINTRIN_H_
|
||||||
|
|
||||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
#if defined(__ppc64__) && \
|
||||||
|
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
|
||||||
|
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
#include <tmmintrin.h>
|
#include <tmmintrin.h>
|
||||||
|
|
||||||
|
/* Rounding mode macros. */
|
||||||
|
#define _MM_FROUND_TO_NEAREST_INT 0x00
|
||||||
|
#define _MM_FROUND_TO_ZERO 0x01
|
||||||
|
#define _MM_FROUND_TO_POS_INF 0x02
|
||||||
|
#define _MM_FROUND_TO_NEG_INF 0x03
|
||||||
|
#define _MM_FROUND_CUR_DIRECTION 0x04
|
||||||
|
|
||||||
|
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
|
||||||
|
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
|
||||||
|
|
||||||
|
#define _MM_FROUND_RAISE_EXC 0x00
|
||||||
|
#define _MM_FROUND_NO_EXC 0x08
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_pd(__m128d __A, int __rounding) {
|
||||||
|
__v2df __r;
|
||||||
|
union {
|
||||||
|
double __fr;
|
||||||
|
long long __fpscr;
|
||||||
|
} __enables_save, __fpscr_save;
|
||||||
|
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Save enabled exceptions, disable all exceptions,
|
||||||
|
and preserve the rounding mode. */
|
||||||
|
#ifdef _ARCH_PWR9
|
||||||
|
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
#else
|
||||||
|
__fpscr_save.__fr = __builtin_mffs();
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
__fpscr_save.__fpscr &= ~0xf8;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
#endif
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (__rounding) {
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT:
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__attribute__((fallthrough));
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
||||||
|
__builtin_set_fpscr_rn(0b00);
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
|
||||||
|
__r = vec_rint((__v2df)__A);
|
||||||
|
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_NEG_INF:
|
||||||
|
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_floor((__v2df)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_POS_INF:
|
||||||
|
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_ceil((__v2df)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_ZERO:
|
||||||
|
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_trunc((__v2df)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_CUR_DIRECTION:
|
||||||
|
__r = vec_rint((__v2df)__A);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
/* Restore enabled exceptions. */
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
}
|
||||||
|
return (__m128d)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
|
||||||
|
__B = _mm_round_pd(__B, __rounding);
|
||||||
|
__v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
|
||||||
|
return (__m128d)__r;
|
||||||
|
}
|
||||||
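For orientation, a small sketch (inputs assumed, compiled like the earlier demo) of how the rounding-mode macros above compose: _MM_FROUND_TO_ZERO truncates toward zero, and _MM_FROUND_NO_EXC asks the implementation above to save and restore the FPSCR so no new exceptions are enabled while rounding.

/* Illustrative use of _mm_round_pd with the macros defined above. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_pd(-1.5, 2.7);
  __m128d t = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
  double out[2];
  _mm_storeu_pd(out, t);
  printf("%g %g\n", out[0], out[1]); /* prints: 2 -1 */
  return 0;
}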
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_ps(__m128 __A, int __rounding) {
|
||||||
|
__v4sf __r;
|
||||||
|
union {
|
||||||
|
double __fr;
|
||||||
|
long long __fpscr;
|
||||||
|
} __enables_save, __fpscr_save;
|
||||||
|
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Save enabled exceptions, disable all exceptions,
|
||||||
|
and preserve the rounding mode. */
|
||||||
|
#ifdef _ARCH_PWR9
|
||||||
|
__asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
#else
|
||||||
|
__fpscr_save.__fr = __builtin_mffs();
|
||||||
|
__enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
|
||||||
|
__fpscr_save.__fpscr &= ~0xf8;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
#endif
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (__rounding) {
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT:
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__attribute__((fallthrough));
|
||||||
|
case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
|
||||||
|
__builtin_set_fpscr_rn(0b00);
|
||||||
|
/* Insert an artificial "read/write" reference to the variable
|
||||||
|
read below, to ensure the compiler does not schedule
|
||||||
|
a read/use of the variable before the FPSCR is modified, above.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : "+wa"(__A));
|
||||||
|
|
||||||
|
__r = vec_rint((__v4sf)__A);
|
||||||
|
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
__builtin_set_fpscr_rn(__fpscr_save.__fpscr);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_NEG_INF:
|
||||||
|
case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_floor((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_POS_INF:
|
||||||
|
case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_ceil((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_TO_ZERO:
|
||||||
|
case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
|
||||||
|
__r = vec_trunc((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
case _MM_FROUND_CUR_DIRECTION:
|
||||||
|
__r = vec_rint((__v4sf)__A);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (__rounding & _MM_FROUND_NO_EXC) {
|
||||||
|
/* Insert an artificial "read" reference to the variable written
|
||||||
|
above, to ensure the compiler does not schedule the computation
|
||||||
|
of the value after the manipulation of the FPSCR, below.
|
||||||
|
This can be removed if and when GCC PR102783 is fixed.
|
||||||
|
*/
|
||||||
|
__asm__("" : : "wa"(__r));
|
||||||
|
/* Restore enabled exceptions. */
|
||||||
|
__fpscr_save.__fr = __builtin_mffsl();
|
||||||
|
__fpscr_save.__fpscr |= __enables_save.__fpscr;
|
||||||
|
__builtin_mtfsf(0b00000011, __fpscr_save.__fr);
|
||||||
|
}
|
||||||
|
return (__m128)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
|
||||||
|
__B = _mm_round_ps(__B, __rounding);
|
||||||
|
__v4sf __r = (__v4sf)__A;
|
||||||
|
__r[0] = ((__v4sf)__B)[0];
|
||||||
|
return (__m128)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
|
||||||
|
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
|
||||||
|
|
||||||
|
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
|
||||||
|
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)
|
||||||
|
|
||||||
|
#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
|
||||||
|
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
|
||||||
|
|
||||||
|
#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
|
||||||
|
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
||||||
|
__v16qi __result = (__v16qi)__A;
|
||||||
|
|
||||||
|
__result[__N & 0xf] = __D;
|
||||||
|
|
||||||
|
return (__m128i)__result;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
||||||
|
__v4si __result = (__v4si)__A;
|
||||||
|
|
||||||
|
__result[__N & 3] = __D;
|
||||||
|
|
||||||
|
return (__m128i)__result;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
||||||
|
__v2di __result = (__v2di)__A;
|
||||||
|
|
||||||
|
__result[__N & 1] = __D;
|
||||||
|
|
||||||
|
return (__m128i)__result;
|
||||||
|
}
|
||||||
|
|
||||||
extern __inline int
|
extern __inline int
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_extract_epi8(__m128i __X, const int __N) {
|
_mm_extract_epi8(__m128i __X, const int __N) {
|
||||||
@ -58,6 +301,7 @@ extern __inline int
|
|||||||
return ((__v4si)__X)[__N & 3];
|
return ((__v4si)__X)[__N & 3];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
|
_mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
|
||||||
@ -69,42 +313,351 @@ extern __inline __m128i
|
|||||||
#endif
|
#endif
|
||||||
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
|
return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
|
_mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
|
||||||
|
#else
|
||||||
const __v16qu __seven = vec_splats((unsigned char)0x07);
|
const __v16qu __seven = vec_splats((unsigned char)0x07);
|
||||||
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
|
__v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
|
||||||
return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
|
return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
|
||||||
|
__v16qu __pcv[] = {
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
};
|
||||||
|
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
|
||||||
|
return (__m128)__r;
|
||||||
|
}
|
||||||
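A brief illustration (inputs assumed) of the immediate encoding that the permute table above implements: bit i of __imm8 selects lane i from __B, while a clear bit keeps the lane from __A.

/* Illustrative check of the blend immediate: 0x5 takes lanes 0 and 2 from B. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);    /* lanes 0..3 = 0,1,2,3 */
  __m128 b = _mm_set_ps(30.0f, 20.0f, 10.0f, 0.5f); /* lanes 0..3 = .5,10,20,30 */
  __m128 r = _mm_blend_ps(a, b, 0x5);
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 0.5 1 20 3 */
  return 0;
}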
|
|
||||||
|
extern __inline __m128
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
|
||||||
|
#else
|
||||||
|
const __v4si __zero = {0};
|
||||||
|
const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
|
||||||
|
return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
|
||||||
|
__v16qu __pcv[] = {
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||||
|
{0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
|
||||||
|
{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
|
||||||
|
__v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
|
||||||
|
return (__m128d)__r;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128d
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
|
||||||
|
#ifdef _ARCH_PWR10
|
||||||
|
return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
|
||||||
|
#else
|
||||||
|
const __v2di __zero = {0};
|
||||||
|
const __vector __bool long long __boolmask =
|
||||||
|
vec_cmplt((__v2di)__mask, __zero);
|
||||||
|
return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline int
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_testz_si128(__m128i __A, __m128i __B) {
|
||||||
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline int
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_testc_si128(__m128i __A, __m128i __B) {
|
||||||
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
|
||||||
|
return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline int
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_testnzc_si128(__m128i __A, __m128i __B) {
|
||||||
|
/* Note: This implementation does NOT set "zero" or "carry" flags. */
|
||||||
|
return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
|
||||||
|
|
||||||
|
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
|
||||||
|
|
||||||
|
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_min_epi8(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
|
_mm_min_epu16(__m128i __X, __m128i __Y) {
|
||||||
__v16qi result = (__v16qi)__A;
|
return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
|
||||||
result[__N & 0xf] = __D;
|
|
||||||
return (__m128i)result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
|
_mm_min_epi32(__m128i __X, __m128i __Y) {
|
||||||
__v4si result = (__v4si)__A;
|
return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
|
||||||
result[__N & 3] = __D;
|
|
||||||
return (__m128i)result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
|
_mm_min_epu32(__m128i __X, __m128i __Y) {
|
||||||
__v2di result = (__v2di)__A;
|
return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
|
||||||
result[__N & 1] = __D;
|
|
||||||
return (__m128i)result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epi8(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epu16(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_max_epu32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_mullo_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_mul_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi8_epi16(__m128i __A) {
|
||||||
|
return (__m128i)vec_unpackh((__v16qi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi8_epi32(__m128i __A) {
|
||||||
|
__A = (__m128i)vec_unpackh((__v16qi)__A);
|
||||||
|
return (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi8_epi64(__m128i __A) {
|
||||||
|
__A = (__m128i)vec_unpackh((__v16qi)__A);
|
||||||
|
__A = (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
return (__m128i)vec_unpackh((__v4si)__A);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi16_epi32(__m128i __A) {
|
||||||
|
return (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi16_epi64(__m128i __A) {
|
||||||
|
__A = (__m128i)vec_unpackh((__v8hi)__A);
|
||||||
|
return (__m128i)vec_unpackh((__v4si)__A);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepi32_epi64(__m128i __A) {
|
||||||
|
return (__m128i)vec_unpackh((__v4si)__A);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu8_epi16(__m128i __A) {
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu8_epi32(__m128i __A) {
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu8_epi64(__m128i __A) {
|
||||||
|
const __v16qu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu16_epi32(__m128i __A) {
|
||||||
|
const __v8hu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu16_epi64(__m128i __A) {
|
||||||
|
const __v8hu __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cvtepu32_epi64(__m128i __A) {
|
||||||
|
const __v4su __zero = {0};
|
||||||
|
#ifdef __LITTLE_ENDIAN__
|
||||||
|
__A = (__m128i)vec_mergeh((__v4su)__A, __zero);
|
||||||
|
#else /* __BIG_ENDIAN__. */
|
||||||
|
__A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
|
||||||
|
#endif /* __BIG_ENDIAN__. */
|
||||||
|
return __A;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return horizontal packed word minimum and its index in bits [15:0]
|
||||||
|
and bits [18:16] respectively. */
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_minpos_epu16(__m128i __A) {
|
||||||
|
union __u {
|
||||||
|
__m128i __m;
|
||||||
|
__v8hu __uh;
|
||||||
|
};
|
||||||
|
union __u __u = {.__m = __A}, __r = {.__m = {0}};
|
||||||
|
unsigned short __ridx = 0;
|
||||||
|
unsigned short __rmin = __u.__uh[__ridx];
|
||||||
|
unsigned long __i;
|
||||||
|
for (__i = 1; __i < 8; __i++) {
|
||||||
|
if (__u.__uh[__i] < __rmin) {
|
||||||
|
__rmin = __u.__uh[__i];
|
||||||
|
__ridx = __i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__r.__uh[0] = __rmin;
|
||||||
|
__r.__uh[1] = __ridx;
|
||||||
|
return __r.__m;
|
||||||
|
}
|
||||||
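An illustrative sketch of the result layout produced above (inputs assumed): the minimum of the eight unsigned shorts lands in element 0 and its index in element 1, with the remaining elements zero.

/* Illustrative check of _mm_minpos_epu16: min value 3 sits at index 5. */
#include <smmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi16(9, 8, 7, 6, 5, 3, 4, 10);
  __m128i r = _mm_minpos_epu16(v);
  printf("min=%d idx=%d\n", _mm_extract_epi16(r, 0), _mm_extract_epi16(r, 1));
  /* expected: min=3 idx=5 */
  return 0;
}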
|
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_packus_epi32(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _ARCH_PWR8
|
||||||
|
extern __inline __m128i
|
||||||
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
|
_mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
|
||||||
|
return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#include_next <smmintrin.h>
|
#include_next <smmintrin.h>
|
||||||
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
|
#endif /* defined(__ppc64__) &&
|
||||||
*/
|
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
|
||||||
|
|
||||||
#endif /* _SMMINTRIN_H_ */
|
#endif /* SMMINTRIN_H_ */
|
||||||
|
|||||||
677
lib/include/ppc_wrappers/tmmintrin.h
vendored
677
lib/include/ppc_wrappers/tmmintrin.h
vendored
@ -25,7 +25,8 @@
|
|||||||
#ifndef TMMINTRIN_H_
|
#ifndef TMMINTRIN_H_
|
||||||
#define TMMINTRIN_H_
|
#define TMMINTRIN_H_
|
||||||
|
|
||||||
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
|
#if defined(__ppc64__) && \
|
||||||
|
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
|
||||||
|
|
||||||
#include <altivec.h>
|
#include <altivec.h>
|
||||||
|
|
||||||
@ -33,63 +34,55 @@
|
|||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_epi16 (__m128i __A)
|
_mm_abs_epi16(__m128i __A) {
|
||||||
{
|
return (__m128i)vec_abs((__v8hi)__A);
|
||||||
return (__m128i) vec_abs ((__v8hi) __A);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_epi32 (__m128i __A)
|
_mm_abs_epi32(__m128i __A) {
|
||||||
{
|
return (__m128i)vec_abs((__v4si)__A);
|
||||||
return (__m128i) vec_abs ((__v4si) __A);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_epi8 (__m128i __A)
|
_mm_abs_epi8(__m128i __A) {
|
||||||
{
|
return (__m128i)vec_abs((__v16qi)__A);
|
||||||
return (__m128i) vec_abs ((__v16qi) __A);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_pi16 (__m64 __A)
|
_mm_abs_pi16(__m64 __A) {
|
||||||
{
|
__v8hi __B = (__v8hi)(__v2du){__A, __A};
|
||||||
__v8hi __B = (__v8hi) (__v2du) { __A, __A };
|
return (__m64)((__v2du)vec_abs(__B))[0];
|
||||||
return (__m64) ((__v2du) vec_abs (__B))[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_pi32 (__m64 __A)
|
_mm_abs_pi32(__m64 __A) {
|
||||||
{
|
__v4si __B = (__v4si)(__v2du){__A, __A};
|
||||||
__v4si __B = (__v4si) (__v2du) { __A, __A };
|
return (__m64)((__v2du)vec_abs(__B))[0];
|
||||||
return (__m64) ((__v2du) vec_abs (__B))[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_abs_pi8 (__m64 __A)
|
_mm_abs_pi8(__m64 __A) {
|
||||||
{
|
__v16qi __B = (__v16qi)(__v2du){__A, __A};
|
||||||
__v16qi __B = (__v16qi) (__v2du) { __A, __A };
|
return (__m64)((__v2du)vec_abs(__B))[0];
|
||||||
return (__m64) ((__v2du) vec_abs (__B))[0];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
|
_mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
|
||||||
{
|
if (__builtin_constant_p(__count) && __count < 16) {
|
||||||
if (__builtin_constant_p (__count) && __count < 16)
|
|
||||||
{
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
__A = (__m128i) vec_reve ((__v16qu) __A);
|
__A = (__m128i)vec_reve((__v16qu)__A);
|
||||||
__B = (__m128i) vec_reve ((__v16qu) __B);
|
__B = (__m128i)vec_reve((__v16qu)__B);
|
||||||
#endif
|
#endif
|
||||||
__A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
|
__A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
__A = (__m128i) vec_reve ((__v16qu) __A);
|
__A = (__m128i)vec_reve((__v16qu)__A);
|
||||||
#endif
|
#endif
|
||||||
return __A;
|
return __A;
|
||||||
}
|
}
|
||||||
@ -97,400 +90,364 @@ _mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
|
|||||||
if (__count == 0)
|
if (__count == 0)
|
||||||
return __B;
|
return __B;
|
||||||
|
|
||||||
if (__count >= 16)
|
if (__count >= 16) {
|
||||||
{
|
if (__count >= 32) {
|
||||||
if (__count >= 32)
|
const __v16qu __zero = {0};
|
||||||
{
|
return (__m128i)__zero;
|
||||||
const __v16qu zero = { 0 };
|
} else {
|
||||||
return (__m128i) zero;
|
const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
const __v16qu __shift =
|
|
||||||
vec_splats ((unsigned char) ((__count - 16) * 8));
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
return (__m128i) vec_sro ((__v16qu) __A, __shift);
|
return (__m128i)vec_sro((__v16qu)__A, __shift);
|
||||||
#else
|
#else
|
||||||
return (__m128i) vec_slo ((__v16qu) __A, __shift);
|
return (__m128i)vec_slo((__v16qu)__A, __shift);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else
|
const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
|
||||||
{
|
const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
|
||||||
const __v16qu __shiftA =
|
|
||||||
vec_splats ((unsigned char) ((16 - __count) * 8));
|
|
||||||
const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
__A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
|
__A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
|
||||||
__B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
|
__B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
|
||||||
#else
|
#else
|
||||||
__A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
|
__A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
|
||||||
__B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
|
__B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
|
||||||
#endif
|
#endif
|
||||||
return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
|
return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
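A short illustration (inputs assumed) of the byte-alignment logic above: the 32-byte concatenation A:B is shifted right by __count bytes and the low 16 bytes are returned, with counts of 16 or more handled by the shift-only and all-zero paths.

/* Illustrative check of _mm_alignr_epi8 with __count = 4: the result is
   bytes 4..15 of B followed by bytes 0..3 of A. */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
                            24, 25, 26, 27, 28, 29, 30, 31);
  __m128i b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                            8, 9, 10, 11, 12, 13, 14, 15);
  __m128i r = _mm_alignr_epi8(a, b, 4);
  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 16; i++)
    printf("%d ", out[i]); /* 4 5 ... 15 16 17 18 19 */
  printf("\n");
  return 0;
}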
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
|
_mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
|
||||||
{
|
if (__count < 16) {
|
||||||
if (__count < 16)
|
__v2du __C = {__B, __A};
|
||||||
{
|
|
||||||
__v2du __C = { __B, __A };
|
|
||||||
#ifdef __LITTLE_ENDIAN__
|
#ifdef __LITTLE_ENDIAN__
|
||||||
const __v4su __shift = { __count << 3, 0, 0, 0 };
|
const __v4su __shift = {__count << 3, 0, 0, 0};
|
||||||
__C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
|
__C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
|
||||||
#else
|
#else
|
||||||
const __v4su __shift = { 0, 0, 0, __count << 3 };
|
const __v4su __shift = {0, 0, 0, __count << 3};
|
||||||
__C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
|
__C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
|
||||||
#endif
|
#endif
|
||||||
return (__m64) __C[0];
|
return (__m64)__C[0];
|
||||||
}
|
} else {
|
||||||
else
|
const __m64 __zero = {0};
|
||||||
{
|
|
||||||
const __m64 __zero = { 0 };
|
|
||||||
return __zero;
|
return __zero;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_epi16 (__m128i __A, __m128i __B)
|
_mm_hadd_epi16(__m128i __A, __m128i __B) {
|
||||||
{
|
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
|
||||||
const __v16qu __P =
|
16, 17, 20, 21, 24, 25, 28, 29};
|
||||||
{ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
|
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
|
||||||
const __v16qu __Q =
|
18, 19, 22, 23, 26, 27, 30, 31};
|
||||||
{ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
|
__v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
|
||||||
__v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
|
__v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
|
||||||
__v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
|
return (__m128i)vec_add(__C, __D);
|
||||||
return (__m128i) vec_add (__C, __D);
|
|
||||||
}
|
}
|
||||||
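For clarity on the lane layout produced by the horizontal adds above (inputs assumed): adjacent pairs of __A fill the low four lanes of the result and adjacent pairs of __B fill the high four.

/* Illustrative check of _mm_hadd_epi16 pairwise sums. */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
  __m128i r = _mm_hadd_epi16(a, b);
  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]); /* 3 7 11 15 30 70 110 150 */
  printf("\n");
  return 0;
}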
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_epi32 (__m128i __A, __m128i __B)
|
_mm_hadd_epi32(__m128i __A, __m128i __B) {
|
||||||
{
|
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
|
||||||
const __v16qu __P =
|
16, 17, 18, 19, 24, 25, 26, 27};
|
||||||
{ 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
|
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
|
||||||
const __v16qu __Q =
|
20, 21, 22, 23, 28, 29, 30, 31};
|
||||||
{ 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
|
__v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
|
||||||
__v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
|
__v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
|
||||||
__v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
|
return (__m128i)vec_add(__C, __D);
|
||||||
return (__m128i) vec_add (__C, __D);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_pi16 (__m64 __A, __m64 __B)
|
_mm_hadd_pi16(__m64 __A, __m64 __B) {
|
||||||
{
|
__v8hi __C = (__v8hi)(__v2du){__A, __B};
|
||||||
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
|
const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
|
||||||
const __v16qu __P =
|
const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
|
||||||
{ 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
|
__v8hi __D = vec_perm(__C, __C, __Q);
|
||||||
const __v16qu __Q =
|
__C = vec_perm(__C, __C, __P);
|
||||||
{ 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
|
__C = vec_add(__C, __D);
|
||||||
__v8hi __D = vec_perm (__C, __C, __Q);
|
return (__m64)((__v2du)__C)[1];
|
||||||
__C = vec_perm (__C, __C, __P);
|
|
||||||
__C = vec_add (__C, __D);
|
|
||||||
return (__m64) ((__v2du) __C)[1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadd_pi32 (__m64 __A, __m64 __B)
|
_mm_hadd_pi32(__m64 __A, __m64 __B) {
|
||||||
{
|
__v4si __C = (__v4si)(__v2du){__A, __B};
|
||||||
__v4si __C = (__v4si) (__v2du) { __A, __B };
|
const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
|
||||||
const __v16qu __P =
|
const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
|
||||||
{ 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
|
__v4si __D = vec_perm(__C, __C, __Q);
|
||||||
const __v16qu __Q =
|
__C = vec_perm(__C, __C, __P);
|
||||||
{ 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
|
__C = vec_add(__C, __D);
|
||||||
__v4si __D = vec_perm (__C, __C, __Q);
|
return (__m64)((__v2du)__C)[1];
|
||||||
__C = vec_perm (__C, __C, __P);
|
|
||||||
__C = vec_add (__C, __D);
|
|
||||||
return (__m64) ((__v2du) __C)[1];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m128i
|
extern __inline __m128i
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadds_epi16 (__m128i __A, __m128i __B)
|
_mm_hadds_epi16(__m128i __A, __m128i __B) {
|
||||||
{
|
__v4si __C = {0}, __D = {0};
|
||||||
__v4si __C = { 0 }, __D = { 0 };
|
__C = vec_sum4s((__v8hi)__A, __C);
|
||||||
__C = vec_sum4s ((__v8hi) __A, __C);
|
__D = vec_sum4s((__v8hi)__B, __D);
|
||||||
__D = vec_sum4s ((__v8hi) __B, __D);
|
__C = (__v4si)vec_packs(__C, __D);
|
||||||
__C = (__v4si) vec_packs (__C, __D);
|
return (__m128i)__C;
|
||||||
return (__m128i) __C;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
extern __inline __m64
|
extern __inline __m64
|
||||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||||
_mm_hadds_pi16 (__m64 __A, __m64 __B)
|
_mm_hadds_pi16(__m64 __A, __m64 __B) {
|
||||||
{
|
const __v4si __zero = {0};
|
||||||
const __v4si __zero = { 0 };
|
__v8hi __C = (__v8hi)(__v2du){__A, __B};
|
||||||
__v8hi __C = (__v8hi) (__v2du) { __A, __B };
|
__v4si __D = vec_sum4s(__C, __zero);
|
||||||
__v4si __D = vec_sum4s (__C, __zero);
|
__C = vec_packs(__D, __D);
|
||||||
__C = vec_packs (__D, __D);
|
return (__m64)((__v2du)__C)[1];
|
||||||
return (__m64) ((__v2du) __C)[1];
|
|
||||||
}
|
}
|
||||||

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0, 1, 4, 5, 8, 9, 12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2, 3, 6, 7, 10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__ppc64__) &&
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */
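
A minimal usage sketch of the horizontal-add wrapper above (not taken from the commit): it assumes a powerpc64le toolchain with the ppc_wrappers directory on the include path, a VSX-capable -mcpu setting, and -DNO_WARN_X86_INTRINSICS; the variable names and values are illustrative.

/* demo.c: pairwise 16-bit sums via the vec_perm/vec_add emulation above. */
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 wrapper: _mm_set_epi16, _mm_storeu_si128 */
#include <tmmintrin.h>   /* SSSE3 wrapper shown above */

int main(void) {
  __m128i a = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
  __m128i b = _mm_set_epi16(16, 15, 14, 13, 12, 11, 10, 9);
  short out[8];
  /* out[0..3] = pairwise sums of a, out[4..7] = pairwise sums of b. */
  _mm_storeu_si128((__m128i *)out, _mm_hadd_epi16(a, b));
  printf("%d %d\n", out[0], out[7]); /* expected: 3 31 */
  return 0;
}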

17 lib/include/ppc_wrappers/x86gprintrin.h vendored Normal file
@ -0,0 +1,17 @@
/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef X86GPRINTRIN_H_
#define X86GPRINTRIN_H_

#include <bmiintrin.h>

#include <bmi2intrin.h>

#endif /* X86GPRINTRIN_H_ */
28 lib/include/ppc_wrappers/x86intrin.h vendored Normal file
@ -0,0 +1,28 @@
/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef X86INTRIN_H_
#define X86INTRIN_H_

#ifdef __ALTIVEC__
#include <immintrin.h>
#endif /* __ALTIVEC__ */

#endif /* X86INTRIN_H_ */
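
For illustration only, one way a port might satisfy the #error gate above: either pass -DNO_WARN_X86_INTRINSICS on the command line or define the macro before the include. The snippet below is a hypothetical porting stub, not part of the commit.

/* Hypothetical powerpc64le port acknowledging the warning above. */
#define NO_WARN_X86_INTRINSICS 1
#include <x86intrin.h> /* pulls in immintrin.h when __ALTIVEC__ is defined */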
1989 lib/include/ppc_wrappers/xmmintrin.h vendored
File diff suppressed because it is too large.
57 lib/include/rdpruintrin.h vendored Normal file
@ -0,0 +1,57 @@
/*===---- rdpruintrin.h - RDPRU intrinsics ---------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#if !defined __X86INTRIN_H
#error "Never use <rdpruintrin.h> directly; include <x86intrin.h> instead."
#endif

#ifndef __RDPRUINTRIN_H
#define __RDPRUINTRIN_H

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("rdpru")))


/// Reads the content of a processor register.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> RDPRU </c> instruction.
///
/// \param reg_id
///    A processor register identifier.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
__rdpru (int reg_id)
{
  return __builtin_ia32_rdpru(reg_id);
}

#define __RDPRU_MPERF 0
#define __RDPRU_APERF 1

/// Reads the content of processor register MPERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register MPERF.
#define __mperf() __builtin_ia32_rdpru(__RDPRU_MPERF)

/// Reads the content of processor register APERF.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic generates instruction <c> RDPRU </c> to read the value of
/// register APERF.
#define __aperf() __builtin_ia32_rdpru(__RDPRU_APERF)

#undef __DEFAULT_FN_ATTRS

#endif /* __RDPRUINTRIN_H */
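
A small sketch (not from the commit) of how the new intrinsics might be used to estimate the APERF/MPERF ratio; it assumes an AMD CPU with RDPRU support (Zen 2 or newer), compilation with -mrdpru or an -march that implies it, and the busy-work placeholder is illustrative.

#include <stdio.h>
#include <x86intrin.h>

int main(void) {
  unsigned long long mperf0 = __mperf(), aperf0 = __aperf();
  /* ... some busy work here ... */
  unsigned long long mperf1 = __mperf(), aperf1 = __aperf();
  printf("effective/base clock ratio ~= %f\n",
         (double)(aperf1 - aperf0) / (double)(mperf1 - mperf0));
  return 0;
}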
6 lib/include/rdseedintrin.h vendored
@ -20,20 +20,20 @@
 static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed16_step(unsigned short *__p)
 {
-  return __builtin_ia32_rdseed16_step(__p);
+  return (int) __builtin_ia32_rdseed16_step(__p);
 }

 static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed32_step(unsigned int *__p)
 {
-  return __builtin_ia32_rdseed32_step(__p);
+  return (int) __builtin_ia32_rdseed32_step(__p);
 }

 #ifdef __x86_64__
 static __inline__ int __DEFAULT_FN_ATTRS
 _rdseed64_step(unsigned long long *__p)
 {
-  return __builtin_ia32_rdseed64_step(__p);
+  return (int) __builtin_ia32_rdseed64_step(__p);
 }
 #endif
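
Since _rdseed*_step reports entropy underflow by returning 0, callers normally retry; a hedged sketch (assumes RDSEED-capable hardware and -mrdseed), not part of the commit:

#include <immintrin.h>

static unsigned int get_seed32(void) {
  unsigned int value;
  while (!_rdseed32_step(&value))
    ; /* underflow: no seed available yet, try again */
  return value;
}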

95319 lib/include/riscv_vector.h vendored
File diff suppressed because it is too large.
2 lib/include/rtmintrin.h vendored
@ -29,7 +29,7 @@
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _xbegin(void)
 {
-  return __builtin_ia32_xbegin();
+  return (unsigned int)__builtin_ia32_xbegin();
 }

 static __inline__ void __DEFAULT_FN_ATTRS
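
The usual pattern around _xbegin() pairs it with _XBEGIN_STARTED and _xend() from the same header (reachable via <immintrin.h>) plus a non-transactional fallback; a sketch assuming RTM-capable hardware and -mrtm, not taken from the commit:

#include <immintrin.h>

int counter;

void increment(void) {
  if (_xbegin() == _XBEGIN_STARTED) {
    ++counter;                          /* runs inside the transaction */
    _xend();
  } else {
    __sync_fetch_and_add(&counter, 1);  /* fallback when the transaction aborts */
  }
}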

291 lib/include/smmintrin.h vendored
@ -17,7 +17,9 @@
 #include <tmmintrin.h>

 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),        \
+                 __min_vector_width__(128)))

 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT 0x00
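
For context, the rounding-control macros in this hunk feed the SSE4.1 rounding intrinsics defined later in the same header; a short sketch assuming -msse4.1 (_MM_FROUND_NO_EXC comes from the same macro block), not part of the commit:

#include <smmintrin.h>

__m128 round_to_nearest(__m128 v) {
  /* Round each lane to the nearest integer without raising exceptions. */
  return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}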
@ -276,8 +278,8 @@
|
|||||||
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
|
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
|
||||||
/// values.
|
/// values.
|
||||||
#define _mm_round_ss(X, Y, M) \
|
#define _mm_round_ss(X, Y, M) \
|
||||||
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
|
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
|
||||||
(__v4sf)(__m128)(Y), (M)))
|
(M)))
|
||||||
|
|
||||||
/// Rounds each element of the 128-bit vector of [2 x double] to an
|
/// Rounds each element of the 128-bit vector of [2 x double] to an
|
||||||
/// integer value according to the rounding control specified by the second
|
/// integer value according to the rounding control specified by the second
|
||||||
@ -351,8 +353,8 @@
|
|||||||
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
|
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
|
||||||
/// values.
|
/// values.
|
||||||
#define _mm_round_sd(X, Y, M) \
|
#define _mm_round_sd(X, Y, M) \
|
||||||
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
|
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
|
||||||
(__v2df)(__m128d)(Y), (M)))
|
(M)))
|
||||||
|
|
||||||
/* SSE4 Packed Blending Intrinsics. */
|
/* SSE4 Packed Blending Intrinsics. */
|
||||||
/// Returns a 128-bit vector of [2 x double] where the values are
|
/// Returns a 128-bit vector of [2 x double] where the values are
|
||||||
@ -380,7 +382,7 @@
|
|||||||
/// is copied to the same position in the result.
|
/// is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
||||||
#define _mm_blend_pd(V1, V2, M) \
|
#define _mm_blend_pd(V1, V2, M) \
|
||||||
((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
|
((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
|
||||||
(__v2df)(__m128d)(V2), (int)(M)))
|
(__v2df)(__m128d)(V2), (int)(M)))
|
||||||
|
|
||||||
/// Returns a 128-bit vector of [4 x float] where the values are selected
|
/// Returns a 128-bit vector of [4 x float] where the values are selected
|
||||||
@ -408,8 +410,8 @@
|
|||||||
/// is copied to the same position in the result.
|
/// is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
||||||
#define _mm_blend_ps(V1, V2, M) \
|
#define _mm_blend_ps(V1, V2, M) \
|
||||||
((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
|
((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
|
||||||
(__v4sf)(__m128)(V2), (int)(M)))
|
(int)(M)))
|
||||||
|
|
||||||
/// Returns a 128-bit vector of [2 x double] where the values are
|
/// Returns a 128-bit vector of [2 x double] where the values are
|
||||||
/// selected from either the first or second operand as specified by the
|
/// selected from either the first or second operand as specified by the
|
||||||
@ -431,10 +433,10 @@
|
|||||||
/// position in the result. When a mask bit is 1, the corresponding 64-bit
|
/// position in the result. When a mask bit is 1, the corresponding 64-bit
|
||||||
/// element in operand \a __V2 is copied to the same position in the result.
|
/// element in operand \a __V2 is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
/// \returns A 128-bit vector of [2 x double] containing the copied values.
|
||||||
static __inline__ __m128d __DEFAULT_FN_ATTRS
|
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
|
||||||
_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
|
__m128d __V2,
|
||||||
{
|
__m128d __M) {
|
||||||
return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
|
return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
|
||||||
(__v2df)__M);
|
(__v2df)__M);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -458,10 +460,10 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
|
|||||||
/// position in the result. When a mask bit is 1, the corresponding 32-bit
|
/// position in the result. When a mask bit is 1, the corresponding 32-bit
|
||||||
/// element in operand \a __V2 is copied to the same position in the result.
|
/// element in operand \a __V2 is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
/// \returns A 128-bit vector of [4 x float] containing the copied values.
|
||||||
static __inline__ __m128 __DEFAULT_FN_ATTRS
|
static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
|
||||||
_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
|
__m128 __V2,
|
||||||
{
|
__m128 __M) {
|
||||||
return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
|
return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
|
||||||
(__v4sf)__M);
|
(__v4sf)__M);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -485,10 +487,10 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
|
|||||||
/// position in the result. When a mask bit is 1, the corresponding 8-bit
|
/// position in the result. When a mask bit is 1, the corresponding 8-bit
|
||||||
/// element in operand \a __V2 is copied to the same position in the result.
|
/// element in operand \a __V2 is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [16 x i8] containing the copied values.
|
/// \returns A 128-bit vector of [16 x i8] containing the copied values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
|
||||||
_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
__m128i __V2,
|
||||||
{
|
__m128i __M) {
|
||||||
return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
|
return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
|
||||||
(__v16qi)__M);
|
(__v16qi)__M);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -517,7 +519,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
|||||||
/// is copied to the same position in the result.
|
/// is copied to the same position in the result.
|
||||||
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
|
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
|
||||||
#define _mm_blend_epi16(V1, V2, M) \
|
#define _mm_blend_epi16(V1, V2, M) \
|
||||||
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
|
((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
|
||||||
(__v8hi)(__m128i)(V2), (int)(M)))
|
(__v8hi)(__m128i)(V2), (int)(M)))
|
||||||
|
|
||||||
/* SSE4 Dword Multiply Instructions. */
|
/* SSE4 Dword Multiply Instructions. */
|
||||||
@ -534,10 +536,9 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit integer vector.
|
/// A 128-bit integer vector.
|
||||||
/// \returns A 128-bit integer vector containing the products of both operands.
|
/// \returns A 128-bit integer vector containing the products of both operands.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
|
||||||
_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)((__v4su)__V1 * (__v4su)__V2);
|
||||||
return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Multiplies corresponding even-indexed elements of two 128-bit
|
/// Multiplies corresponding even-indexed elements of two 128-bit
|
||||||
@ -554,10 +555,9 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// A 128-bit vector of [4 x i32].
|
/// A 128-bit vector of [4 x i32].
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the products of both
|
/// \returns A 128-bit vector of [2 x i64] containing the products of both
|
||||||
/// operands.
|
/// operands.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
|
||||||
_mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
|
||||||
return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Floating Point Dot Product Instructions. */
|
/* SSE4 Floating Point Dot Product Instructions. */
|
||||||
@ -594,8 +594,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// in the corresponding element; otherwise that element is set to zero.
|
/// in the corresponding element; otherwise that element is set to zero.
|
||||||
/// \returns A 128-bit vector of [4 x float] containing the dot product.
|
/// \returns A 128-bit vector of [4 x float] containing the dot product.
|
||||||
#define _mm_dp_ps(X, Y, M) \
|
#define _mm_dp_ps(X, Y, M) \
|
||||||
((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
|
((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
|
||||||
(__v4sf)(__m128)(Y), (M)))
|
|
||||||
|
|
||||||
/// Computes the dot product of the two 128-bit vectors of [2 x double]
|
/// Computes the dot product of the two 128-bit vectors of [2 x double]
|
||||||
/// and returns it in the elements of the 128-bit result vector of
|
/// and returns it in the elements of the 128-bit result vector of
|
||||||
@ -629,8 +628,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// each [2 x double] vector. If a bit is set, the dot product is returned in
|
/// each [2 x double] vector. If a bit is set, the dot product is returned in
|
||||||
/// the corresponding element; otherwise that element is set to zero.
|
/// the corresponding element; otherwise that element is set to zero.
|
||||||
#define _mm_dp_pd(X, Y, M) \
|
#define _mm_dp_pd(X, Y, M) \
|
||||||
((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
|
((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
|
||||||
(__v2df)(__m128d)(Y), (M)))
|
(M)))
|
||||||
|
|
||||||
/* SSE4 Streaming Load Hint Instruction. */
|
/* SSE4 Streaming Load Hint Instruction. */
|
||||||
/// Loads integer values from a 128-bit aligned memory location to a
|
/// Loads integer values from a 128-bit aligned memory location to a
|
||||||
@ -646,9 +645,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \returns A 128-bit integer vector containing the data stored at the
|
/// \returns A 128-bit integer vector containing the data stored at the
|
||||||
/// specified memory location.
|
/// specified memory location.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
||||||
_mm_stream_load_si128 (__m128i const *__V)
|
_mm_stream_load_si128(__m128i const *__V) {
|
||||||
{
|
return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
|
||||||
return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Packed Integer Min/Max Instructions. */
|
/* SSE4 Packed Integer Min/Max Instructions. */
|
||||||
@ -665,10 +663,9 @@ _mm_stream_load_si128 (__m128i const *__V)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [16 x i8]
|
/// A 128-bit vector of [16 x i8]
|
||||||
/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
|
/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
|
||||||
_mm_min_epi8 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -684,10 +681,9 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [16 x i8].
|
/// A 128-bit vector of [16 x i8].
|
||||||
/// \returns A 128-bit vector of [16 x i8] containing the greater values.
|
/// \returns A 128-bit vector of [16 x i8] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
|
||||||
_mm_max_epi8 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -703,10 +699,9 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [8 x u16].
|
/// A 128-bit vector of [8 x u16].
|
||||||
/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
|
/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
|
||||||
_mm_min_epu16 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -722,10 +717,9 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [8 x u16].
|
/// A 128-bit vector of [8 x u16].
|
||||||
/// \returns A 128-bit vector of [8 x u16] containing the greater values.
|
/// \returns A 128-bit vector of [8 x u16] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
|
||||||
_mm_max_epu16 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -741,10 +735,9 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x i32].
|
/// A 128-bit vector of [4 x i32].
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
|
/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
|
||||||
_mm_min_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -760,10 +753,9 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x i32].
|
/// A 128-bit vector of [4 x i32].
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the greater values.
|
/// \returns A 128-bit vector of [4 x i32] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
|
||||||
_mm_max_epi32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -779,10 +771,9 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x u32].
|
/// A 128-bit vector of [4 x u32].
|
||||||
/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
|
/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
|
||||||
_mm_min_epu32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
|
||||||
return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Compares the corresponding elements of two 128-bit vectors of
|
/// Compares the corresponding elements of two 128-bit vectors of
|
||||||
@ -798,10 +789,9 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit vector of [4 x u32].
|
/// A 128-bit vector of [4 x u32].
|
||||||
/// \returns A 128-bit vector of [4 x u32] containing the greater values.
|
/// \returns A 128-bit vector of [4 x u32] containing the greater values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
|
||||||
_mm_max_epu32 (__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
|
||||||
return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
|
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
|
||||||
@ -870,20 +860,23 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 11: Bits [127:96] of parameter \a X are returned.
|
/// 11: Bits [127:96] of parameter \a X are returned.
|
||||||
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
|
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
|
||||||
#define _mm_extract_ps(X, N) \
|
#define _mm_extract_ps(X, N) \
|
||||||
__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
|
__builtin_bit_cast( \
|
||||||
|
int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
|
||||||
|
|
||||||
/* Miscellaneous insert and extract macros. */
|
/* Miscellaneous insert and extract macros. */
|
||||||
/* Extract a single-precision float from X at index N into D. */
|
/* Extract a single-precision float from X at index N into D. */
|
||||||
#define _MM_EXTRACT_FLOAT(D, X, N) \
|
#define _MM_EXTRACT_FLOAT(D, X, N) \
|
||||||
do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
|
do { \
|
||||||
|
(D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
|
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
|
||||||
an index suitable for _mm_insert_ps. */
|
an index suitable for _mm_insert_ps. */
|
||||||
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
|
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
|
||||||
|
|
||||||
/* Extract a float from X at index N into the first index of the return. */
|
/* Extract a float from X at index N into the first index of the return. */
|
||||||
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
|
#define _MM_PICK_OUT_PS(X, N) \
|
||||||
_MM_MK_INSERTPS_NDX((N), 0, 0x0e))
|
_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
|
||||||
|
|
||||||
/* Insert int into packed integer array at index. */
|
/* Insert int into packed integer array at index. */
|
||||||
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
|
/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
|
||||||
@ -927,8 +920,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 1111: Bits [127:120] of the result are used for insertion.
|
/// 1111: Bits [127:120] of the result are used for insertion.
|
||||||
/// \returns A 128-bit integer vector containing the constructed values.
|
/// \returns A 128-bit integer vector containing the constructed values.
|
||||||
#define _mm_insert_epi8(X, I, N) \
|
#define _mm_insert_epi8(X, I, N) \
|
||||||
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
|
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \
|
||||||
(int)(I), (int)(N)))
|
(int)(N)))
|
||||||
|
|
||||||
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
|
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
|
||||||
/// the 128-bit integer vector parameter, and then inserting the 32-bit
|
/// the 128-bit integer vector parameter, and then inserting the 32-bit
|
||||||
@ -959,8 +952,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 11: Bits [127:96] of the result are used for insertion.
|
/// 11: Bits [127:96] of the result are used for insertion.
|
||||||
/// \returns A 128-bit integer vector containing the constructed values.
|
/// \returns A 128-bit integer vector containing the constructed values.
|
||||||
#define _mm_insert_epi32(X, I, N) \
|
#define _mm_insert_epi32(X, I, N) \
|
||||||
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
|
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \
|
||||||
(int)(I), (int)(N)))
|
(int)(N)))
|
||||||
|
|
||||||
#ifdef __x86_64__
|
#ifdef __x86_64__
|
||||||
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
|
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
|
||||||
@ -990,8 +983,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// 1: Bits [127:64] of the result are used for insertion. \n
|
/// 1: Bits [127:64] of the result are used for insertion. \n
|
||||||
/// \returns A 128-bit integer vector containing the constructed values.
|
/// \returns A 128-bit integer vector containing the constructed values.
|
||||||
#define _mm_insert_epi64(X, I, N) \
|
#define _mm_insert_epi64(X, I, N) \
|
||||||
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
|
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \
|
||||||
(long long)(I), (int)(N)))
|
(int)(N)))
|
||||||
#endif /* __x86_64__ */
|
#endif /* __x86_64__ */
|
||||||
|
|
||||||
/* Extract int from packed integer array at index. This returns the element
|
/* Extract int from packed integer array at index. This returns the element
|
||||||
@ -1061,7 +1054,6 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
#define _mm_extract_epi32(X, N) \
|
#define _mm_extract_epi32(X, N) \
|
||||||
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
|
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
|
||||||
|
|
||||||
#ifdef __x86_64__
|
|
||||||
/// Extracts a 64-bit element from the 128-bit integer vector of
|
/// Extracts a 64-bit element from the 128-bit integer vector of
|
||||||
/// [2 x i64], using the immediate value parameter \a N as a selector.
|
/// [2 x i64], using the immediate value parameter \a N as a selector.
|
||||||
///
|
///
|
||||||
@ -1071,7 +1063,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// long long _mm_extract_epi64(__m128i X, const int N);
|
/// long long _mm_extract_epi64(__m128i X, const int N);
|
||||||
/// \endcode
|
/// \endcode
|
||||||
///
|
///
|
||||||
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
|
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
|
||||||
|
/// in 64-bit mode.
|
||||||
///
|
///
|
||||||
/// \param X
|
/// \param X
|
||||||
/// A 128-bit integer vector.
|
/// A 128-bit integer vector.
|
||||||
@ -1083,7 +1076,6 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
|
|||||||
/// \returns A 64-bit integer.
|
/// \returns A 64-bit integer.
|
||||||
#define _mm_extract_epi64(X, N) \
|
#define _mm_extract_epi64(X, N) \
|
||||||
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
|
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
|
||||||
#endif /* __x86_64 */
|
|
||||||
|
|
||||||
/* SSE4 128-bit Packed Integer Comparisons. */
|
/* SSE4 128-bit Packed Integer Comparisons. */
|
||||||
/// Tests whether the specified bits in a 128-bit integer vector are all
|
/// Tests whether the specified bits in a 128-bit integer vector are all
|
||||||
@@ -1098,9 +1090,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 /// \param __V
 ///    A 128-bit integer vector selecting which bits to test in operand \a __M.
 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_testz_si128(__m128i __M, __m128i __V)
-{
+static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
+                                                         __m128i __V) {
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
@ -1116,9 +1107,8 @@ _mm_testz_si128(__m128i __M, __m128i __V)
|
|||||||
/// \param __V
|
/// \param __V
|
||||||
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
||||||
/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
|
/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
|
||||||
static __inline__ int __DEFAULT_FN_ATTRS
|
static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
|
||||||
_mm_testc_si128(__m128i __M, __m128i __V)
|
__m128i __V) {
|
||||||
{
|
|
||||||
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
|
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1135,9 +1125,8 @@ _mm_testc_si128(__m128i __M, __m128i __V)
|
|||||||
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
/// A 128-bit integer vector selecting which bits to test in operand \a __M.
|
||||||
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
|
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
|
||||||
/// FALSE otherwise.
|
/// FALSE otherwise.
|
||||||
static __inline__ int __DEFAULT_FN_ATTRS
|
static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
|
||||||
_mm_testnzc_si128(__m128i __M, __m128i __V)
|
__m128i __V) {
|
||||||
{
|
|
||||||
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
|
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1193,7 +1182,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 /// \param V
 ///    A 128-bit integer vector selecting which bits to test in operand \a M.
 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
 
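A hedged usage sketch for the PTEST helpers (illustrative only, not part of this commit; the helper name is invented, and _mm_set_epi64x comes from the SSE2 header that smmintrin.h already includes):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

int low_qword_is_zero(__m128i v) {
  /* Mask selecting only bits [63:0]; _mm_testz_si128 returns 1 (ZF set)
     when (v & mask) is all zeros. _mm_test_all_zeros(v, mask) is equivalent. */
  __m128i mask = _mm_set_epi64x(0, -1);
  return _mm_testz_si128(v, mask);
}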
 /* SSE4 64-bit Packed Integer Comparisons. */
 /// Compares each of the corresponding 64-bit values of the 128-bit
@ -1208,9 +1197,8 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
|
|||||||
/// \param __V2
|
/// \param __V2
|
||||||
/// A 128-bit integer vector.
|
/// A 128-bit integer vector.
|
||||||
/// \returns A 128-bit integer vector containing the comparison results.
|
/// \returns A 128-bit integer vector containing the comparison results.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
|
||||||
_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
|
__m128i __V2) {
|
||||||
{
|
|
||||||
return (__m128i)((__v2di)__V1 == (__v2di)__V2);
|
return (__m128i)((__v2di)__V1 == (__v2di)__V2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1225,15 +1213,16 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
-///    extended to 16-bit values.
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+///    sign-extended to 16-bit values.
 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi8_epi16(__m128i __V)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+  return (__m128i) __builtin_convertvector(
+      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
+                              7),
+      __v8hi);
 }
 
 /// Sign-extends each of the lower four 8-bit integer elements of a
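A small sketch of the sign-extension intrinsic documented above (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

__m128i widen_low_eight_bytes(__m128i bytes) {
  /* Sign-extend the low eight 8-bit lanes to eight 16-bit lanes (PMOVSXBW);
     e.g. the byte 0x80 (-128 as signed char) becomes 0xFF80. */
  return _mm_cvtepi8_epi16(bytes);
}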
@ -1249,12 +1238,11 @@ _mm_cvtepi8_epi16(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
||||||
/// sign-extended to 32-bit values.
|
/// sign-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
|
||||||
_mm_cvtepi8_epi32(__m128i __V)
|
|
||||||
{
|
|
||||||
/* This function always performs a signed extension, but __v16qi is a char
|
/* This function always performs a signed extension, but __v16qi is a char
|
||||||
which may be signed or unsigned, so use __v16qs. */
|
which may be signed or unsigned, so use __v16qs. */
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
|
return (__m128i) __builtin_convertvector(
|
||||||
|
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower two 8-bit integer elements of a
|
/// Sign-extends each of the lower two 8-bit integer elements of a
|
||||||
@ -1270,12 +1258,11 @@ _mm_cvtepi8_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
||||||
/// sign-extended to 64-bit values.
|
/// sign-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
|
||||||
_mm_cvtepi8_epi64(__m128i __V)
|
|
||||||
{
|
|
||||||
/* This function always performs a signed extension, but __v16qi is a char
|
/* This function always performs a signed extension, but __v16qi is a char
|
||||||
which may be signed or unsigned, so use __v16qs. */
|
which may be signed or unsigned, so use __v16qs. */
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
|
return (__m128i) __builtin_convertvector(
|
||||||
|
__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower four 16-bit integer elements of a
|
/// Sign-extends each of the lower four 16-bit integer elements of a
|
||||||
@ -1291,10 +1278,9 @@ _mm_cvtepi8_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
||||||
/// sign-extended to 32-bit values.
|
/// sign-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
|
||||||
_mm_cvtepi16_epi32(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower two 16-bit integer elements of a
|
/// Sign-extends each of the lower two 16-bit integer elements of a
|
||||||
@ -1310,10 +1296,9 @@ _mm_cvtepi16_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
||||||
/// sign-extended to 64-bit values.
|
/// sign-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
|
||||||
_mm_cvtepi16_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sign-extends each of the lower two 32-bit integer elements of a
|
/// Sign-extends each of the lower two 32-bit integer elements of a
|
||||||
@ -1329,10 +1314,9 @@ _mm_cvtepi16_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
||||||
/// sign-extended to 64-bit values.
|
/// sign-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
|
||||||
_mm_cvtepi32_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Packed Integer Zero-Extension. */
|
/* SSE4 Packed Integer Zero-Extension. */
|
||||||
@ -1349,10 +1333,11 @@ _mm_cvtepi32_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
|
||||||
/// zero-extended to 16-bit values.
|
/// zero-extended to 16-bit values.
|
||||||
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
|
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
|
||||||
_mm_cvtepu8_epi16(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
|
7),
|
||||||
|
__v8hi);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower four 8-bit integer elements of a
|
/// Zero-extends each of the lower four 8-bit integer elements of a
|
||||||
@ -1368,10 +1353,9 @@ _mm_cvtepu8_epi16(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
|
||||||
/// zero-extended to 32-bit values.
|
/// zero-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
|
||||||
_mm_cvtepu8_epi32(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower two 8-bit integer elements of a
|
/// Zero-extends each of the lower two 8-bit integer elements of a
|
||||||
@ -1387,10 +1371,9 @@ _mm_cvtepu8_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
|
||||||
/// zero-extended to 64-bit values.
|
/// zero-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
|
||||||
_mm_cvtepu8_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower four 16-bit integer elements of a
|
/// Zero-extends each of the lower four 16-bit integer elements of a
|
||||||
@ -1406,10 +1389,9 @@ _mm_cvtepu8_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
|
||||||
/// zero-extended to 32-bit values.
|
/// zero-extended to 32-bit values.
|
||||||
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
|
||||||
_mm_cvtepu16_epi32(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower two 16-bit integer elements of a
|
/// Zero-extends each of the lower two 16-bit integer elements of a
|
||||||
@ -1425,10 +1407,9 @@ _mm_cvtepu16_epi32(__m128i __V)
|
|||||||
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
|
||||||
/// zero-extended to 64-bit values.
|
/// zero-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
|
||||||
_mm_cvtepu16_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Zero-extends each of the lower two 32-bit integer elements of a
|
/// Zero-extends each of the lower two 32-bit integer elements of a
|
||||||
@ -1444,10 +1425,9 @@ _mm_cvtepu16_epi64(__m128i __V)
|
|||||||
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
|
||||||
/// zero-extended to 64-bit values.
|
/// zero-extended to 64-bit values.
|
||||||
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
|
||||||
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
|
||||||
_mm_cvtepu32_epi64(__m128i __V)
|
return (__m128i) __builtin_convertvector(
|
||||||
{
|
__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
|
||||||
return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* SSE4 Pack with Unsigned Saturation. */
|
/* SSE4 Pack with Unsigned Saturation. */
|
||||||
@@ -1473,10 +1453,9 @@ _mm_cvtepu32_epi64(__m128i __V)
 ///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
 ///    are written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_packus_epi32(__m128i __V1, __m128i __V2)
-{
-  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
+                                                              __m128i __V2) {
+  return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
 }
 
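A sketch of the pack-with-unsigned-saturation intrinsic (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

__m128i pack_u16_saturated(__m128i lo4, __m128i hi4) {
  /* PACKUSDW: each signed 32-bit value is clamped to [0, 0xFFFF]; lo4 fills
     the lower four 16-bit lanes of the result, hi4 the upper four. */
  return _mm_packus_epi32(lo4, hi4);
}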
/* SSE4 Multiple Packed Sums of Absolute Difference. */
|
/* SSE4 Multiple Packed Sums of Absolute Difference. */
|
||||||
@ -1516,7 +1495,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
|
|||||||
/// \returns A 128-bit integer vector containing the sums of the sets of
|
/// \returns A 128-bit integer vector containing the sums of the sets of
|
||||||
/// absolute differences between both operands.
|
/// absolute differences between both operands.
|
||||||
#define _mm_mpsadbw_epu8(X, Y, M) \
|
#define _mm_mpsadbw_epu8(X, Y, M) \
|
||||||
((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
|
((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
|
||||||
(__v16qi)(__m128i)(Y), (M)))
|
(__v16qi)(__m128i)(Y), (M)))
|
||||||
|
|
||||||
/// Finds the minimum unsigned 16-bit element in the input 128-bit
|
/// Finds the minimum unsigned 16-bit element in the input 128-bit
|
||||||
@@ -1532,10 +1511,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 /// \returns A 128-bit value where bits [15:0] contain the minimum value found
 ///    in parameter \a __V, bits [18:16] contain the index of the minimum value
 ///    and the remaining bits are set to 0.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_minpos_epu16(__m128i __V)
-{
-  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
+  return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
 }
 
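A sketch showing how the packed result of _mm_minpos_epu16 is typically unpacked (illustrative only, not part of this commit; the helper name is invented, and _mm_extract_epi16 is the SSE2 extractor):

/* Illustrative only; not part of this commit. Requires SSE4.1. */
#include <smmintrin.h>

void min_u16_and_index(__m128i v, unsigned *min_val, unsigned *min_idx) {
  __m128i r = _mm_minpos_epu16(v);                    /* PHMINPOSUW */
  *min_val = (unsigned)_mm_extract_epi16(r, 0);       /* bits [15:0]: minimum */
  *min_idx = (unsigned)_mm_extract_epi16(r, 1) & 0x7; /* bits [18:16]: index */
}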
/* Handle the sse4.2 definitions here. */
|
/* Handle the sse4.2 definitions here. */
|
||||||
@@ -1544,7 +1521,8 @@ _mm_minpos_epu16(__m128i __V)
    so we'll do the same. */
 
 #undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
 
 /* These specify the type of data that we're comparing. */
 #define _SIDD_UBYTE_OPS 0x00
@@ -2336,9 +2314,8 @@ _mm_minpos_epu16(__m128i __V)
 /// \param __V2
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
-{
+static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
+                                                             __m128i __V2) {
   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
 }
 
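A sketch using the 64-bit greater-than comparison as a select mask (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. _mm_cmpgt_epi64 needs SSE4.2,
   the byte blend is SSE4.1. */
#include <nmmintrin.h>

__m128i max_epi64_sketch(__m128i a, __m128i b) {
  __m128i a_gt_b = _mm_cmpgt_epi64(a, b); /* all-ones in lanes where a > b */
  return _mm_blendv_epi8(b, a, a_gt_b);   /* pick a there, b elsewhere */
}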
lib/include/stdatomic.h (vendored): 7 lines changed
@@ -17,7 +17,8 @@
  * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
  * to the clang resource header until that is fully supported.
  */
-#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>) && !defined(_MSC_VER)
+#if __STDC_HOSTED__ && \
+    __has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
 # include_next <stdatomic.h>
 #else
 
@@ -158,10 +159,6 @@ typedef _Atomic(uintmax_t) atomic_uintmax_t;
 typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
 
 #define ATOMIC_FLAG_INIT { 0 }
-#if __cplusplus >= 202002L && !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
-/* ATOMIC_FLAG_INIT was deprecated in C++20 but is not deprecated in C. */
-#pragma clang deprecated(ATOMIC_FLAG_INIT)
-#endif
 
 /* These should be provided by the libc implementation. */
 #ifdef __cplusplus
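For context (not part of this commit): in C, ATOMIC_FLAG_INIT remains the portable initializer for atomic_flag, as in this minimal C11 spinlock sketch:

/* Illustrative only; not part of this commit. Standard C11. */
#include <stdatomic.h>

static atomic_flag lk = ATOMIC_FLAG_INIT;

void spin_lock(void) {
  while (atomic_flag_test_and_set_explicit(&lk, memory_order_acquire))
    ; /* spin */
}

void spin_unlock(void) {
  atomic_flag_clear_explicit(&lk, memory_order_release);
}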
lib/include/stdbool.h (vendored): 11 lines changed
@@ -10,8 +10,13 @@
 #ifndef __STDBOOL_H
 #define __STDBOOL_H
 
-/* Don't define bool, true, and false in C++, except as a GNU extension. */
-#ifndef __cplusplus
+#define __bool_true_false_are_defined 1
+
+#if __STDC_VERSION__ > 201710L
+/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
+ * to system headers which include this header file unconditionally.
+ */
+#elif !defined(__cplusplus)
 #define bool _Bool
 #define true 1
 #define false 0
@@ -26,6 +31,4 @@
 #endif
 #endif
 
-#define __bool_true_false_are_defined 1
-
 #endif /* __STDBOOL_H */
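For context (not part of this commit), a sketch that compiles unchanged whether bool, true, and false come from this header (C17 and earlier) or are language keywords (C2x), which is what the new __STDC_VERSION__ check anticipates:

/* Illustrative only; not part of this commit. */
#include <stdbool.h>

bool is_even(int x) { return x % 2 == 0; }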
lib/include/stddef.h (vendored): 2 lines changed
@@ -62,7 +62,7 @@ typedef __SIZE_TYPE__ rsize_t;
 #endif /* defined(__need_STDDEF_H_misc) */
 
 #if defined(__need_wchar_t)
-#ifndef __cplusplus
+#if !defined(__cplusplus) || (defined(_MSC_VER) && !_NATIVE_WCHAR_T_DEFINED)
 /* Always define wchar_t when modules are available. */
 #if !defined(_WCHAR_T) || __has_feature(modules)
 #if !__has_feature(modules)
lib/include/stdnoreturn.h (vendored): 13 lines changed
@@ -13,4 +13,17 @@
 #define noreturn _Noreturn
 #define __noreturn_is_defined 1
 
+#if __STDC_VERSION__ > 201710L && \
+    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
+/* The noreturn macro is deprecated in C2x. We do not mark it as such because
+   including the header file in C2x is also deprecated and we do not want to
+   issue a confusing diagnostic for code which includes <stdnoreturn.h>
+   followed by code that writes [[noreturn]]. The issue with such code is not
+   with the attribute, or the use of 'noreturn', but the inclusion of the
+   header. */
+/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
+ * to system headers which include this header file unconditionally.
+ */
+#endif
+
 #endif /* __STDNORETURN_H */
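For context (not part of this commit), the C11 spelling this header still supports; the comment added above explains why no deprecation warning is issued yet. Helper name invented:

/* Illustrative only; not part of this commit. In C2x, [[noreturn]] is the
   preferred spelling and including this header is itself deprecated. */
#include <stdio.h>
#include <stdlib.h>
#include <stdnoreturn.h>

noreturn void die(const char *msg) {
  fputs(msg, stderr);
  abort();
}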
lib/include/uintrintrin.h (vendored): 16 lines changed
@@ -39,9 +39,9 @@ struct __uintr_frame
 ///
 /// This intrinsic corresponds to the <c> CLUI </c> instruction.
 ///
-/// \operation
+/// \code{.operation}
 /// UIF := 0
-/// \endoperation
+/// \endcode
 static __inline__ void __DEFAULT_FN_ATTRS
 _clui (void)
 {
@@ -60,9 +60,9 @@ _clui (void)
 ///
 /// This intrinsic corresponds to the <c> STUI </c> instruction.
 ///
-/// \operation
+/// \code{.operation}
 /// UIF := 1
-/// \endoperation
+/// \endcode
 static __inline__ void __DEFAULT_FN_ATTRS
 _stui (void)
 {
@@ -81,7 +81,7 @@ _stui (void)
 ///
 /// \returns The current value of the user interrupt flag (UIF).
 ///
-/// \operation
+/// \code{.operation}
 /// CF := UIF
 /// ZF := 0
 /// AF := 0
@@ -89,7 +89,7 @@ _stui (void)
 /// PF := 0
 /// SF := 0
 /// dst := CF
-/// \endoperation
+/// \endcode
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _testui (void)
 {
@@ -110,7 +110,7 @@ _testui (void)
 ///    Index of user-interrupt target table entry in user-interrupt target
 ///    table.
 ///
-/// \operation
+/// \code{.operation}
 /// IF __a > UITTSZ
 ///    GP (0)
 /// FI
@@ -143,7 +143,7 @@ _testui (void)
 ///       SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
 ///    FI
 /// FI
-/// \endoperation
+/// \endcode
 static __inline__ void __DEFAULT_FN_ATTRS
 _senduipi (unsigned long long __a)
 {
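A hedged sketch of the UIF intrinsics whose documentation markup changes above (illustrative only, not part of this commit; assumes a UINTR-capable CPU, compiling with -muintr, OS support for user interrupts, and that uintrintrin.h is reachable through x86gprintrin.h):

/* Illustrative only; not part of this commit. All names below are declared
   in the diffed header; the availability assumptions are noted above. */
#include <x86gprintrin.h>

unsigned char block_user_interrupts_briefly(void) {
  _clui();          /* CLUI: clear UIF, user interrupts blocked */
  /* ... short critical section ... */
  _stui();          /* STUI: set UIF again */
  return _testui(); /* TESTUI: returns the current UIF value */
}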
lib/include/unwind.h (vendored): 13 lines changed
@@ -62,7 +62,8 @@ typedef intptr_t _sleb128_t;
 typedef uintptr_t _uleb128_t;
 
 struct _Unwind_Context;
-#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
+                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
 struct _Unwind_Control_Block;
 typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
 #else
@@ -72,7 +73,7 @@ typedef struct _Unwind_Exception _Unwind_Exception;
 typedef enum {
   _URC_NO_REASON = 0,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
-    !defined(__ARM_DWARF_EH__)
+    !defined(__ARM_DWARF_EH__) && !defined(__SEH__)
   _URC_OK = 0, /* used by ARM EHABI */
 #endif
   _URC_FOREIGN_EXCEPTION_CAUGHT = 1,
@@ -86,7 +87,7 @@ typedef enum {
   _URC_INSTALL_CONTEXT = 7,
   _URC_CONTINUE_UNWIND = 8,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
-    !defined(__ARM_DWARF_EH__)
+    !defined(__ARM_DWARF_EH__) && !defined(__SEH__)
   _URC_FAILURE = 9 /* used by ARM EHABI */
 #endif
 } _Unwind_Reason_Code;
@@ -103,7 +104,8 @@ typedef enum {
 typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
                                              _Unwind_Exception *);
 
-#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
+                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
 typedef struct _Unwind_Control_Block _Unwind_Control_Block;
 typedef uint32_t _Unwind_EHT_Header;
 
@@ -167,7 +169,8 @@ typedef _Unwind_Personality_Fn __personality_routine;
 typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
                                                 void *);
 
-#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
+                          defined(__ARM_DWARF_EH__) || defined(__SEH__))
 typedef enum {
   _UVRSC_CORE = 0, /* integer register */
   _UVRSC_VFP = 1,  /* vfp */
lib/include/velintrin.h (vendored, new file): 71 lines
@@ -0,0 +1,71 @@
/*===---- velintrin.h - VEL intrinsics for VE ------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __VEL_INTRIN_H__
#define __VEL_INTRIN_H__

// Vector registers
typedef double __vr __attribute__((__vector_size__(2048)));

// Vector mask registers
#if __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
typedef _Bool __vm512 __attribute__((ext_vector_type(512)));
#else
#ifdef __cplusplus
// For C++
typedef bool __vm __attribute__((ext_vector_type(256)));
typedef bool __vm256 __attribute__((ext_vector_type(256)));
typedef bool __vm512 __attribute__((ext_vector_type(512)));
#else
#error need C++ or C99 to use vector intrinsics for VE
#endif
#endif

enum VShuffleCodes {
  VE_VSHUFFLE_YUYU = 0,
  VE_VSHUFFLE_YUYL = 1,
  VE_VSHUFFLE_YUZU = 2,
  VE_VSHUFFLE_YUZL = 3,
  VE_VSHUFFLE_YLYU = 4,
  VE_VSHUFFLE_YLYL = 5,
  VE_VSHUFFLE_YLZU = 6,
  VE_VSHUFFLE_YLZL = 7,
  VE_VSHUFFLE_ZUYU = 8,
  VE_VSHUFFLE_ZUYL = 9,
  VE_VSHUFFLE_ZUZU = 10,
  VE_VSHUFFLE_ZUZL = 11,
  VE_VSHUFFLE_ZLYU = 12,
  VE_VSHUFFLE_ZLYL = 13,
  VE_VSHUFFLE_ZLZU = 14,
  VE_VSHUFFLE_ZLZL = 15,
};

// Use generated intrinsic name definitions
#include <velintrin_gen.h>

// Use helper functions
#include <velintrin_approx.h>

// pack

#define _vel_pack_f32p __builtin_ve_vl_pack_f32p
#define _vel_pack_f32a __builtin_ve_vl_pack_f32a

static inline unsigned long int _vel_pack_i32(unsigned int a, unsigned int b) {
  return (((unsigned long int)a) << 32) | b;
}

#define _vel_extract_vm512u(vm) __builtin_ve_vl_extract_vm512u(vm)
#define _vel_extract_vm512l(vm) __builtin_ve_vl_extract_vm512l(vm)
#define _vel_insert_vm512u(vm512, vm) __builtin_ve_vl_insert_vm512u(vm512, vm)
#define _vel_insert_vm512l(vm512, vm) __builtin_ve_vl_insert_vm512l(vm512, vm)

#endif
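A sketch using the _vel_pack_i32 helper defined in this new header (illustrative only, not part of this commit; meaningful only when compiling for the NEC SX-Aurora VE target):

/* Illustrative only; not part of this commit. _vel_pack_i32(a, b) is defined
   above as ((unsigned long)a << 32) | b. */
#include <velintrin.h>

unsigned long int pack_example(void) {
  return _vel_pack_i32(0xDEADBEEFu, 0x01234567u); /* 0xDEADBEEF01234567 */
}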
lib/include/velintrin_approx.h (vendored, new file): 120 lines
@@ -0,0 +1,120 @@
|
|||||||
|
/*===---- velintrin_approx.h - VEL intrinsics helper for VE ----------------===
|
||||||
|
*
|
||||||
|
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||||
|
* See https://llvm.org/LICENSE.txt for license information.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||||
|
*
|
||||||
|
*===-----------------------------------------------------------------------===
|
||||||
|
*/
|
||||||
|
#ifndef __VEL_INTRIN_APPROX_H__
|
||||||
|
#define __VEL_INTRIN_APPROX_H__
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivs_vvvl(__vr v0, __vr v1, int l) {
|
||||||
|
float s0;
|
||||||
|
__vr v2, v3, v4, v5;
|
||||||
|
v5 = _vel_vrcps_vvl(v1, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
v4 = _vel_vfnmsbs_vsvvl(s0, v1, v5, l);
|
||||||
|
v3 = _vel_vfmads_vvvvl(v5, v5, v4, l);
|
||||||
|
v2 = _vel_vfmuls_vvvl(v0, v3, l);
|
||||||
|
v4 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
|
||||||
|
v2 = _vel_vfmads_vvvvl(v2, v5, v4, l);
|
||||||
|
v0 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
|
||||||
|
v0 = _vel_vfmads_vvvvl(v2, v3, v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_pvfdiv_vvvl(__vr v0, __vr v1, int l) {
|
||||||
|
float s0;
|
||||||
|
__vr v2, v3, v4, v5;
|
||||||
|
v5 = _vel_pvrcp_vvl(v1, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
v4 = _vel_pvfnmsb_vsvvl(s0, v1, v5, l);
|
||||||
|
v3 = _vel_pvfmad_vvvvl(v5, v5, v4, l);
|
||||||
|
v2 = _vel_pvfmul_vvvl(v0, v3, l);
|
||||||
|
v4 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
|
||||||
|
v2 = _vel_pvfmad_vvvvl(v2, v5, v4, l);
|
||||||
|
v0 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
|
||||||
|
v0 = _vel_pvfmad_vvvvl(v2, v3, v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivs_vsvl(float s0, __vr v0, int l) {
|
||||||
|
float s1;
|
||||||
|
__vr v1, v2, v3, v4;
|
||||||
|
v4 = _vel_vrcps_vvl(v0, l);
|
||||||
|
s1 = 1.0;
|
||||||
|
v2 = _vel_vfnmsbs_vsvvl(s1, v0, v4, l);
|
||||||
|
v2 = _vel_vfmads_vvvvl(v4, v4, v2, l);
|
||||||
|
v1 = _vel_vfmuls_vsvl(s0, v2, l);
|
||||||
|
v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
|
||||||
|
v1 = _vel_vfmads_vvvvl(v1, v4, v3, l);
|
||||||
|
v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
|
||||||
|
v0 = _vel_vfmads_vvvvl(v1, v2, v3, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivs_vvsl(__vr v0, float s0, int l) {
|
||||||
|
float s1;
|
||||||
|
__vr v1, v2;
|
||||||
|
s1 = 1.0f / s0;
|
||||||
|
v1 = _vel_vfmuls_vsvl(s1, v0, l);
|
||||||
|
v2 = _vel_vfnmsbs_vvsvl(v0, s0, v1, l);
|
||||||
|
v0 = _vel_vfmads_vvsvl(v1, s1, v2, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfdivd_vsvl(double s0, __vr v0, int l) {
|
||||||
|
__vr v1, v2, v3;
|
||||||
|
v2 = _vel_vrcpd_vvl(v0, l);
|
||||||
|
double s1 = 1.0;
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
|
||||||
|
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
|
||||||
|
v1 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
|
||||||
|
v1 = _vel_vfmadd_vvvvl(v2, v2, v1, l);
|
||||||
|
v1 = _vel_vaddul_vsvl(1, v1, l);
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s1, v0, v1, l);
|
||||||
|
v3 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
|
||||||
|
v1 = _vel_vfmuld_vsvl(s0, v3, l);
|
||||||
|
v0 = _vel_vfnmsbd_vsvvl(s0, v1, v0, l);
|
||||||
|
v0 = _vel_vfmadd_vvvvl(v1, v3, v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfsqrtd_vvl(__vr v0, int l) {
|
||||||
|
double s0, s1;
|
||||||
|
__vr v1, v2, v3;
|
||||||
|
v2 = _vel_vrsqrtdnex_vvl(v0, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
s1 = 0.5;
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline __vr _vel_approx_vfsqrts_vvl(__vr v0, int l) {
|
||||||
|
float s0, s1;
|
||||||
|
__vr v1, v2, v3;
|
||||||
|
v0 = _vel_vcvtds_vvl(v0, l);
|
||||||
|
v2 = _vel_vrsqrtdnex_vvl(v0, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
s0 = 1.0;
|
||||||
|
s1 = 0.5;
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
|
||||||
|
v1 = _vel_vfmuld_vvvl(v0, v2, l);
|
||||||
|
v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
|
||||||
|
v3 = _vel_vfmuld_vsvl(s1, v3, l);
|
||||||
|
v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
|
||||||
|
v0 = _vel_vcvtsd_vvl(v0, l);
|
||||||
|
return v0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
lib/include/velintrin_gen.h (vendored, new file): 1257 lines
(File diff suppressed because it is too large.)
lib/include/wasm_simd128.h (vendored): 4 lines changed
@@ -1405,12 +1405,12 @@ wasm_f64x2_convert_low_u32x4(v128_t __a) {
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_i32x4_trunc_sat_f64x2_zero(v128_t __a) {
-  return (v128_t)__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4((__f64x2)__a);
+  return (v128_t)__builtin_wasm_trunc_sat_s_zero_f64x2_i32x4((__f64x2)__a);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
 wasm_u32x4_trunc_sat_f64x2_zero(v128_t __a) {
-  return (v128_t)__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4((__f64x2)__a);
+  return (v128_t)__builtin_wasm_trunc_sat_u_zero_f64x2_i32x4((__f64x2)__a);
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS
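A sketch of the public wrapper whose underlying builtin is renamed above; the rename is an internal detail and user code is unaffected (illustrative only, not part of this commit; assumes a WebAssembly target with -msimd128 and that wasm_f64x2_make is available in this header):

/* Illustrative only; not part of this commit. */
#include <wasm_simd128.h>

v128_t doubles_to_saturated_i32(double a, double b) {
  v128_t v = wasm_f64x2_make(a, b);
  /* Lanes 0-1 hold the saturated conversions; lanes 2-3 are zero. */
  return wasm_i32x4_trunc_sat_f64x2_zero(v);
}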
lib/include/x86gprintrin.h (vendored): 26 lines changed
@@ -25,11 +25,29 @@
 #include <crc32intrin.h>
 #endif
 
-#define __SSC_MARK(Tag) \
-  __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
+#if defined(__i386__)
+#define __FULLBX "ebx"
+#define __TMPGPR "eax"
+#else
+// When in 64-bit target, the 32-bit operands generate a 32-bit result,
+// zero-extended to a 64-bit result in the destination general-purpose,
+// It means "mov x %ebx" will clobber the higher 32 bits of rbx, so we
+// should preserve the 64-bit register rbx.
+#define __FULLBX "rbx"
+#define __TMPGPR "rax"
+#endif
+
+#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
+
+#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
+#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
+
+#define __SSC_MARK(__Tag) \
+  __asm__ __volatile__( __SAVE_GPRBX \
                        "mov {%0, %%ebx|ebx, %0}; " \
                        ".byte 0x64, 0x67, 0x90; " \
-                       "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
-                       : "%eax");
+                       __RESTORE_GPRBX \
+                       ::"i"(__Tag) \
+                       : __TMPGPR );
 
 #endif /* __X86GPRINTRIN_H */
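A sketch of the __SSC_MARK macro reworked above, which emits the "ebx mark" NOP sequence that binary-instrumentation tools recognize as region markers (illustrative only, not part of this commit; the 0x111/0x222 start/stop tag values follow a convention used by Intel SDE examples and are an assumption here, any compile-time constant works):

/* Illustrative only; not part of this commit. Tag values are assumptions. */
#include <x86gprintrin.h>

void region_of_interest(void) {
  __SSC_MARK(0x111); /* begin marked region */
  /* ... code under analysis ... */
  __SSC_MARK(0x222); /* end marked region */
}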
lib/include/x86intrin.h (vendored): 4 lines changed
@@ -59,5 +59,9 @@
 #include <clzerointrin.h>
 #endif
 
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+    defined(__RDPRU__)
+#include <rdpruintrin.h>
+#endif
+
 #endif /* __X86INTRIN_H */
lib/include/xmmintrin.h (vendored): 12 lines changed
@@ -2086,7 +2086,7 @@ _mm_storer_ps(float *__p, __m128 __a)
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// void _mm_prefetch(const void * a, const int sel);
+/// void _mm_prefetch(const void *a, const int sel);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
@@ -2360,7 +2360,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 ///    00: assigned from bits [15:0] of \a a. \n
 ///    01: assigned from bits [31:16] of \a a. \n
 ///    10: assigned from bits [47:32] of \a a. \n
-///    11: assigned from bits [63:48] of \a a.
+///    11: assigned from bits [63:48] of \a a. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 64-bit integer vector containing the shuffled values.
 #define _mm_shuffle_pi16(a, n) \
   ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
@@ -2602,7 +2605,10 @@ void _mm_setcsr(unsigned int __i);
 ///    00: Bits [31:0] copied from the specified operand. \n
 ///    01: Bits [63:32] copied from the specified operand. \n
 ///    10: Bits [95:64] copied from the specified operand. \n
-///    11: Bits [127:96] copied from the specified operand.
+///    11: Bits [127:96] copied from the specified operand. \n
+///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
+///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
+///    <c>[b6, b4, b2, b0]</c>.
 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
 #define _mm_shuffle_ps(a, b, mask) \
   ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
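A sketch of the _MM_SHUFFLE usage that the added documentation note describes (illustrative only, not part of this commit; the helper name is invented):

/* Illustrative only; not part of this commit. */
#include <xmmintrin.h>

__m128 reverse_lanes(__m128 a) {
  /* _MM_SHUFFLE(0, 1, 2, 3) selects source lanes 3, 2, 1, 0 into result
     lanes 0, 1, 2, 3, i.e. it reverses the vector. */
  return _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3));
}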