update C language headers to clang release/14.x

upstream commit 91632c8ac97fa3daffe4ff8f1391735b5d6805e6
Andrew Kelley 2022-02-03 14:18:29 -07:00
parent 60954598e9
commit 397e055ddd
72 changed files with 113784 additions and 133285 deletions


@ -16,7 +16,7 @@
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp declare target
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
#else
@ -26,7 +26,7 @@
// To make the algorithms available for C and C++ in CUDA and OpenMP we select
// different but equivalent function versions. TODO: For OpenMP we currently
// select the native builtins as the overload support for templates is lacking.
#if !defined(__OPENMP_NVPTX__)
#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
#define _ISNANd std::isnan
#define _ISNANf std::isnan
#define _ISINFd std::isinf
@ -276,7 +276,7 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
#undef _fmaxd
#undef _fmaxf
#ifdef __OPENMP_NVPTX__
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp end declare target
#endif
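For orientation, an illustrative sketch (not part of the upstream header; the function name is invented): clang lowers `_Complex float` division to the compiler-rt helper __divsc3 shown in the hunk above, which is why this header has to provide a device-side definition of it.

// Sketch only: the '/' below is lowered by clang to a call to
// __divsc3 with the real and imaginary parts of __a and __b.
__device__ float _Complex __demo_cdiv(float _Complex __a, float _Complex __b) {
  return __a / __b;
}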


@ -483,4 +483,36 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline uint32_t __nvvm_get_smem_pointer(void *__ptr) {
return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
#endif // CUDA_VERSION >= 11000
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)


@ -16,6 +16,7 @@ extern "C" {
#if defined(__OPENMP_NVPTX__)
#define __DEVICE__
#pragma omp begin assumes ext_spmd_amenable no_openmp
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
@ -456,6 +457,11 @@ __DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);
#if defined(__OPENMP_NVPTX__)
#pragma omp end assumes ext_spmd_amenable no_openmp
#endif
#if defined(__cplusplus)
} // extern "C"
#endif


@ -345,4 +345,4 @@ __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
#pragma pop_macro("__DEVICE_VOID__")
#pragma pop_macro("__FAST_OR_SLOW")
#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
#endif // __CLANG_CUDA_MATH_H__


@ -41,6 +41,7 @@
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__
// Preserve common macros that will be changed below by us or by CUDA
@ -64,9 +65,9 @@
#endif
// Make largest subset of device functions available during host
// compilation -- SM_35 for the time being.
// compilation.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#define __CUDA_ARCH__ 9999
#endif
#include "__clang_cuda_builtin_vars.h"
@ -205,11 +206,6 @@ inline __host__ double __signbitd(double x) {
#endif
#if CUDA_VERSION >= 9000
// CUDA-9.2 needs host-side memcpy for some host functions in
// device_functions.hpp
#if CUDA_VERSION >= 9020
#include <string.h>
#endif
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
@ -275,7 +271,38 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#undef __CUDABE__
#endif
#include "sm_20_atomic_functions.hpp"
// Predicate functions used in `__builtin_assume` need to have no side effects.
// However, sm_20_intrinsics.hpp defines them with neither the pure nor the
// const attribute. Rename the definitions from sm_20_intrinsics.hpp and
// re-define them as const ones.
#pragma push_macro("__isGlobal")
#pragma push_macro("__isShared")
#pragma push_macro("__isConstant")
#pragma push_macro("__isLocal")
#define __isGlobal __ignored_cuda___isGlobal
#define __isShared __ignored_cuda___isShared
#define __isConstant __ignored_cuda___isConstant
#define __isLocal __ignored_cuda___isLocal
#include "sm_20_intrinsics.hpp"
#pragma pop_macro("__isGlobal")
#pragma pop_macro("__isShared")
#pragma pop_macro("__isConstant")
#pragma pop_macro("__isLocal")
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__ __attribute__((const))
__DEVICE__ unsigned int __isGlobal(const void *p) {
return __nvvm_isspacep_global(p);
}
__DEVICE__ unsigned int __isShared(const void *p) {
return __nvvm_isspacep_shared(p);
}
__DEVICE__ unsigned int __isConstant(const void *p) {
return __nvvm_isspacep_const(p);
}
__DEVICE__ unsigned int __isLocal(const void *p) {
return __nvvm_isspacep_local(p);
}
#pragma pop_macro("__DEVICE__")
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
@ -330,6 +357,34 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#pragma pop_macro("__host__")
// __clang_cuda_texture_intrinsics.h must be included first in order to provide
// an implementation of __nv_tex_surf_handler that CUDA's headers depend on.
// The implementation requires C++11 and only works with CUDA-9 or newer.
#if __cplusplus >= 201103L && CUDA_VERSION >= 9000
// clang-format off
#include <__clang_cuda_texture_intrinsics.h>
// clang-format on
#else
#if CUDA_VERSION >= 9000
// Provide a hint that texture support needs C++11.
template <typename T> struct __nv_tex_needs_cxx11 {
const static bool value = false;
};
template <class T>
__host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr,
cudaTextureObject_t obj,
float x) {
_Static_assert(__nv_tex_needs_cxx11<T>::value,
"Texture support requires C++11");
}
#else
// Textures in CUDA-8 and older are not supported by clang. There's no
// convenient way to intercept texture use in these versions, so we can't
// produce a meaningful error. The source code that attempts to use textures
// will continue to fail as it does now.
#endif // CUDA_VERSION
#endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000
#include "texture_fetch_functions.h"
#include "texture_indirect_functions.h"
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.


@ -0,0 +1,740 @@
/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*
* This header provides in-header implementations for NVCC's built-in
* __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
* built-in is unusual as it's actually a set of function overloads that use the
* first string literal argument as one of the overload parameters.
*/
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// __nv_tex_surf_handler() provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...) \
::__cuda_tex::__tex_fetch< \
::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
__VA_ARGS__)
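To show how the macro is meant to be used, a hedged sketch (not part of the header; it assumes CUDA's cudaTextureObject_t and float4 types are in scope): CUDA's texture wrappers pass the operation name as the first argument, and the hash of that string selects the __tex_fetch specialization.

// Sketch of a caller, roughly what CUDA's texture wrappers do:
__device__ float4 __demo_tex1dfetch(cudaTextureObject_t __obj, int __x) {
  float4 __val;
  __nv_tex_surf_handler("__itex1Dfetch", &__val, __obj, __x);
  return __val;
}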
#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")
// Put all functions into an anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from files compiled with
// -fgpu-rdc. E.g. a library and an app using it may be built with a different
// version of this header file.
namespace {
// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {
// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We can not use string literals for that as that would
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the hash
// function has a collision for any of the texture operations, the compilation
// will fail due to an attempt to redefine a tag with the same value. If the
// header compiles, then the hash function is good enough for the job.
constexpr int __tex_len(const char *s) {
return (s[0] == 0) ? 0
: (s[1] == 0) ? 1
: (s[2] == 0) ? 2
: (s[3] == 0) ? 3
: (s[4] == 0) ? 4
: (s[5] == 0) ? 5
: (s[6] == 0) ? 6
: (s[7] == 0) ? 7
: (s[8] == 0) ? 8
: (s[9] == 0) ? 9
: (s[10] == 0) ? 10
: (s[11] == 0) ? 11
: (s[12] == 0) ? 12
: (s[13] == 0) ? 13
: (s[14] == 0) ? 14
: (s[15] == 0) ? 15
: (s[16] == 0) ? 16
: (s[17] == 0) ? 17
: (s[18] == 0) ? 18
: (s[19] == 0) ? 19
: (s[20] == 0) ? 20
: (s[21] == 0) ? 21
: (s[22] == 0) ? 22
: (s[23] == 0) ? 23
: (s[24] == 0) ? 24
: (s[25] == 0) ? 25
: (s[26] == 0) ? 26
: (s[27] == 0) ? 27
: (s[28] == 0) ? 28
: (s[29] == 0) ? 29
: (s[30] == 0) ? 30
: (s[31] == 0) ? 31
: 32;
}
constexpr int __tex_hash_map(int c) {
return (c == 49) ? 10
: (c == 50) ? 0
: (c == 51) ? 100
: (c == 52) ? 30
: (c == 67) ? 10
: (c == 68) ? 0
: (c == 69) ? 25
: (c == 72) ? 70
: (c == 77) ? 0
: (c == 96) ? 44
: (c == 99) ? 10
: (c == 100) ? 5
: (c == 101) ? 60
: (c == 102) ? 40
: (c == 103) ? 70
: (c == 104) ? 25
: (c == 112) ? 0
: (c == 114) ? 45
: (c == 117) ? 5
: (c == 118) ? 85
: (c == 120) ? 20
: 225;
}
constexpr int __tex_op_hash(const char *str) {
return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
__tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
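To make the tag selection concrete, a worked example (an illustrative addition, not upstream) tracing the constexpr helpers above for one operation name:

// Illustrative: for "__tex2D_v2",
//   __tex_len("__tex2D_v2")            == 10
//   __tex_hash_map('_' + 1)  /* 96 */  == 44   (str[7] is '_')
//   __tex_hash_map('D')      /* 68 */  ==  0   (str[6])
//   __tex_hash_map('2')      /* 50 */  ==  0   (str[5])
//   __tex_hash_map('2')      /* 50 */  ==  0   (last character)
// so the operation is dispatched through the type __Tag<54>.
static_assert(__tex_op_hash("__tex2D_v2") == 54, "illustrative hash check");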
// Tag type to identify particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant) \
__Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
using __base_t = float;
using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
using __base_t = char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
using __base_t = signed char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
using __base_t = unsigned char;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
using __base_t = short;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
using __base_t = unsigned short;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
using __base_t = int;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
using __base_t = unsigned int;
using __fetch_t = uint4;
};
// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
using __base_t = decltype(__T::x);
using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;
// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
// Construct inline assembly output args.
// Results are stored in a temp var __r.
// isResident bool is pointed to by __ir
// Asm args for return values. It's a 4-element vector
#define __ASM_OUT(__t) \
("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
template <> \
__device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
__rt __r; \
asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
return __r; \
}
// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
__asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to above, but returns a boolean 'isPresent' value in addition
// to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
__asm_args)
// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, uint4, __args, \
__asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
__asm_args)
// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Same, but for sparse ops. Only available on sm_60+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif
// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
__IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
// Convenience macros which convert a string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
__asm_args)
// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit it, instead of refining a copy.
#define __IMPL_ALIASI(__op, __opn) \
template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
"s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
(float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
(float __x, int __layer), "tex.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
(float __x, int __layer, float __dPdx, float __dPdy),
"tex.grad.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
(float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
"tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");
__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
" selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y)));
__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
"tex.grad.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
__IMPL_S3S("__itex2DGrad_sparse",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
(float __x, float __y, int __layer), "tex.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
__IMPL_S3S("__itex2DLayered_sparse",
(float __x, float __y, int __layer, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy),
"tex.grad.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
__IMPL_S3S(
"__itex2DLayeredGrad_sparse",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
(float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
__IMPL_S3S("__itex2DLayeredLod_sparse",
(float __x, float __y, int __layer, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
(float __x, float __y, float __level), "tex.level.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
__IMPL_S3S("__itex2DLod_sparse",
(float __x, float __y, float __level, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__level)));
// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr) \
__IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
__IDV("__tex2Dgather_rmnf_v2", variant), \
(float __x, float __y, int __comp), instr, "f32", \
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
__IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
__IDV("__tex2Dgather_v2", variant)); \
__IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
(float __x, float __y, unsigned char *__ir, int __comp), \
"{.reg .pred %%p0;\n\t" instr, "f32", \
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
"selp.u16 %4, 1, 0, %%p0; }", \
("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
// Umbrella dispatcher -- calls into specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
template <class __T>
__device__ static float4 __run(cudaTextureObject_t __obj, float __x,
float __y, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
unsigned char *__ir, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 1:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 2:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 3:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
}
}
};
#endif
// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
"tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");
__IMPL_S3S("__itex3D_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.3d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
__IMPL_S3S("__itex3DGrad_sparse",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.3d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
__IMPL_S3S("__itex3DLod_sparse",
(float __x, float __y, float __z, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
(float __x, float __y, float __z), "tex.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
__IMPL_S3S("__itexCubemap_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.cube.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
(float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
(float __x, float __y, float __z, int __layer, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
"f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
"f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
(float __x, float __y, float __z, int __layer, float __level),
"tex.level.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.cube.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
// Helper class for extracting slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
template <int __NElements = sizeof(__DestT) /
sizeof(typename __TypeInfoT<__DestT>::__base_t)>
__device__ static __DestT __run(__SrcT __v);
template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
template <> __device__ static __DestT __run<2>(__SrcT __v) {
return {__v.x, __v.y};
}
template <> __device__ static __DestT __run<3>(__SrcT __v) {
return {__v.x, __v.y, __v.z};
}
template <> __device__ static __DestT __run<4>(__SrcT __v) {
return {__v.x, __v.y, __v.z, __v.w};
}
};
// These are the top-level function overloads the __nv_tex_surf_handler expands
// to. Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::run` variant should be used to
// fetch texture data and which `__convert::run` is needed to convert it into
// appropriate return type.
// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
// and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
cudaTextureObject_t __obj;
asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
return __obj;
}
// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, float4>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeElementType> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__


@ -50,6 +50,9 @@ extern "C" {
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#else
typedef __SIZE_TYPE__ size_t;
// Define macros which are needed to declare HIP device API's without standard
@ -74,25 +77,35 @@ typedef __SIZE_TYPE__ __hip_size_t;
extern "C" {
#endif //__cplusplus
#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return (void *) __ockl_dm_alloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__ockl_dm_dealloc((unsigned long long)__ptr);
}
#else // HIP version check
#if __HIP_ENABLE_DEVICE_MALLOC__
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return __hip_malloc(__size);
}
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
return __hip_free(__ptr);
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__hip_free(__ptr);
}
#else
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
__builtin_trap();
return (void *)0;
}
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__builtin_trap();
return (void *)0;
}
#endif
#endif // HIP version check
#ifdef __cplusplus
} // extern "C"


@ -133,7 +133,7 @@ _mm_aesimc_si128(__m128i __V)
/// An 8-bit round constant used to generate the AES encryption key.
/// \returns A 128-bit round key for AES encryption.
#define _mm_aeskeygenassist_si128(C, R) \
(__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
#undef __DEFAULT_FN_ATTRS
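The added outer parentheses make the macro behave like a function call in any expression context; a small illustrative example (an assumption-laden sketch, compiled with -maes):

#include <stddef.h>
#include <wmmintrin.h>

// Sketch: without the outer parentheses this would expand to
//   sizeof (__m128i)__builtin_ia32_aeskeygenassist128(...)
// which parses as sizeof(type-name) followed by a stray expression and fails
// to compile; with them the whole cast expression is the sizeof operand.
static size_t __demo_keygen_size(__m128i __k) {
  return sizeof _mm_aeskeygenassist_si128(__k, 0x01); // 16
}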

lib/include/altivec.h (vendored)

@ -19,6 +19,10 @@
#define __CR6_EQ_REV 1
#define __CR6_LT 2
#define __CR6_LT_REV 3
#define __CR6_GT 4
#define __CR6_GT_REV 5
#define __CR6_SO 6
#define __CR6_SO_REV 7
/* Constants for vec_test_data_class */
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
@ -1810,6 +1814,11 @@ vec_cmpeq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
return (vector bool __int128)__builtin_altivec_vcmpequq(
(vector bool __int128)__a, (vector bool __int128)__b);
}
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpeq(vector bool __int128 __a, vector bool __int128 __b) {
return (vector bool __int128)__builtin_altivec_vcmpequq(__a, __b);
}
#endif
#ifdef __POWER9_VECTOR__
@ -1887,6 +1896,11 @@ vec_cmpne(vector signed __int128 __a, vector signed __int128 __b) {
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(
(vector bool __int128)__a, (vector bool __int128)__b));
}
static __inline__ vector bool __int128 __ATTRS_o_ai
vec_cmpne(vector bool __int128 __a, vector bool __int128 __b) {
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(__a, __b));
}
#endif
/* vec_cmpnez */
@ -2472,7 +2486,7 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
#ifdef __POWER8_VECTOR__
/* vec_popcnt */
static __inline__ vector signed char __ATTRS_o_ai
static __inline__ vector unsigned char __ATTRS_o_ai
vec_popcnt(vector signed char __a) {
return __builtin_altivec_vpopcntb(__a);
}
@ -2480,7 +2494,7 @@ static __inline__ vector unsigned char __ATTRS_o_ai
vec_popcnt(vector unsigned char __a) {
return __builtin_altivec_vpopcntb(__a);
}
static __inline__ vector signed short __ATTRS_o_ai
static __inline__ vector unsigned short __ATTRS_o_ai
vec_popcnt(vector signed short __a) {
return __builtin_altivec_vpopcnth(__a);
}
@ -2488,7 +2502,7 @@ static __inline__ vector unsigned short __ATTRS_o_ai
vec_popcnt(vector unsigned short __a) {
return __builtin_altivec_vpopcnth(__a);
}
static __inline__ vector signed int __ATTRS_o_ai
static __inline__ vector unsigned int __ATTRS_o_ai
vec_popcnt(vector signed int __a) {
return __builtin_altivec_vpopcntw(__a);
}
@ -2496,7 +2510,7 @@ static __inline__ vector unsigned int __ATTRS_o_ai
vec_popcnt(vector unsigned int __a) {
return __builtin_altivec_vpopcntw(__a);
}
static __inline__ vector signed long long __ATTRS_o_ai
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_popcnt(vector signed long long __a) {
return __builtin_altivec_vpopcntd(__a);
}
@ -3049,13 +3063,10 @@ static __inline__ vector unsigned char __ATTRS_o_ai
vec_xl_len_r(const unsigned char *__a, size_t __b) {
vector unsigned char __res =
(vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
#ifdef __LITTLE_ENDIAN__
vector unsigned char __mask =
(vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
__res = (vector unsigned char)__builtin_altivec_vperm_4si(
return (vector unsigned char)__builtin_altivec_vperm_4si(
(vector int)__res, (vector int)__res, __mask);
#endif
return __res;
}
// vec_xst_len
@ -3130,15 +3141,11 @@ static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
unsigned char *__b,
size_t __c) {
#ifdef __LITTLE_ENDIAN__
vector unsigned char __mask =
(vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
vector unsigned char __res =
__builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
#else
return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
#endif
}
#endif
#endif
@ -7106,6 +7113,11 @@ vec_orc(vector float __a, vector bool int __b) {
return (vector float)((vector unsigned int)__a | ~__b);
}
static __inline__ vector float __ATTRS_o_ai vec_orc(vector float __a,
vector float __b) {
return (vector float)((vector unsigned int)__a | ~(vector unsigned int)__b);
}
static __inline__ vector signed long long __ATTRS_o_ai
vec_orc(vector signed long long __a, vector signed long long __b) {
return __a | ~__b;
@ -7150,6 +7162,12 @@ static __inline__ vector double __ATTRS_o_ai
vec_orc(vector bool long long __a, vector double __b) {
return (vector double)(__a | ~(vector unsigned long long)__b);
}
static __inline__ vector double __ATTRS_o_ai vec_orc(vector double __a,
vector double __b) {
return (vector double)((vector bool long long)__a |
~(vector unsigned long long)__b);
}
#endif
/* vec_vor */
@ -8399,9 +8417,20 @@ static __inline__ vector float __ATTRS_o_ai vec_round(vector float __a) {
}
#ifdef __VSX__
#ifdef __XL_COMPAT_ALTIVEC__
static __inline__ vector double __ATTRS_o_ai vec_rint(vector double __a);
static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
double __fpscr = __builtin_readflm();
__builtin_setrnd(0);
vector double __rounded = vec_rint(__a);
__builtin_setflm(__fpscr);
return __rounded;
}
#else
static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
return __builtin_vsx_xvrdpi(__a);
}
#endif
/* vec_rint */
@ -8839,7 +8868,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sl(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sl((vector unsigned long long)__a, __b);
}
#else
#elif defined(__VSX__)
static __inline__ vector unsigned char __ATTRS_o_ai
vec_vspltb(vector unsigned char __a, unsigned char __b);
static __inline__ vector unsigned long long __ATTRS_o_ai
@ -8885,7 +8914,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sl(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sl((vector unsigned long long)__a, __b);
}
#endif
#endif /* __VSX__ */
/* vec_vslb */
@ -10350,7 +10379,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sr(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sr((vector unsigned long long)__a, __b);
}
#else
#elif defined(__VSX__)
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
__b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
@ -10394,7 +10423,7 @@ static __inline__ vector long long __ATTRS_o_ai
vec_sr(vector long long __a, vector unsigned long long __b) {
return (vector long long)vec_sr((vector unsigned long long)__a, __b);
}
#endif
#endif /* __VSX__ */
/* vec_vsrb */
@ -10480,7 +10509,7 @@ static __inline__ vector unsigned long long __ATTRS_o_ai
vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
return (vector unsigned long long)((vector signed long long)__a >> __b);
}
#else
#elif defined(__VSX__)
static __inline__ vector signed long long __ATTRS_o_ai
vec_sra(vector signed long long __a, vector unsigned long long __b) {
__b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
@ -10492,7 +10521,7 @@ vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
__b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
return (vector unsigned long long)((vector signed long long)__a >> __b);
}
#endif
#endif /* __VSX__ */
/* vec_vsrab */
@ -13441,74 +13470,74 @@ vec_vxor(vector bool long long __a, vector bool long long __b) {
/* vec_extract */
static __inline__ signed char __ATTRS_o_ai vec_extract(vector signed char __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0xf];
}
static __inline__ unsigned char __ATTRS_o_ai
vec_extract(vector unsigned char __a, unsigned int __b) {
vec_extract(vector unsigned char __a, signed int __b) {
return __a[__b & 0xf];
}
static __inline__ unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0xf];
}
static __inline__ signed short __ATTRS_o_ai vec_extract(vector signed short __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x7];
}
static __inline__ unsigned short __ATTRS_o_ai
vec_extract(vector unsigned short __a, unsigned int __b) {
vec_extract(vector unsigned short __a, signed int __b) {
return __a[__b & 0x7];
}
static __inline__ unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x7];
}
static __inline__ signed int __ATTRS_o_ai vec_extract(vector signed int __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector bool int __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
#ifdef __VSX__
static __inline__ signed long long __ATTRS_o_ai
vec_extract(vector signed long long __a, unsigned int __b) {
vec_extract(vector signed long long __a, signed int __b) {
return __a[__b & 0x1];
}
static __inline__ unsigned long long __ATTRS_o_ai
vec_extract(vector unsigned long long __a, unsigned int __b) {
vec_extract(vector unsigned long long __a, signed int __b) {
return __a[__b & 0x1];
}
static __inline__ unsigned long long __ATTRS_o_ai
vec_extract(vector bool long long __a, unsigned int __b) {
vec_extract(vector bool long long __a, signed int __b) {
return __a[__b & 0x1];
}
static __inline__ double __ATTRS_o_ai vec_extract(vector double __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x1];
}
#endif
static __inline__ float __ATTRS_o_ai vec_extract(vector float __a,
unsigned int __b) {
signed int __b) {
return __a[__b & 0x3];
}
@ -13568,82 +13597,82 @@ vec_extract_fp32_from_shortl(vector unsigned short __a) {
static __inline__ vector signed char __ATTRS_o_ai
vec_insert(signed char __a, vector signed char __b, int __c) {
__b[__c] = __a;
__b[__c & 0xF] = __a;
return __b;
}
static __inline__ vector unsigned char __ATTRS_o_ai
vec_insert(unsigned char __a, vector unsigned char __b, int __c) {
__b[__c] = __a;
__b[__c & 0xF] = __a;
return __b;
}
static __inline__ vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
vector bool char __b,
int __c) {
__b[__c] = __a;
__b[__c & 0xF] = __a;
return __b;
}
static __inline__ vector signed short __ATTRS_o_ai
vec_insert(signed short __a, vector signed short __b, int __c) {
__b[__c] = __a;
__b[__c & 0x7] = __a;
return __b;
}
static __inline__ vector unsigned short __ATTRS_o_ai
vec_insert(unsigned short __a, vector unsigned short __b, int __c) {
__b[__c] = __a;
__b[__c & 0x7] = __a;
return __b;
}
static __inline__ vector bool short __ATTRS_o_ai
vec_insert(unsigned short __a, vector bool short __b, int __c) {
__b[__c] = __a;
__b[__c & 0x7] = __a;
return __b;
}
static __inline__ vector signed int __ATTRS_o_ai
vec_insert(signed int __a, vector signed int __b, int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
static __inline__ vector unsigned int __ATTRS_o_ai
vec_insert(unsigned int __a, vector unsigned int __b, int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
static __inline__ vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
vector bool int __b,
int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
#ifdef __VSX__
static __inline__ vector signed long long __ATTRS_o_ai
vec_insert(signed long long __a, vector signed long long __b, int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
static __inline__ vector bool long long __ATTRS_o_ai
vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
vector double __b,
int __c) {
__b[__c] = __a;
__b[__c & 0x1] = __a;
return __b;
}
#endif
@ -13651,7 +13680,7 @@ static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
static __inline__ vector float __ATTRS_o_ai vec_insert(float __a,
vector float __b,
int __c) {
__b[__c] = __a;
__b[__c & 0x3] = __a;
return __b;
}
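One visible effect of the new index masking, shown as a hedged example (the function name is invented): out-of-range element indices now wrap modulo the element count instead of writing past the end of the vector.

// Sketch: with "__b[__c & 0x3]" an index of 5 updates element 5 & 0x3 == 1.
static vector float __demo_insert(vector float __v, float __x) {
  return vec_insert(__x, __v, 5);
}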
@ -14812,42 +14841,43 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
#ifdef __VSX__
static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
vector signed long long __b) {
#ifdef __POWER8_VECTOR__
return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
#else
// No vcmpequd on Power7 so we xor the two vectors and compare against zero as
// 32-bit elements.
return vec_all_eq((vector signed int)vec_xor(__a, __b), (vector signed int)0);
#endif
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
vector long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
(vector long long)__b);
return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
}
#endif
@ -14877,6 +14907,11 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
}
#endif
/* vec_all_ge */
@ -15822,6 +15857,11 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
}
#endif
/* vec_all_nge */
@ -16111,6 +16151,11 @@ static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
}
#endif
/* vec_any_ge */
@ -17020,43 +17065,43 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
#ifdef __VSX__
static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
vector signed long long __b) {
#ifdef __POWER8_VECTOR__
return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
#else
// Take advantage of the optimized sequence for vec_all_eq when vcmpequd is
// not available.
return !vec_all_eq(__a, __b);
#endif
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
(vector long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
(vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
vector signed long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
vector unsigned long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
vector bool long long __b) {
return __builtin_altivec_vcmpequd_p(
__CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
}
#endif
@ -17086,6 +17131,11 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned __int128 __a,
vector unsigned __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
}
static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool __int128 __a,
vector bool __int128 __b) {
return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
}
#endif
/* vec_any_nge */
@ -17203,6 +17253,7 @@ provided.
#define vec_ncipher_be __builtin_altivec_crypto_vncipher
#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
#ifdef __VSX__
static __inline__ vector unsigned long long __attribute__((__always_inline__))
__builtin_crypto_vsbox(vector unsigned long long __a) {
return __builtin_altivec_crypto_vsbox(__a);
@ -17231,6 +17282,7 @@ __builtin_crypto_vncipherlast(vector unsigned long long __a,
vector unsigned long long __b) {
return __builtin_altivec_crypto_vncipherlast(__a, __b);
}
#endif /* __VSX__ */
#define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
#define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
@ -17346,12 +17398,22 @@ vec_vbpermq(vector unsigned char __a, vector unsigned char __b) {
}
#if defined(__powerpc64__) && defined(__SIZEOF_INT128__)
static __inline__ vector unsigned long long __attribute__((__always_inline__))
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
return __builtin_altivec_vbpermq((vector unsigned char)__a,
(vector unsigned char)__b);
}
#endif
static __inline__ vector unsigned char __ATTRS_o_ai
vec_bperm(vector unsigned char __a, vector unsigned char __b) {
return __builtin_altivec_vbpermq(__a, __b);
}
#endif // __POWER8_VECTOR__
#ifdef __POWER9_VECTOR__
static __inline__ vector unsigned long long __ATTRS_o_ai
vec_bperm(vector unsigned long long __a, vector unsigned char __b) {
return __builtin_altivec_vbpermd(__a, __b);
}
#endif
@ -18198,13 +18260,13 @@ vec_expandm(vector unsigned __int128 __a) {
#define vec_cntm(__a, __mp) \
_Generic((__a), vector unsigned char \
: __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)), \
: __builtin_altivec_vcntmbb((__a), (unsigned char)(__mp)), \
vector unsigned short \
: __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)), \
: __builtin_altivec_vcntmbh((__a), (unsigned char)(__mp)), \
vector unsigned int \
: __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)), \
: __builtin_altivec_vcntmbw((__a), (unsigned char)(__mp)), \
vector unsigned long long \
: __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp)))
: __builtin_altivec_vcntmbd((__a), (unsigned char)(__mp)))
/* vec_gen[b|h|w|d|q]m */
@ -18319,10 +18381,10 @@ vec_cfuge(vector unsigned long long __a, vector unsigned long long __b) {
: __builtin_vsx_xxgenpcvdm((__a), (int)(__imm)))
#endif /* __VSX__ */
/* vec_clrl */
/* vec_clr_first */
static __inline__ vector signed char __ATTRS_o_ai
vec_clrl(vector signed char __a, unsigned int __n) {
vec_clr_first(vector signed char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrrb(__a, __n);
#else
@ -18331,7 +18393,7 @@ vec_clrl(vector signed char __a, unsigned int __n) {
}
static __inline__ vector unsigned char __ATTRS_o_ai
vec_clrl(vector unsigned char __a, unsigned int __n) {
vec_clr_first(vector unsigned char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrrb((vector signed char)__a, __n);
#else
@ -18339,10 +18401,10 @@ vec_clrl(vector unsigned char __a, unsigned int __n) {
#endif
}
/* vec_clrr */
/* vec_clr_last */
static __inline__ vector signed char __ATTRS_o_ai
vec_clrr(vector signed char __a, unsigned int __n) {
vec_clr_last(vector signed char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrlb(__a, __n);
#else
@ -18351,7 +18413,7 @@ vec_clrr(vector signed char __a, unsigned int __n) {
}
static __inline__ vector unsigned char __ATTRS_o_ai
vec_clrr(vector unsigned char __a, unsigned int __n) {
vec_clr_last(vector unsigned char __a, unsigned int __n) {
#ifdef __LITTLE_ENDIAN__
return __builtin_altivec_vclrlb((vector signed char)__a, __n);
#else
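
For callers tracking this rename, a minimal migration sketch (not part of the diff): only the spelling of the helpers changes, the arguments and the endian-dependent vclrrb/vclrlb dispatch shown above stay the same.

// Hedged sketch: clang-13 spelling on the left, clang-14 spelling on the right.
//   vec_clrl(v, n)  ->  vec_clr_first(v, n)
//   vec_clrr(v, n)  ->  vec_clr_last(v, n)
vector signed char clr_first_demo(vector signed char v, unsigned int n) {
  return vec_clr_first(v, n);   // previously spelled vec_clrl(v, n)
}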
@ -18733,36 +18795,39 @@ static __inline__ vector double __ATTRS_o_ai vec_splatid(const float __a) {
static __inline__ vector signed int __ATTRS_o_ai vec_splati_ins(
vector signed int __a, const unsigned int __b, const signed int __c) {
const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
__a[1 - __b] = __c;
__a[3 - __b] = __c;
__a[1 - __d] = __c;
__a[3 - __d] = __c;
#else
__a[__b] = __c;
__a[2 + __b] = __c;
__a[__d] = __c;
__a[2 + __d] = __c;
#endif
return __a;
}
static __inline__ vector unsigned int __ATTRS_o_ai vec_splati_ins(
vector unsigned int __a, const unsigned int __b, const unsigned int __c) {
const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
__a[1 - __b] = __c;
__a[3 - __b] = __c;
__a[1 - __d] = __c;
__a[3 - __d] = __c;
#else
__a[__b] = __c;
__a[2 + __b] = __c;
__a[__d] = __c;
__a[2 + __d] = __c;
#endif
return __a;
}
static __inline__ vector float __ATTRS_o_ai
vec_splati_ins(vector float __a, const unsigned int __b, const float __c) {
const unsigned int __d = __b & 0x01;
#ifdef __LITTLE_ENDIAN__
__a[1 - __b] = __c;
__a[3 - __b] = __c;
__a[1 - __d] = __c;
__a[3 - __d] = __c;
#else
__a[__b] = __c;
__a[2 + __b] = __c;
__a[__d] = __c;
__a[2 + __d] = __c;
#endif
return __a;
}
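
The hunks above also fix the insert index: it is now masked with & 0x01 before indexing, so only the element pair selected by bit 0 of __b is written. A minimal sketch of the post-fix behaviour (not part of the diff), derived directly from the code above:

// Hedged sketch: with __d = __b & 0x01, vec_splati_ins writes elements
// (1 - __d) and (3 - __d) on little-endian targets, or (__d) and (2 + __d)
// on big-endian targets; any higher bits of __b are ignored after this change.
vector float splat_pair(vector float v) {
  return vec_splati_ins(v, 1, 42.0f);
}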
@ -18976,6 +19041,51 @@ vec_sra(vector signed __int128 __a, vector unsigned __int128 __b) {
#endif /* __SIZEOF_INT128__ */
#endif /* __POWER10_VECTOR__ */
#ifdef __POWER8_VECTOR__
#define __bcdadd(__a, __b, __ps) __builtin_ppc_bcdadd((__a), (__b), (__ps))
#define __bcdsub(__a, __b, __ps) __builtin_ppc_bcdsub((__a), (__b), (__ps))
static __inline__ long __bcdadd_ofl(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdadd_p(__CR6_SO, __a, __b);
}
static __inline__ long __bcdsub_ofl(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_SO, __a, __b);
}
static __inline__ long __bcd_invalid(vector unsigned char __a) {
return __builtin_ppc_bcdsub_p(__CR6_SO, __a, __a);
}
static __inline__ long __bcdcmpeq(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_EQ, __a, __b);
}
static __inline__ long __bcdcmplt(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_LT, __a, __b);
}
static __inline__ long __bcdcmpgt(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_GT, __a, __b);
}
static __inline__ long __bcdcmple(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_GT_REV, __a, __b);
}
static __inline__ long __bcdcmpge(vector unsigned char __a,
vector unsigned char __b) {
return __builtin_ppc_bcdsub_p(__CR6_LT_REV, __a, __b);
}
#endif // __POWER8_VECTOR__
#undef __ATTRS_o_ai
#endif /* __ALTIVEC_H */
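
The new BCD helpers above wrap the POWER8 bcdadd./bcdsub. instructions. A minimal usage sketch (not part of the diff), assuming the operands already hold valid signed packed-decimal data in vector unsigned char registers:

// Hedged sketch: validity check, decimal add with overflow test, and an
// equality compare. The third argument to __bcdadd selects the preferred
// sign encoding (0 here).
int bcd_demo(vector unsigned char a, vector unsigned char b,
             vector unsigned char *sum) {
  if (__bcd_invalid(a) || __bcd_invalid(b))
    return -1;                   // operands are not valid packed decimal
  *sum = __bcdadd(a, b, 0);
  if (__bcdadd_ofl(a, b))
    return 1;                    // decimal addition overflowed
  return (int)__bcdcmpeq(a, b);  // 1 if a and b compare equal as decimals
}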

View File

@ -10,6 +10,10 @@
#ifndef __AMMINTRIN_H
#define __AMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */

View File

@ -314,7 +314,7 @@ typedef struct __tile1024i_str {
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_loadd(__tile1024i *dst, const void *base,
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
@ -335,7 +335,7 @@ static void __tile_loadd(__tile1024i *dst, const void *base,
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stream_loadd(__tile1024i *dst, const void *base,
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
@ -357,7 +357,7 @@ static void __tile_stream_loadd(__tile1024i *dst, const void *base,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -380,7 +380,7 @@ static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -403,7 +403,7 @@ static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -426,7 +426,7 @@ static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
@ -446,7 +446,8 @@ static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
__tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
@ -459,7 +460,7 @@ static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
/// \param dst
/// The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static void __tile_zero(__tile1024i *dst) {
static __inline__ void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
@ -479,7 +480,7 @@ static void __tile_zero(__tile1024i *dst) {
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
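
Taken together, the __tile1024i wrappers above (now marked __inline__ as well as static) give a C-level AMX workflow. A minimal sketch (not part of the diff), assuming tile configuration has already been set up via the usual ldtilecfg path and building with -mamx-tile -mamx-int8; the sizes and buffers are illustrative placeholders:

#include <immintrin.h>
#include <stddef.h>

// Hedged sketch: C += A * B on int8 tiles using the wrappers from this header.
void amx_matmul(const void *bufA, const void *bufB, void *bufC, size_t stride) {
  __tile1024i a = {16, 64};   // 16 rows x 64 bytes per row
  __tile1024i b = {16, 64};
  __tile1024i c = {16, 64};
  __tile_loadd(&a, bufA, stride);
  __tile_loadd(&b, bufB, stride);
  __tile_zero(&c);
  __tile_dpbssd(&c, a, b);          // signed-byte dot products accumulated into c
  __tile_stored(bufC, stride, c);
}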

View File

@ -730,6 +730,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
#endif
/* Memory Operations Intrinsics */
#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
__builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
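
A minimal sketch of the new MOPS + MTE helper (not part of the diff), assuming a target where both __ARM_FEATURE_MOPS and __ARM_FEATURE_MEMORY_TAGGING are defined and the buffer lives in a tag-enabled memory region; __arm_mte_create_random_tag is the existing MTE intrinsic from this header:

#include <arm_acle.h>
#include <stddef.h>

// Hedged sketch: pick a random tag for buf, then memset the region while
// storing that tag into the corresponding allocation tags.
void *zero_and_tag(void *buf, size_t size) {
  void *tagged = __arm_mte_create_random_tag(buf, 0);
  __arm_mops_memset_tag(tagged, 0, size);
  return tagged;
}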
/* Transactional Memory Extension (TME) Intrinsics */
#if __ARM_FEATURE_TME

9420
lib/include/arm_neon.h vendored

File diff suppressed because it is too large

View File

@ -20,25 +20,25 @@
/* SSE4 Multiple Packed Sums of Absolute Difference. */
#define _mm256_mpsadbw_epu8(X, Y, M) \
(__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M))
((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
(__v32qi)(__m256i)(Y), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi8(__m256i __a)
{
return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi16(__m256i __a)
{
return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_abs_epi32(__m256i __a)
{
return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
return (__m256i)__builtin_elementwise_abs((__v8si)__a);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
@ -114,8 +114,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
}
#define _mm256_alignr_epi8(a, b, n) \
(__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n))
((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (n)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_and_si256(__m256i __a, __m256i __b)
@ -149,8 +149,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
}
#define _mm256_blend_epi16(V1, V2, M) \
(__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
(__v16hi)(__m256i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@ -253,73 +253,73 @@ _mm256_madd_epi16(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_max_epu32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu8(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_min_epu32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
}
static __inline__ int __DEFAULT_FN_ATTRS256
@ -467,13 +467,13 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
}
#define _mm256_shuffle_epi32(a, imm) \
(__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
#define _mm256_shufflehi_epi16(a, imm) \
(__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
#define _mm256_shufflelo_epi16(a, imm) \
(__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sign_epi8(__m256i __a, __m256i __b)
@ -494,10 +494,10 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
}
#define _mm256_slli_si256(a, imm) \
(__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
#define _mm256_bslli_epi128(a, imm) \
(__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_slli_epi16(__m256i __a, int __count)
@ -560,10 +560,10 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
}
#define _mm256_srli_si256(a, imm) \
(__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
#define _mm256_bsrli_epi128(a, imm) \
(__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srli_epi16(__m256i __a, int __count)
@ -743,12 +743,12 @@ _mm256_broadcastsi128_si256(__m128i __X)
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
#define _mm_blend_epi32(V1, V2, M) \
(__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M))
((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))
#define _mm256_blend_epi32(V1, V2, M) \
(__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_broadcastb_epi8(__m128i __X)
@ -806,7 +806,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
}
#define _mm256_permute4x64_pd(V, M) \
(__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@ -815,17 +815,17 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
}
#define _mm256_permute4x64_epi64(V, M) \
(__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
#define _mm256_permute2x128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
#define _mm256_extracti128_si256(V, M) \
(__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
#define _mm256_inserti128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M))
((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
(__v2di)(__m128i)(V2), (int)(M)))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskload_epi32(int const *__X, __m256i __M)
@ -936,211 +936,211 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
}
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
(__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s))
(__v2df)(__m128d)(mask), (s)))
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)(__m256d)(mask), (s))
(__v4df)(__m256d)(mask), (s)))
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
(__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)(__m128d)(mask), (s))
(__v2df)(__m128d)(mask), (s)))
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)(__m256d)(mask), (s))
(__v4df)(__m256d)(mask), (s)))
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s))
(__v4sf)(__m128)(mask), (s)))
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)(__m256)(mask), (s))
(__v8sf)(__m256)(mask), (s)))
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)(__m128)(mask), (s))
(__v4sf)(__m128)(mask), (s)))
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)(__m128)(mask), (s))
(__v4sf)(__m128)(mask), (s)))
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4si)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s))
(__v4si)(__m128i)(mask), (s)))
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
(int const *)(m), \
(__v8si)(__m256i)(i), \
(__v8si)(__m256i)(mask), (s))
(__v8si)(__m256i)(mask), (s)))
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v2di)(__m128i)(i), \
(__v4si)(__m128i)(mask), (s))
(__v4si)(__m128i)(mask), (s)))
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
(int const *)(m), \
(__v4di)(__m256i)(i), \
(__v4si)(__m128i)(mask), (s))
(__v4si)(__m128i)(mask), (s)))
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s))
(__v2di)(__m128i)(mask), (s)))
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)(__m256i)(mask), (s))
(__v4di)(__m256i)(mask), (s)))
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
(__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)(__m128i)(mask), (s))
(__v2di)(__m128i)(mask), (s)))
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
(__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)(__m256i)(mask), (s))
(__v4di)(__m256i)(mask), (s)))
#define _mm_i32gather_pd(m, i, s) \
(__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s))
(s)))
#define _mm256_i32gather_pd(m, i, s) \
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4si)(__m128i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s))
(s)))
#define _mm_i64gather_pd(m, i, s) \
(__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
(double const *)(m), \
(__v2di)(__m128i)(i), \
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
_mm_setzero_pd()), \
(s))
(s)))
#define _mm256_i64gather_pd(m, i, s) \
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
(double const *)(m), \
(__v4di)(__m256i)(i), \
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
_mm256_setzero_pd(), \
_CMP_EQ_OQ), \
(s))
(s)))
#define _mm_i32gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4si)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
(s)))
#define _mm256_i32gather_ps(m, i, s) \
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
(float const *)(m), \
(__v8si)(__m256i)(i), \
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
_mm256_setzero_ps(), \
_CMP_EQ_OQ), \
(s))
(s)))
#define _mm_i64gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v2di)(__m128i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
(s)))
#define _mm256_i64gather_ps(m, i, s) \
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
(float const *)(m), \
(__v4di)(__m256i)(i), \
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
_mm_setzero_ps()), \
(s))
(s)))
#define _mm_i32gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4si)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm256_i32gather_epi32(m, i, s) \
(__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
(int const *)(m), (__v8si)(__m256i)(i), \
(__v8si)_mm256_set1_epi32(-1), (s))
(__v8si)_mm256_set1_epi32(-1), (s)))
#define _mm_i64gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v2di)(__m128i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm256_i64gather_epi32(m, i, s) \
(__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
(int const *)(m), (__v4di)(__m256i)(i), \
(__v4si)_mm_set1_epi32(-1), (s))
(__v4si)_mm_set1_epi32(-1), (s)))
#define _mm_i32gather_epi64(m, i, s) \
(__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s))
(__v2di)_mm_set1_epi64x(-1), (s)))
#define _mm256_i32gather_epi64(m, i, s) \
(__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4si)(__m128i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s))
(__v4di)_mm256_set1_epi64x(-1), (s)))
#define _mm_i64gather_epi64(m, i, s) \
(__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
(long long const *)(m), \
(__v2di)(__m128i)(i), \
(__v2di)_mm_set1_epi64x(-1), (s))
(__v2di)_mm_set1_epi64x(-1), (s)))
#define _mm256_i64gather_epi64(m, i, s) \
(__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
(long long const *)(m), \
(__v4di)(__m256i)(i), \
(__v4di)_mm256_set1_epi64x(-1), (s))
(__v4di)_mm256_set1_epi64x(-1), (s)))
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
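
The mechanical change running through this file is wrapping every macro expansion in one extra pair of parentheses so the result behaves like a single value of the cast-to type. A hedged illustration (not part of the diff) of why that matters, building with -mavx2:

#include <immintrin.h>

// With the clang-14 expansion, the subscript below applies to the whole
// ((__m256i)(...)) result, extracting a 64-bit lane via the vector-subscript
// extension. With the old expansion the [0] bound to the builtin's raw return
// value before the cast, giving the expression a different, unintended type.
long long blend_low_lane(__m256i a, __m256i b) {
  return _mm256_blend_epi32(a, b, 0x0F)[0];
}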

View File

@ -232,7 +232,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
///
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from convertion of __A
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
@ -247,7 +247,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
/// bit is not set.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from convertion of __A
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
@ -265,7 +265,7 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
/// A 16-bit mask.
/// \param __A
/// A 256-bit vector of [16 x bfloat].
/// \returns A 512-bit vector of [16 x float] come from convertion of __A
/// \returns A 512-bit vector of [16 x float] come from conversion of __A
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(

View File

@ -178,16 +178,16 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B)
}
#define _kshiftli_mask32(A, I) \
(__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))
((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)))
#define _kshiftri_mask32(A, I) \
(__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))
((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)))
#define _kshiftli_mask64(A, I) \
(__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))
((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)))
#define _kshiftri_mask64(A, I) \
(__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))
((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask32_u32(__mmask32 __A) {
@ -232,44 +232,44 @@ _store_mask64(__mmask64 *__A, __mmask64 __B) {
/* Integer compare */
#define _mm512_cmp_epi8_mask(a, b, p) \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1)
(__mmask64)-1))
#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m))
(__mmask64)(m)))
#define _mm512_cmp_epu8_mask(a, b, p) \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)-1)
(__mmask64)-1))
#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
(__v64qi)(__m512i)(b), (int)(p), \
(__mmask64)(m))
(__mmask64)(m)))
#define _mm512_cmp_epi16_mask(a, b, p) \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm512_cmp_epu16_mask(a, b, p) \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
(__v32hi)(__m512i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm512_cmpeq_epi8_mask(A, B) \
_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
@ -485,7 +485,7 @@ _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi8 (__m512i __A)
{
return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A);
return (__m512i)__builtin_elementwise_abs((__v64qs)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -507,7 +507,7 @@ _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi16 (__m512i __A)
{
return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A);
return (__m512i)__builtin_elementwise_abs((__v32hi)__A);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -751,7 +751,7 @@ _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B);
return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -773,7 +773,7 @@ _mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B);
return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -796,7 +796,7 @@ _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B);
return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -818,7 +818,7 @@ _mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B);
return (__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -840,7 +840,7 @@ _mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B);
return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -862,7 +862,7 @@ _mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B);
return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -884,7 +884,7 @@ _mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu8 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B);
return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -906,7 +906,7 @@ _mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu16 (__m512i __A, __m512i __B)
{
return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B);
return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -1428,36 +1428,36 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
#define _mm512_shufflehi_epi16(A, imm) \
(__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))
((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)))
#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflehi_epi16((A), \
(imm)), \
(__v32hi)(__m512i)(W))
(__v32hi)(__m512i)(W)))
#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflehi_epi16((A), \
(imm)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shufflelo_epi16(A, imm) \
(__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))
((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)))
#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflelo_epi16((A), \
(imm)), \
(__v32hi)(__m512i)(W))
(__v32hi)(__m512i)(W)))
#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shufflelo_epi16((A), \
(imm)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi16(__m512i __A, __m512i __B)
@ -1527,7 +1527,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
}
#define _mm512_bslli_epi128(a, imm) \
(__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi16(__m512i __A, __m512i __B)
@ -1664,7 +1664,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
}
#define _mm512_bsrli_epi128(a, imm) \
(__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
@ -1984,32 +1984,32 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
}
#define _mm512_alignr_epi8(A, B, N) \
(__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(N))
((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(N)))
#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
(__v64qi)(__m512i)(W))
(__v64qi)(__m512i)(W)))
#define _mm512_maskz_alignr_epi8(U, A, B, N) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
(__v64qi)(__m512i)_mm512_setzero_si512())
(__v64qi)(__m512i)_mm512_setzero_si512()))
#define _mm512_dbsad_epu8(A, B, imm) \
(__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(imm))
((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), (int)(imm)))
#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
(__v32hi)(__m512i)(W))
(__v32hi)(__m512i)(W)))
#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sad_epu8 (__m512i __A, __m512i __B)

View File

@ -121,10 +121,10 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B)
}
#define _kshiftli_mask8(A, I) \
(__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))
((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)))
#define _kshiftri_mask8(A, I) \
(__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))
((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)))
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask8_u32(__mmask8 __A) {
@ -342,19 +342,19 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvt_roundpd_epi64(A, R) \
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epu64 (__m512d __A) {
@ -381,19 +381,19 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvt_roundpd_epu64(A, R) \
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epi64 (__m256 __A) {
@ -420,19 +420,19 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvt_roundps_epi64(A, R) \
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epu64 (__m256 __A) {
@ -459,19 +459,19 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvt_roundps_epu64(A, R) \
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512d __DEFAULT_FN_ATTRS512
@ -494,19 +494,19 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepi64_pd(A, R) \
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ps (__m512i __A) {
@ -533,19 +533,19 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepi64_ps(A, R) \
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)(__m256)(W), (__mmask8)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
@ -573,19 +573,19 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvtt_roundpd_epi64(A, R) \
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epu64 (__m512d __A) {
@ -612,19 +612,19 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
}
#define _mm512_cvtt_roundpd_epu64(A, R) \
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epi64 (__m256 __A) {
@ -651,19 +651,19 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvtt_roundps_epi64(A, R) \
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epu64 (__m256 __A) {
@ -690,19 +690,19 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
}
#define _mm512_cvtt_roundps_epu64(A, R) \
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)(__m512i)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
(__v8di)_mm512_setzero_si512(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
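
All of the *_round_* conversion macros in this block now carry the same outer parentheses; their last argument is still the usual rounding/SAE control. A small usage sketch (not part of the diff), building with -mavx512dq:

#include <immintrin.h>

// Hedged sketch: convert eight doubles to signed 64-bit integers with
// round-to-nearest-even and floating-point exceptions suppressed.
__m512i pd_to_epi64_rne(__m512d v) {
  return _mm512_cvt_roundpd_epi64(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}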
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_pd (__m512i __A) {
@ -724,20 +724,20 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepu64_pd(A, R) \
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
static __inline__ __m256 __DEFAULT_FN_ATTRS512
@ -765,290 +765,290 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
}
#define _mm512_cvt_roundepu64_ps(A, R) \
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)(__m256)(W), (__mmask8)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_range_pd(A, B, C) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_range_pd(W, U, A, B, C) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)(__m512d)(W), (__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_range_pd(U, A, B, C) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_range_round_pd(A, B, C, R) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)(__m512d)(W), (__mmask8)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
(__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(B), (int)(C), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_range_ps(A, B, C) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_range_ps(W, U, A, B, C) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_range_ps(U, A, B, C) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_range_round_ps(A, B, C, R) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)(__m512)(W), (__mmask16)(U), \
(int)(R))
(int)(R)))
#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
(__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(B), (int)(C), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), (int)(R))
(__mmask16)(U), (int)(R)))
#define _mm_range_round_ss(A, B, C, R) \
(__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8) -1, (int)(C),\
(int)(R))
(int)(R)))
#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
(__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W),\
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_range_round_ss(U, A, B, C, R) \
(__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_range_round_sd(A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8) -1, (int)(C),\
(int)(R))
(int)(R)))
#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W),\
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm_maskz_range_round_sd(U, A, B, C, R) \
(__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C),\
(int)(R))
(int)(R)))
#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION)
#define _mm512_reduce_pd(A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_reduce_pd(W, U, A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_reduce_pd(U, A, B) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_reduce_ps(A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_mask_reduce_ps(W, U, A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)(__m512)(W), \
(__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_maskz_reduce_ps(U, A, B) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm512_reduce_round_pd(A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)(__m512d)(W), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
(__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(U), (int)(R))
(__mmask8)(U), (int)(R)))
#define _mm512_reduce_round_ps(A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)(__m512)(W), \
(__mmask16)(U), (int)(R))
(__mmask16)(U), (int)(R)))
#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(U), (int)(R))
(__mmask16)(U), (int)(R)))
#define _mm_reduce_ss(A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
(int)(C), _MM_FROUND_CUR_DIRECTION)
(int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_mask_reduce_ss(W, U, A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W), (__mmask8)(U), \
(int)(C), _MM_FROUND_CUR_DIRECTION)
(int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_maskz_reduce_ss(U, A, B, C) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm_reduce_round_ss(A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
(int)(C), (int)(R))
(int)(C), (int)(R)))
#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(W), (__mmask8)(U), \
(int)(C), (int)(R))
(int)(C), (int)(R)))
#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
(__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U), (int)(C), (int)(R))
(__mmask8)(U), (int)(C), (int)(R)))
#define _mm_reduce_sd(A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(C), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm_mask_reduce_sd(W, U, A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W), (__mmask8)(U), \
(int)(C), _MM_FROUND_CUR_DIRECTION)
(int)(C), _MM_FROUND_CUR_DIRECTION))
#define _mm_maskz_reduce_sd(U, A, B, C) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C), \
_MM_FROUND_CUR_DIRECTION)
_MM_FROUND_CUR_DIRECTION))
#define _mm_reduce_round_sd(A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(C), (int)(R))
(__mmask8)-1, (int)(C), (int)(R)))
#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(W), (__mmask8)(U), \
(int)(C), (int)(R))
(int)(C), (int)(R)))
#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
(__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U), (int)(C), (int)(R))
(__mmask8)(U), (int)(C), (int)(R)))
static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_movepi32_mask (__m512i __A)
@@ -1218,158 +1218,158 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
}
#define _mm512_extractf32x8_ps(A, imm) \
(__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
(__v8sf)_mm256_undefined_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
(__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
(__v8sf)(__m256)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
(__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_extractf64x2_pd(A, imm) \
(__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
(__v2df)_mm_undefined_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
(int)(imm), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_extracti32x8_epi32(A, imm) \
(__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
(__v8si)_mm256_undefined_si256(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
(__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
(__v8si)(__m256i)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
(__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
(__v8si)_mm256_setzero_si256(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_extracti64x2_epi64(A, imm) \
(__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
(__v2di)_mm_undefined_si128(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
(__v2di)(__m128i)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
(int)(imm), \
(__v2di)_mm_setzero_si128(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm512_insertf32x8(A, B, imm) \
(__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
(__v8sf)(__m256)(B), (int)(imm))
((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
(__v8sf)(__m256)(B), (int)(imm)))
#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
(__v16sf)(__m512)(W))
(__v16sf)(__m512)(W)))
#define _mm512_maskz_insertf32x8(U, A, B, imm) \
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
(__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
(__v16sf)_mm512_setzero_ps())
(__v16sf)_mm512_setzero_ps()))
#define _mm512_insertf64x2(A, B, imm) \
(__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
(__v2df)(__m128d)(B), (int)(imm))
((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
(__v2df)(__m128d)(B), (int)(imm)))
#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \
(__v8df)(__m512d)(W))
(__v8df)(__m512d)(W)))
#define _mm512_maskz_insertf64x2(U, A, B, imm) \
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
(__v8df)_mm512_insertf64x2((A), (B), (imm)), \
(__v8df)_mm512_setzero_pd())
(__v8df)_mm512_setzero_pd()))
#define _mm512_inserti32x8(A, B, imm) \
(__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
(__v8si)(__m256i)(B), (int)(imm))
((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
(__v8si)(__m256i)(B), (int)(imm)))
#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \
(__v16si)(__m512i)(W))
(__v16si)(__m512i)(W)))
#define _mm512_maskz_inserti32x8(U, A, B, imm) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_inserti32x8((A), (B), (imm)), \
(__v16si)_mm512_setzero_si512())
(__v16si)_mm512_setzero_si512()))
#define _mm512_inserti64x2(A, B, imm) \
(__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
(__v2di)(__m128i)(B), (int)(imm))
((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
(__v2di)(__m128i)(B), (int)(imm)))
#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \
(__v8di)(__m512i)(W))
(__v8di)(__m512i)(W)))
#define _mm512_maskz_inserti64x2(U, A, B, imm) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_inserti64x2((A), (B), (imm)), \
(__v8di)_mm512_setzero_si512())
(__v8di)_mm512_setzero_si512()))
#define _mm512_mask_fpclass_ps_mask(U, A, imm) \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)(U))
((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)(U)))
#define _mm512_fpclass_ps_mask(A, imm) \
(__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)-1)
((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
(int)(imm), (__mmask16)-1))
#define _mm512_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm512_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_fpclass_sd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_mask_fpclass_sd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm_fpclass_ss_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_mask_fpclass_ss_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U)))
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS

View File

@@ -15,19 +15,19 @@
/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
(__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -39,19 +39,19 @@
_mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_round_ps(A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
(__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R))
(__mmask16)(M), (int)(R)))
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -64,19 +64,19 @@
/* rsqrt28 */
#define _mm512_rsqrt28_round_pd(A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -88,19 +88,19 @@
_mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_round_ps(A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R))
(__mmask16)(M), (int)(R)))
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -112,22 +112,22 @@
_mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_ss(A, B) \
_mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -139,22 +139,22 @@
_mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rsqrt28_sd(A, B) \
_mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -167,19 +167,19 @@
/* rcp28 */
#define _mm512_rcp28_round_pd(A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)(__m512d)(S), (__mmask8)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rcp28_round_pd(M, A, R) \
(__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
(__v8df)_mm512_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm512_rcp28_pd(A) \
_mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
@@ -191,19 +191,19 @@
_mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_round_ps(A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)-1, (int)(R))
(__mmask16)-1, (int)(R)))
#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)(__m512)(S), (__mmask16)(M), \
(int)(R))
(int)(R)))
#define _mm512_maskz_rcp28_round_ps(M, A, R) \
(__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
(__v16sf)_mm512_setzero_ps(), \
(__mmask16)(M), (int)(R))
(__mmask16)(M), (int)(R)))
#define _mm512_rcp28_ps(A) \
_mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
@@ -215,22 +215,22 @@
_mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_ss(A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)(__m128)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
(__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_ss(A, B) \
_mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
@@ -242,22 +242,22 @@
_mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_round_sd(A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1, (int)(R))
(__mmask8)-1, (int)(R)))
#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)(__m128d)(S), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
(__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(M), (int)(R))
(__mmask8)(M), (int)(R)))
#define _mm_rcp28_sd(A, B) \
_mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

File diff suppressed because it is too large

3349 lib/include/avx512fp16intrin.h vendored Normal file

File diff suppressed because it is too large

View File

@@ -129,88 +129,88 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
}
#define _mm512_shldi_epi64(A, B, I) \
(__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S))
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shldi_epi64(U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shldi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512())
(__v8di)_mm512_setzero_si512()))
#define _mm512_shldi_epi32(A, B, I) \
(__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S))
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shldi_epi32(U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shldi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512())
(__v16si)_mm512_setzero_si512()))
#define _mm512_shldi_epi16(A, B, I) \
(__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S))
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shldi_epi16(U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
#define _mm512_shrdi_epi64(A, B, I) \
(__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)(__m512i)(S))
(__v8di)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
(__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
(__v8di)_mm512_setzero_si512())
(__v8di)_mm512_setzero_si512()))
#define _mm512_shrdi_epi32(A, B, I) \
(__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
(__v16si)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)(__m512i)(S))
(__v16si)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
(__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
(__v16si)_mm512_setzero_si512())
(__v16si)_mm512_setzero_si512()))
#define _mm512_shrdi_epi16(A, B, I) \
(__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I))
((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
(__v32hi)(__m512i)(B), (int)(I)))
#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)(__m512i)(S))
(__v32hi)(__m512i)(S)))
#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
(__v32hi)_mm512_setzero_si512())
(__v32hi)_mm512_setzero_si512()))
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)

View File

@@ -420,18 +420,46 @@ static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
return __R[0];
}
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
return _mm_castsi128_ps(
(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
}
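The body above spells out the widening trick: each bfloat16 element is moved into the high half of a 32-bit lane (the extension bits are discarded by the shift), which is exactly where a bfloat16's sign, exponent, and top 7 mantissa bits sit inside an IEEE-754 float. A small usage sketch, assuming a toolchain that ships these intrinsics with AVX512-BF16 and AVX512-VL enabled (e.g. -mavx512bf16 -mavx512vl); _mm_cvtneps_pbh is the pre-existing float-to-bf16 intrinsic:

#include <immintrin.h>
#include <stdio.h>

/* Round-trip floats through bfloat16 and back. These inputs round-trip
   exactly because they fit in bf16's 8-bit exponent and 7-bit mantissa. */
int main(void) {
  float in[4] = {1.0f, -2.5f, 0.15625f, 1024.0f};
  __m128bh bh = _mm_cvtneps_pbh(_mm_loadu_ps(in)); /* float -> bf16 */
  __m128 out = _mm_cvtpbh_ps(bh);                  /* bf16  -> float */
  float res[4];
  _mm_storeu_ps(res, out);
  for (int i = 0; i < 4; ++i)
    printf("%g -> %g\n", in[i], res[i]);
  return 0;
}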
/// Convert Packed BF16 Data to Packed float Data.
///
/// \headerfile <x86intrin.h>
///
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from convertion of __A
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
///
/// \headerfile <x86intrin.h>
@@ -441,13 +469,33 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from convertion of __A
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
}
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
///
/// \param __S
/// A 128-bit vector of [4 x float]. Elements are copied from __S when
/// the corresponding mask bit is not set.
/// \param __U
/// A 4-bit mask. Elements are zeroed out when the corresponding mask
/// bit is not set.
/// \param __A
/// A 128-bit vector of [4 x bfloat].
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
16));
}
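A short sketch contrasting the zeroing (_mm_maskz_cvtpbh_ps) and merging (_mm_mask_cvtpbh_ps) forms added above, under the same assumed flags (-mavx512bf16 -mavx512vl):

#include <immintrin.h>
#include <stdio.h>

/* Mask bit set:   lane converted from the bf16 input.
   Mask bit clear: zeroed by the maskz form, taken from the source vector
   by the mask form (it merges via _mm_mask_slli_epi32, as shown above). */
int main(void) {
  float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  __m128bh bh = _mm_cvtneps_pbh(_mm_loadu_ps(in));
  __m128 src = _mm_set1_ps(-1.0f);
  __mmask8 k = 0x5; /* select lanes 0 and 2 */
  float z[4], m[4];
  _mm_storeu_ps(z, _mm_maskz_cvtpbh_ps(k, bh));
  _mm_storeu_ps(m, _mm_mask_cvtpbh_ps(src, k, bh));
  for (int i = 0; i < 4; ++i)
    printf("lane %d: maskz=%g mask=%g\n", i, z[i], m[i]);
  /* expected: maskz = 1 0 3 0 and mask = 1 -1 3 -1 */
  return 0;
}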
/// Convert Packed BF16 Data to Packed float Data using merging mask.
///
/// \headerfile <x86intrin.h>
@@ -460,7 +508,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
/// bit is not set.
/// \param __A
/// A 128-bit vector of [8 x bfloat].
/// \returns A 256-bit vector of [8 x float] come from convertion of __A
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(

View File

@@ -21,84 +21,84 @@
/* Integer compare */
#define _mm_cmp_epi8_mask(a, b, p) \
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm_cmp_epu8_mask(a, b, p) \
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm256_cmp_epi8_mask(a, b, p) \
(__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm256_cmp_epu8_mask(a, b, p) \
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)-1)
(__mmask32)-1))
#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
(__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
(__v32qi)(__m256i)(b), (int)(p), \
(__mmask32)(m))
(__mmask32)(m)))
#define _mm_cmp_epi16_mask(a, b, p) \
(__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)(m))
(__mmask8)(m)))
#define _mm_cmp_epu16_mask(a, b, p) \
(__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
(__v8hi)(__m128i)(b), (int)(p), \
(__mmask8)(m))
(__mmask8)(m)))
#define _mm256_cmp_epi16_mask(a, b, p) \
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm256_cmp_epu16_mask(a, b, p) \
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)-1)
(__mmask16)-1))
#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
(__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
(__v16hi)(__m256i)(b), (int)(p), \
(__mmask16)(m))
(__mmask16)(m)))
#define _mm_cmpeq_epi8_mask(A, B) \
_mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
@@ -1821,46 +1821,46 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflehi_epi16((A), (imm)), \
(__v8hi)(__m128i)(W))
(__v8hi)(__m128i)(W)))
#define _mm_maskz_shufflehi_epi16(U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflehi_epi16((A), (imm)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
(__v16hi)(__m256i)(W))
(__v16hi)(__m256i)(W)))
#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflelo_epi16((A), (imm)), \
(__v8hi)(__m128i)(W))
(__v8hi)(__m128i)(W)))
#define _mm_maskz_shufflelo_epi16(U, A, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shufflelo_epi16((A), (imm)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflelo_epi16((A), \
(imm)), \
(__v16hi)(__m256i)(W))
(__v16hi)(__m256i)(W)))
#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shufflelo_epi16((A), \
(imm)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sllv_epi16(__m256i __A, __m256i __B)
@@ -2756,52 +2756,52 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
}
#define _mm_mask_alignr_epi8(W, U, A, B, N) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
(__v16qi)(__m128i)(W))
(__v16qi)(__m128i)(W)))
#define _mm_maskz_alignr_epi8(U, A, B, N) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
(__v16qi)_mm_setzero_si128())
(__v16qi)_mm_setzero_si128()))
#define _mm256_mask_alignr_epi8(W, U, A, B, N) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
(__v32qi)(__m256i)(W))
(__v32qi)(__m256i)(W)))
#define _mm256_maskz_alignr_epi8(U, A, B, N) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
(__v32qi)_mm256_setzero_si256())
(__v32qi)_mm256_setzero_si256()))
#define _mm_dbsad_epu8(A, B, imm) \
(__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm))
((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(imm)))
#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
(__v8hi)(__m128i)(W))
(__v8hi)(__m128i)(W)))
#define _mm_maskz_dbsad_epu8(U, A, B, imm) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_dbsad_epu8(A, B, imm) \
(__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm))
((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), (int)(imm)))
#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)(__m256i)(W))
(__v16hi)(__m256i)(W)))
#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

View File

@@ -773,134 +773,134 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
}
#define _mm_range_pd(A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_range_pd(W, U, A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_maskz_range_pd(U, A, B, C) \
(__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
(__v2df)(__m128d)(B), (int)(C), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_range_pd(A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_range_pd(W, U, A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \
(__v4df)(__m256d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_range_pd(U, A, B, C) \
(__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
(__v4df)(__m256d)(B), (int)(C), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_range_ps(A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_range_ps(W, U, A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \
(__v4sf)(__m128)(W), (__mmask8)(U))
(__v4sf)(__m128)(W), (__mmask8)(U)))
#define _mm_maskz_range_ps(U, A, B, C) \
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
(__v4sf)(__m128)(B), (int)(C), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_range_ps(A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_range_ps(W, U, A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \
(__v8sf)(__m256)(W), (__mmask8)(U))
(__v8sf)(__m256)(W), (__mmask8)(U)))
#define _mm256_maskz_range_ps(U, A, B, C) \
(__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
(__v8sf)(__m256)(B), (int)(C), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_reduce_pd(A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_reduce_pd(W, U, A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_maskz_reduce_pd(U, A, B) \
(__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_reduce_pd(A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_reduce_pd(W, U, A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)(__m256d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_reduce_pd(U, A, B) \
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
(__v4df)_mm256_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_reduce_ps(A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm_mask_reduce_ps(W, U, A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)(__m128)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm_maskz_reduce_ps(U, A, B) \
(__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
(__v4sf)_mm_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_reduce_ps(A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_reduce_ps(W, U, A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)(__m256)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_reduce_ps(U, A, B) \
(__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
(__v8sf)_mm256_setzero_ps(), \
(__mmask8)(U))
(__mmask8)(U)))
static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
_mm_movepi32_mask (__m128i __A)
@@ -1066,100 +1066,100 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
}
#define _mm256_extractf64x2_pd(A, imm) \
(__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
(__v2df)_mm_undefined_pd(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
(__v2df)(__m128d)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
(__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
(int)(imm), \
(__v2df)_mm_setzero_pd(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_extracti64x2_epi64(A, imm) \
(__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
(__v2di)_mm_undefined_si128(), \
(__mmask8)-1)
(__mmask8)-1))
#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
(__v2di)(__m128i)(W), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
(__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
(int)(imm), \
(__v2di)_mm_setzero_si128(), \
(__mmask8)(U))
(__mmask8)(U)))
#define _mm256_insertf64x2(A, B, imm) \
(__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
(__v2df)(__m128d)(B), (int)(imm))
((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
(__v2df)(__m128d)(B), (int)(imm)))
#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \
(__v4df)(__m256d)(W))
(__v4df)(__m256d)(W)))
#define _mm256_maskz_insertf64x2(U, A, B, imm) \
(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
(__v4df)_mm256_insertf64x2((A), (B), (imm)), \
(__v4df)_mm256_setzero_pd())
(__v4df)_mm256_setzero_pd()))
#define _mm256_inserti64x2(A, B, imm) \
(__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
(__v2di)(__m128i)(B), (int)(imm))
((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
(__v2di)(__m128i)(B), (int)(imm)))
#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \
(__v4di)(__m256i)(W))
(__v4di)(__m256i)(W)))
#define _mm256_maskz_inserti64x2(U, A, B, imm) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_inserti64x2((A), (B), (imm)), \
(__v4di)_mm256_setzero_si256())
(__v4di)_mm256_setzero_si256()))
#define _mm_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm256_fpclass_pd_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
(__mmask8)-1))
#define _mm_mask_fpclass_ps_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm_fpclass_ps_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
(__mmask8)-1))
#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
(__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)(U))
((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)(U)))
#define _mm256_fpclass_ps_mask(A, imm) \
(__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)-1)
((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
(__mmask8)-1))
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

2068 lib/include/avx512vlfp16intrin.h vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -239,172 +239,172 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
}
#define _mm256_shldi_epi64(A, B, I) \
(__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S))
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shldi_epi64(U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shldi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256())
(__v4di)_mm256_setzero_si256()))
#define _mm_shldi_epi64(A, B, I) \
(__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S))
(__v2di)(__m128i)(S)))
#define _mm_maskz_shldi_epi64(U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shldi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128())
(__v2di)_mm_setzero_si128()))
#define _mm256_shldi_epi32(A, B, I) \
(__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S))
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shldi_epi32(U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shldi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256())
(__v8si)_mm256_setzero_si256()))
#define _mm_shldi_epi32(A, B, I) \
(__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S))
(__v4si)(__m128i)(S)))
#define _mm_maskz_shldi_epi32(U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shldi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128())
(__v4si)_mm_setzero_si128()))
#define _mm256_shldi_epi16(A, B, I) \
(__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S))
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shldi_epi16(U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#define _mm_shldi_epi16(A, B, I) \
(__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shldi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S))
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shldi_epi16(U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shldi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
#define _mm256_shrdi_epi64(A, B, I) \
(__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)(__m256i)(S))
(__v4di)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
(__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
(__v4di)_mm256_setzero_si256())
(__v4di)_mm256_setzero_si256()))
#define _mm_shrdi_epi64(A, B, I) \
(__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)(__m128i)(S))
(__v2di)(__m128i)(S)))
#define _mm_maskz_shrdi_epi64(U, A, B, I) \
(__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
(__v2di)_mm_shrdi_epi64((A), (B), (I)), \
(__v2di)_mm_setzero_si128())
(__v2di)_mm_setzero_si128()))
#define _mm256_shrdi_epi32(A, B, I) \
(__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
(__v8si)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)(__m256i)(S))
(__v8si)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
(__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
(__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
(__v8si)_mm256_setzero_si256())
(__v8si)_mm256_setzero_si256()))
#define _mm_shrdi_epi32(A, B, I) \
(__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)(__m128i)(S))
(__v4si)(__m128i)(S)))
#define _mm_maskz_shrdi_epi32(U, A, B, I) \
(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
(__v4si)_mm_shrdi_epi32((A), (B), (I)), \
(__v4si)_mm_setzero_si128())
(__v4si)_mm_setzero_si128()))
#define _mm256_shrdi_epi16(A, B, I) \
(__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I))
((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
(__v16hi)(__m256i)(B), (int)(I)))
#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)(__m256i)(S))
(__v16hi)(__m256i)(S)))
#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
(__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
(__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
(__v16hi)_mm256_setzero_si256())
(__v16hi)_mm256_setzero_si256()))
#define _mm_shrdi_epi16(A, B, I) \
(__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I))
((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (int)(I)))
#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)(__m128i)(S))
(__v8hi)(__m128i)(S)))
#define _mm_maskz_shrdi_epi16(U, A, B, I) \
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
(__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
(__v8hi)_mm_setzero_si128())
(__v8hi)_mm_setzero_si128()))
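A quick usage sketch for the funnel-shift macros above (not part of the upstream diff): with both inputs equal, the concatenate-and-shift-left form acts as a per-lane rotate. This assumes an x86-64 target built with -mavx512vl -mavx512vbmi2 and a constant shift count.

#include <immintrin.h>

/* Hypothetical helper: each 32-bit lane becomes rotl32(lane, 5), because the
   high half of ((x:x) << 5) is a left rotate by 5. */
static inline __m128i rotl32_by_5(__m128i x) {
  return _mm_shldi_epi32(x, x, 5);
}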
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)

View File

@ -36,7 +36,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@ -56,7 +56,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -74,7 +74,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -92,7 +92,7 @@
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@ -112,7 +112,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@ -132,7 +132,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -150,7 +150,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@ -168,7 +168,7 @@
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
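A usage sketch for the dot-product macros above (not part of the diff); these appear to be the AVX512-VNNI/VL forms, so the example assumes a target built with -mavx512vnni -mavx512vl.

#include <immintrin.h>

/* Hypothetical helper: each 32-bit lane of acc gains the sum of four
   unsigned-8-bit x signed-8-bit products taken from u8s and s8s. */
static inline __m256i dot_accumulate(__m256i acc, __m256i u8s, __m256i s8s) {
  return _mm256_dpbusd_epi32(acc, u8s, s8s);
}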
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)

View File

@ -400,7 +400,7 @@ _mm256_rcp_ps(__m256 __a)
/// 11: Truncated.
/// \returns A 256-bit vector of [4 x double] containing the rounded values.
#define _mm256_round_pd(V, M) \
(__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
/// Rounds the values stored in a 256-bit vector of [8 x float] as
/// specified by the byte operand. The source values are rounded to integer
@ -432,7 +432,7 @@ _mm256_rcp_ps(__m256 __a)
/// 11: Truncated.
/// \returns A 256-bit vector of [8 x float] containing the rounded values.
#define _mm256_round_ps(V, M) \
(__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
/// source values are rounded up to integer values and returned as 64-bit
@ -989,7 +989,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
(__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
/// Copies the values in a 256-bit vector of [4 x double] as specified by
/// the immediate integer operand.
@ -1029,7 +1029,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
(__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
/// Copies the values in a 128-bit vector of [4 x float] as specified by
/// the immediate integer operand.
@ -1085,7 +1085,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
(__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
/// Copies the values in a 256-bit vector of [8 x float] as specified by
/// the immediate integer operand.
@ -1177,7 +1177,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
(__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [4 x double], as specified by the immediate integer operand.
@ -1217,8 +1217,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M))
((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M)))
/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [8 x float], as specified by the immediate integer operand.
@ -1258,8 +1258,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M))
((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M)))
/// Permutes 128-bit data values stored in two 256-bit integer vectors,
/// as specified by the immediate integer operand.
@ -1298,8 +1298,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M))
((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
(__v8si)(__m256i)(V2), (int)(M)))
/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
@ -1327,8 +1327,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
(__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M))
((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
(__v4df)(__m256d)(V2), (int)(M)))
/// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate
@ -1355,8 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
(__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M))
((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (int)(M)))
/// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@ -1453,8 +1453,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M))
((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
(__v8sf)(__m256)(V2), (M)))
/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
@ -1507,8 +1507,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
(__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (int)(mask))
((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (int)(mask)))
/// Selects four double-precision values from the 256-bit operands of
/// [4 x double], as specified by the immediate value operand.
@ -1553,8 +1553,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
(__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (int)(mask))
((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (int)(mask)))
/* Compare */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
@ -1647,8 +1647,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
(__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c))
((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)))
/// Compares each of the corresponding values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer
@ -1707,8 +1707,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
(__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c))
((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)))
/// Compares each of the corresponding double-precision values of two
/// 256-bit vectors of [4 x double], using the operation specified by the
@ -1767,8 +1767,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
(__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c))
((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
(__v4df)(__m256d)(b), (c)))
/// Compares each of the corresponding values of two 256-bit vectors of
/// [8 x float], using the operation specified by the immediate integer
@ -1827,8 +1827,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
(__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c))
((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
(__v8sf)(__m256)(b), (c)))
/// Compares each of the corresponding scalar double-precision values of
/// two 128-bit vectors of [2 x double], using the operation specified by the
@ -1886,8 +1886,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
(__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c))
((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
(__v2df)(__m128d)(b), (c)))
/// Compares each of the corresponding scalar values of two 128-bit
/// vectors of [4 x float], using the operation specified by the immediate
@ -1945,8 +1945,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
(__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c))
((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
(__v4sf)(__m128)(b), (c)))
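A usage sketch for the comparison macros (not part of the diff), assuming an x86 target built with -mavx; _CMP_LT_OQ is one of the predicate constants this header defines.

#include <immintrin.h>

/* Hypothetical helper: returns an 8-bit mask with bit i set where a[i] < b[i]. */
static inline int lanes_less_than(__m256 a, __m256 b) {
  __m256 lt = _mm256_cmp_ps(a, b, _CMP_LT_OQ); /* all-ones where a < b */
  return _mm256_movemask_ps(lt);
}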
/// Takes a [8 x i32] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -1964,7 +1964,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 32 bits of extended
/// packed data.
#define _mm256_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
/// Takes a [16 x i16] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -1982,8 +1982,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
/// packed data.
#define _mm256_extract_epi16(X, N) \
(int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
(int)(N))
((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
(int)(N)))
/// Takes a [32 x i8] vector and returns the vector element value
/// indexed by the immediate constant operand.
@ -2001,8 +2001,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
/// packed data.
#define _mm256_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
(int)(N))
((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
(int)(N)))
#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
@ -2021,7 +2021,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 64-bit integer containing the extracted 64 bits of extended
/// packed data.
#define _mm256_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif
/// Takes a [8 x i32] vector and replaces the vector element value
@ -2043,8 +2043,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi32(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
(int)(I), (int)(N)))
/// Takes a [16 x i16] vector and replaces the vector element value
@ -2066,8 +2066,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi16(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
(int)(I), (int)(N)))
/// Takes a [32 x i8] vector and replaces the vector element value
/// indexed by the immediate constant operand with a new value. Returns the
@ -2088,8 +2088,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi8(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
(int)(I), (int)(N)))
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
@ -2111,8 +2111,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi64(X, I, N) \
(__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N))
((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
(long long)(I), (int)(N)))
#endif
/* Conversion */
@ -4592,8 +4592,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
(__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
/// Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then
@ -4630,8 +4630,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
(__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing
@ -4668,8 +4668,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
(__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
(__v4si)(__m128i)(V2), (int)(M))
((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
(__v4si)(__m128i)(V2), (int)(M)))
/*
Vector extract.
@ -4698,7 +4698,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
(__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
/// Extracts either the upper or the lower 128 bits from a 256-bit vector
/// of [4 x double], as determined by the immediate integer parameter, and
@ -4722,7 +4722,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
(__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
/// Extracts either the upper or the lower 128 bits from a 256-bit
/// integer vector, as determined by the immediate integer parameter, and
@ -4746,177 +4746,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
(__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
/* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [8 x float] containing the
/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
}
/// Loads two 128-bit floating-point vectors of [2 x double] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [4 x double] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [4 x double] containing the
/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
}
/// Loads two 128-bit integer vectors from unaligned memory locations and
/// constructs a 256-bit integer vector by concatenating the two 128-bit
/// vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[255:128] of the result. The
/// address of the memory location does not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[127:0] of the result. The
/// address of the memory location does not have to be aligned.
/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
{
__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
}
/* SIMD store ops (unaligned) */
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [8 x float] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [8 x float].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
__m128 __v128;
__v128 = _mm256_castps256_ps128(__a);
_mm_storeu_ps(__addr_lo, __v128);
__v128 = _mm256_extractf128_ps(__a, 1);
_mm_storeu_ps(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [4 x double] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [4 x double].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
__m128d __v128;
__v128 = _mm256_castpd256_pd128(__a);
_mm_storeu_pd(__addr_lo, __v128);
__v128 = _mm256_extractf128_pd(__a, 1);
_mm_storeu_pd(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit integer vector into
/// two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
{
__m128i __v128;
__v128 = _mm256_castsi256_si128(__a);
_mm_storeu_si128(__addr_lo, __v128);
__v128 = _mm256_extractf128_si256(__a, 1);
_mm_storeu_si128(__addr_hi, __v128);
}
((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
/// Constructs a 256-bit floating-point vector of [8 x float] by
/// concatenating two 128-bit floating-point vectors of [4 x float].
@ -5047,6 +4877,173 @@ _mm256_setr_m128i (__m128i __lo, __m128i __hi)
return (__m256i)_mm256_set_m128i(__hi, __lo);
}
/* SIMD load ops (unaligned) */
/// Loads two 128-bit floating-point vectors of [4 x float] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [8 x float] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing 4 consecutive
/// single-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [8 x float] containing the
/// concatenated result.
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
{
return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
}
/// Loads two 128-bit floating-point vectors of [2 x double] from
/// unaligned memory locations and constructs a 256-bit floating-point vector
/// of [4 x double] by concatenating the two 128-bit vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[255:128] of the result. The address of the memory location does not
/// have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing two consecutive
/// double-precision floating-point values. These values are to be copied to
/// bits[127:0] of the result. The address of the memory location does not
/// have to be aligned.
/// \returns A 256-bit floating-point vector of [4 x double] containing the
/// concatenated result.
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
{
return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
}
/// Loads two 128-bit integer vectors from unaligned memory locations and
/// constructs a 256-bit integer vector by concatenating the two 128-bit
/// vectors.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to load instructions followed by the
/// <c> VINSERTF128 </c> instruction.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[255:128] of the result. The
/// address of the memory location does not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location containing a 128-bit integer
/// vector. This vector is to be copied to bits[127:0] of the result. The
/// address of the memory location does not have to be aligned.
/// \returns A 256-bit integer vector containing the concatenated result.
static __inline __m256i __DEFAULT_FN_ATTRS
_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
{
return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
}
/* SIMD store ops (unaligned) */
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [8 x float] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [8 x float].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
{
__m128 __v128;
__v128 = _mm256_castps256_ps128(__a);
_mm_storeu_ps(__addr_lo, __v128);
__v128 = _mm256_extractf128_ps(__a, 1);
_mm_storeu_ps(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit floating-point
/// vector of [4 x double] into two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit floating-point vector of [4 x double].
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
{
__m128d __v128;
__v128 = _mm256_castpd256_pd128(__a);
_mm_storeu_pd(__addr_lo, __v128);
__v128 = _mm256_extractf128_pd(__a, 1);
_mm_storeu_pd(__addr_hi, __v128);
}
/// Stores the upper and lower 128 bits of a 256-bit integer vector into
/// two different unaligned memory locations.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
/// store instructions.
///
/// \param __addr_hi
/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __addr_lo
/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
/// copied to this memory location. The address of this memory location does
/// not have to be aligned.
/// \param __a
/// A 256-bit integer vector.
static __inline void __DEFAULT_FN_ATTRS
_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
{
__m128i __v128;
__v128 = _mm256_castsi256_si128(__a);
_mm_storeu_si128(__addr_lo, __v128);
__v128 = _mm256_extractf128_si256(__a, 1);
_mm_storeu_si128(__addr_hi, __v128);
}
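A usage sketch for the relocated unaligned load/store helpers (not part of the diff), assuming an x86 target built with -mavx.

#include <immintrin.h>

/* Hypothetical helper: load two unaligned 128-bit blocks, scale them as one
   256-bit vector, and store the halves back to their original locations. */
static void scale_two_blocks(float *hi, float *lo, float s) {
  __m256 v = _mm256_loadu2_m128(hi, lo);
  v = _mm256_mul_ps(v, _mm256_set1_ps(s));
  _mm256_storeu2_m128(hi, lo, v);
}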
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128

View File

@ -42,10 +42,20 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
return __builtin_ia32_rdsspd(__a);
}
static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
unsigned int t;
return __builtin_ia32_rdsspd(t);
}
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
return __builtin_ia32_rdsspq(__a);
}
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
unsigned long long t;
return __builtin_ia32_rdsspq(t);
}
#endif /* __x86_64__ */
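A usage sketch for the new zero-argument shadow-stack readers (not part of the diff), assuming an x86-64 target built with -mshstk. When CET shadow stacks are not active, RDSSP is a no-op and the result is unspecified, which appears to be why the sources above deliberately pass an uninitialized value.

#include <immintrin.h>

/* Hypothetical helper: read the current shadow-stack pointer. */
static unsigned long long current_ssp(void) {
  return _rdsspq_i64();
}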
#ifdef __x86_64__

3
lib/include/cpuid.h vendored
View File

@ -195,11 +195,12 @@
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_AVXVNNI 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_HRESET 0x00400000
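A usage sketch for the corrected bit_AVXVNNI value (not part of the diff): AVX-VNNI is reported in EAX of CPUID leaf 7, sub-leaf 1, bit 4 (0x10). A hypothetical feature check:

#include <cpuid.h>

static int cpu_has_avxvnni(void) {
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (eax & bit_AVXVNNI) != 0;
}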

100
lib/include/crc32intrin.h vendored Normal file
View File

@ -0,0 +1,100 @@
/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CRC32INTRIN_H
#define __CRC32INTRIN_H
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("crc32")))
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#endif /* __CRC32INTRIN_H */
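A usage sketch for the new header (not part of it), assuming an x86 target built with -mcrc32 or -msse4.2.

#include <stddef.h>
#include <immintrin.h>

/* Hypothetical helper: fold a byte buffer into a running CRC-32C value. */
static unsigned int crc32c_bytes(unsigned int crc, const unsigned char *p,
                                 size_t n) {
  for (size_t i = 0; i < n; ++i)
    crc = _mm_crc32_u8(crc, p[i]);
  return crc;
}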

View File

@ -10,6 +10,10 @@
#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <xmmintrin.h>
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
@ -2371,7 +2375,7 @@ _mm_madd_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
}
/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
@ -2391,7 +2395,7 @@ _mm_max_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
}
/// Compares corresponding elements of two 128-bit signed [8 x i16]
@ -2411,7 +2415,7 @@ _mm_max_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
}
/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
@ -2431,7 +2435,7 @@ _mm_min_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
}
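A usage sketch (not part of the diff): switching to the elementwise builtins keeps the observable behavior, so existing idioms such as a per-byte clamp still work. Assumes an x86 target built with -msse2.

#include <emmintrin.h>

/* Hypothetical helper: saturate each unsigned byte of v into [lo, hi]. */
static inline __m128i clamp_u8(__m128i v, __m128i lo, __m128i hi) {
  return _mm_min_epu8(_mm_max_epu8(v, lo), hi);
}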
/// Multiplies the corresponding elements of two signed [8 x i16]
@ -2818,10 +2822,10 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) \
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#define _mm_bslli_si128(a, imm) \
(__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
/// Left-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
@ -3035,10 +3039,10 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
#define _mm_bsrli_si128(a, imm) \
(__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
/// Right-shifts each of 16-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
@ -4356,8 +4360,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
(int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm))
((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
(int)(imm)))
/// Constructs a 128-bit integer vector by first making a copy of the
/// 128-bit integer vector parameter, and then inserting the lower 16 bits
@ -4380,8 +4384,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// lower 16 bits of \a __b are written.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
(__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm))
((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
(int)(imm)))
/// Copies the values of the most significant bits from each 8-bit
/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
@ -4430,7 +4434,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
(__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4460,7 +4464,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
(__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@ -4490,7 +4494,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
(__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@ -4844,8 +4848,8 @@ _mm_movemask_pd(__m128d __a)
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i) \
(__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
(int)(i))
((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
(int)(i)))
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
/// floating-point vector of [4 x float].

View File

@ -66,8 +66,8 @@ _cvtsh_ss(unsigned short __a)
/// 1XX: Use MXCSR.RC for rounding
/// \returns The converted 16-bit half-precision float value.
#define _cvtss_sh(a, imm) \
(unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0])
((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
(imm)))[0]))
/// Converts a 128-bit vector containing 32-bit float values into a
/// 128-bit vector containing 16-bit half-precision float values.
@ -93,7 +93,7 @@ _cvtsh_ss(unsigned short __a)
/// values. The lower 64 bits are used to store the converted 16-bit
/// half-precision floating-point values.
#define _mm_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 128-bit vector containing 32-bit float values.
@ -136,7 +136,7 @@ _mm_cvtph_ps(__m128i __a)
/// \returns A 128-bit vector containing the converted 16-bit half-precision
/// float values.
#define _mm256_cvtps_ph(a, imm) \
(__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
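A usage sketch for the F16C conversion macros (not part of the diff), assuming an x86 target built with -mf16c.

#include <immintrin.h>

/* Hypothetical helper: round-trip four floats through half precision. */
static inline __m128 roundtrip_f16(__m128 v) {
  __m128i h = _mm_cvtps_ph(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  return _mm_cvtph_ps(h);
}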
/// Converts a 128-bit vector containing 16-bit half-precision float
/// values into a 256-bit vector of [8 x float].

21
lib/include/float.h vendored
View File

@ -14,10 +14,11 @@
* additional definitions provided for Windows.
* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
*
* Also fall back on Darwin to allow additional definitions and
* Also fall back on Darwin and AIX to allow additional definitions and
* implementation-defined values.
*/
#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
#if (defined(__APPLE__) || defined(__MINGW32__) || defined(_MSC_VER) || \
defined(_AIX)) && \
__STDC_HOSTED__ && __has_include_next(<float.h>)
/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
@ -37,7 +38,9 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201103L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@ -64,7 +67,9 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201703L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@ -87,7 +92,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201103L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@ -123,7 +130,9 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
__cplusplus >= 201703L || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__

View File

@ -28,14 +28,14 @@
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I))
(char)(I)))
#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
(__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), \
(char)(I))
(char)(I)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
@ -46,14 +46,14 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
#ifdef __AVXINTRIN_H
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I))
(char)(I)))
#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
(__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
(__v32qi)(__m256i)(B), \
(char)(I))
(char)(I)))
static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
@ -65,31 +65,31 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
#ifdef __AVX512BWINTRIN_H
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I))
(char)(I)))
#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S))
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
(__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
(__v64qi)(__m512i)(B), \
(char)(I))
(char)(I)))
#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
(__v64qi)(__m512i)(S))
((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
(__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
(__v64qi)(__m512i)(S)))
#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
U, A, B, I)
static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
@ -117,39 +117,38 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
#ifdef __AVX512VLBWINTRIN_H
#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S))
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I)
#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S))
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
(__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
(__v16qi)(__m128i)(S))
(__v16qi)(__m128i)(S)))
#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
U, A, B, I)
_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
(__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
(__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
(__v32qi)(__m256i)(S))
(__v32qi)(__m256i)(S)))
#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
(__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
U, A, B, I)
static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128

View File

@ -8003,17 +8003,6 @@
#define Q6_P_vtrunohb_PP __builtin_HEXAGON_S6_vtrunohb_ppp
#endif /* __HEXAGON_ARCH___ >= 62 */
#if __HEXAGON_ARCH__ >= 62
/* ==========================================================================
Assembly Syntax: Vd32=vmem(Rt32):nt
C Intrinsic Prototype: HVX_Vector Q6_V_vmem_R_nt(Word32 Rt)
Instruction Type: MAPPING
Execution Slots: SLOT0123
========================================================================== */
#define Q6_V_vmem_R_nt __builtin_HEXAGON_V6_ldntnt0
#endif /* __HEXAGON_ARCH___ >= 62 */
#if __HEXAGON_ARCH__ >= 65
/* ==========================================================================
Assembly Syntax: Pd4=!any8(vcmpb.eq(Rss32,Rtt32))

View File

@ -1177,37 +1177,6 @@ private:
#endif /* __cplusplus */
// V65 Silver types
#if __Q6S_ARCH__ >= 65
// Silver vector types are 128 bytes, and pairs are 256. The vector predicate
// types are 16 bytes and 32 bytes for pairs.
typedef long HEXAGON_VecPred128 __attribute__((__vector_size__(16)))
__attribute__((aligned(128)));
typedef long HEXAGON_VecPred256 __attribute__((__vector_size__(32)))
__attribute__((aligned(128)));
typedef long HEXAGON_Vect1024 __attribute__((__vector_size__(128)))
__attribute__((aligned(128)));
typedef long HEXAGON_Vect2048 __attribute__((__vector_size__(256)))
__attribute__((aligned(256)));
typedef long HEXAGON_UVect1024 __attribute__((__vector_size__(128)))
__attribute__((aligned(4)));
typedef long HEXAGON_UVect2048 __attribute__((__vector_size__(256)))
__attribute__((aligned(4)));
#define Q6S_VectorPredPair HEXAGON_VecPred256
#define Q6S_VectorPred HEXAGON_VecPred128
#define Q6S_Vector HEXAGON_Vect1024
#define Q6S_VectorPair HEXAGON_Vect2048
#define Q6S_UVector HEXAGON_UVect1024
#define Q6S_UVectorPair HEXAGON_UVect2048
#else /* __Q6S_ARCH__ >= 65 */
// V65 Vector types
#if __HVX_ARCH__ >= 65
#if defined __HVX__ && (__HVX_LENGTH__ == 128)
@ -1256,7 +1225,6 @@ private:
#endif /* defined __HVX__ && (__HVX_LENGTH__ == 64) */
#endif /* defined __HVX__ && (__HVX_LENGTH__ == 128) */
#endif /* __HVX_ARCH__ >= 65 */
#endif /* __Q6S_ARCH__ >= 65 */
/* Predicates */

File diff suppressed because it is too large

View File

@ -16,7 +16,7 @@
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
@ -282,7 +282,7 @@ _castu64_f64(unsigned long long __A) {
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32b(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
@ -303,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D)
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32w(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
@ -324,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D)
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
__crc32d(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
@ -346,7 +346,7 @@ __crc32d(unsigned int __C, unsigned int __D)
* \returns The result of adding operand \a __C to the CRC-32C checksum of
* operand \a __D.
*/
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42
static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
__crc32q(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
@ -435,7 +435,7 @@ __rorq(unsigned long long __X, int __C) {
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CAST
#undef __DEFAULT_FN_ATTRS_SSE42
#undef __DEFAULT_FN_ATTRS_CRC32
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#endif /* __IA32INTRIN_H */
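A minimal usage sketch for the intrinsics retargeted above (function and buffer names are illustrative; assumes clang 14 built with either -mcrc32, matching the new attribute, or -msse4.2, which implies it):

#include <x86intrin.h>
#include <stddef.h>

static unsigned int crc32c_bytes(const unsigned char *buf, size_t len) {
  unsigned int crc = 0xFFFFFFFFu;        /* customary initial value */
  for (size_t i = 0; i < len; ++i)
    crc = __crc32b(crc, buf[i]);         /* accumulate CRC-32C byte by byte */
  return crc ^ 0xFFFFFFFFu;
}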

View File

@ -10,6 +10,10 @@
#ifndef __IMMINTRIN_H
#define __IMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <x86gprintrin.h>
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
@ -210,6 +214,20 @@
#include <avx512pfintrin.h>
#endif
/*
* FIXME: The _Float16 type is legal only when the hardware supports float16
* operations. We use __AVX512FP16__ to detect whether float16 is supported;
* when it is not, the related header is not included.
*
*/
#if defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif
#if defined(__AVX512FP16__) && defined(__AVX512VL__)
#include <avx512vlfp16intrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512BF16__)
#include <avx512bf16intrin.h>
@ -525,13 +543,13 @@ extern "C" {
#if defined(__i386__) || defined(__x86_64__)
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
@ -539,13 +557,13 @@ _InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
#if defined(__x86_64__)
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
: "+r" (_Value), "+m" (*_Target) :: "memory");
return _Value;
}
@ -557,7 +575,7 @@ _InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
@ -565,7 +583,7 @@ _InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
long _Exchange, long _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
@ -575,7 +593,7 @@ _InterlockedCompareExchange_HLERelease(long volatile *_Destination,
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
@ -583,7 +601,7 @@ _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
static __inline__ __int64 __DEFAULT_FN_ATTRS
_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
__int64 _Exchange, __int64 _Comparand) {
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
__asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
: "+a" (_Comparand), "+m" (*_Destination)
: "r" (_Exchange) : "memory");
return _Comparand;
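A minimal sketch of the {att|intel} dialect-alternative syntax the hunks above switch to: the text before the '|' is emitted for AT&T output, the text after for -masm=intel, so the same header assembles under both dialects (the helper name is illustrative):

static inline long swap_long(long volatile *p, long v) {
  __asm__ __volatile__("xchg {%0, %1|%1, %0}"   /* AT&T form | Intel form */
                       : "+r"(v), "+m"(*p)
                       :
                       : "memory");
  return v;
}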

lib/include/intrin.h vendored
View File

@ -97,8 +97,9 @@ unsigned long __readcr8(void);
unsigned int __readdr(unsigned int);
#ifdef __i386__
unsigned char __readfsbyte(unsigned long);
unsigned __int64 __readfsqword(unsigned long);
unsigned short __readfsword(unsigned long);
unsigned long __readfsdword(unsigned long);
unsigned __int64 __readfsqword(unsigned long);
#endif
unsigned __int64 __readmsr(unsigned long);
unsigned __int64 __readpmc(unsigned long);
@ -149,10 +150,8 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
void __attribute__((__deprecated__(
"use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void);
void __attribute__((__deprecated__(
"use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void);
void _ReadBarrier(void);
void _ReadWriteBarrier(void);
unsigned int _rorx_u32(unsigned int, const unsigned int);
int _sarx_i32(int, unsigned int);
#if __STDC_HOSTED__
@ -163,8 +162,7 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
void _Store_HLERelease(long volatile *, long);
void _Store64_HLERelease(__int64 volatile *, __int64);
void _StorePointer_HLERelease(void *volatile *, void *);
void __attribute__((__deprecated__(
"use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void);
void _WriteBarrier(void);
unsigned __int32 xbegin(void);
void _xend(void);
@ -457,7 +455,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
"rep movsb\n"
"xchg {%%esi, %1|%1, esi}"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
@ -467,12 +467,14 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
unsigned long const *__src,
size_t __n) {
#if defined(__x86_64__)
__asm__ __volatile__("rep movsl"
__asm__ __volatile__("rep movs{l|d}"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
"rep movs{l|d}\n"
"xchg {%%esi, %1|%1, esi}"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
@ -487,7 +489,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
"rep movsw\n"
"xchg {%%esi, %1|%1, esi}"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
@ -496,7 +500,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
unsigned long __x,
size_t __n) {
__asm__ __volatile__("rep stosl"
__asm__ __volatile__("rep stos{l|d}"
: "+D"(__dst), "+c"(__n)
: "a"(__x)
: "memory");
@ -538,9 +542,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("xchgq %%rbx,%q1\n" \
__asm("xchg{q} {%%rbx, %q1|%q1, rbx}\n" \
"cpuid\n" \
"xchgq %%rbx,%q1" \
"xchg{q} {%%rbx, %q1|%q1, rbx}" \
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
@ -600,13 +604,17 @@ __readmsr(unsigned long __register) {
static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
unsigned __LPTRINT_TYPE__ __cr3_val;
__asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
__asm__ __volatile__(
"mov {%%cr3, %0|%0, cr3}"
: "=r"(__cr3_val)
:
: "memory");
return __cr3_val;
}
static __inline__ void __DEFAULT_FN_ATTRS
__writecr3(unsigned __INTPTR_TYPE__ __cr3_val) {
__asm__ ("mov %0, %%cr3" : : "r"(__cr3_val) : "memory");
__asm__ ("mov {%0, %%cr3|cr3, %0}" : : "r"(__cr3_val) : "memory");
}
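A hedged usage sketch for the MS-compatible string intrinsics rewritten above, each of which expands to a single rep movs/rep stos now emitted correctly for both asm dialects (assumes an MSVC-compatible target where <intrin.h> provides them; the wrapper name is illustrative):

#include <intrin.h>
#include <stddef.h>

static void fill_and_copy(unsigned long *dst, unsigned long *tmp,
                          unsigned long value, size_t count) {
  __stosd(dst, value, count);   /* rep stosd: dst[0..count) = value  */
  __movsd(tmp, dst, count);     /* rep movsd: copy count dwords      */
}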
#ifdef __cplusplus

View File

@ -99,7 +99,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
}
/// Wrap a 128-bit AES key from __key into a key handle and output in
/// ((__m128i*)__h) to ((__m128i*)__h) + 5 and a 32-bit value as return.
/// ((__m128i*)__h) to ((__m128i*)__h) + 2 and a 32-bit value as return.
/// The explicit source operand __htype specifies handle restrictions.
///
/// \headerfile <x86intrin.h>
@ -120,9 +120,6 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
/// MEM[__h+127:__h] := Handle[127:0] // AAD
/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
/// MEM[__h+511:__h+384] := 0 // Reserved for future usage
/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
/// OF := 0
/// SF := 0
/// ZF := 0
@ -136,7 +133,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
}
/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and
/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 3 and
/// a 32-bit value as return.
/// The explicit source operand __htype specifies handle restrictions.
///
@ -160,9 +157,6 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
/// MEM[__h+895:__h+768] := 0 // Reserved for future usage
/// OF := 0
/// SF := 0
/// ZF := 0
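A hedged sketch matching the corrected documentation above: the wrapped handle for a 128-bit key occupies three 128-bit slots, so the output buffer needs 48 bytes (assumes -mkl and Key Locker hardware; names are illustrative):

#include <immintrin.h>

static unsigned int wrap_aes128_key(__m128i key, __m128i handle_out[3]) {
  return _mm_encodekey128_u32(/* no handle restrictions */ 0u, key, handle_out);
}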

lib/include/limits.h vendored
View File

@ -62,6 +62,26 @@
#define CHAR_BIT __CHAR_BIT__
/* C2x 5.2.4.2.1 */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define BOOL_WIDTH __BOOL_WIDTH__
#define CHAR_WIDTH CHAR_BIT
#define SCHAR_WIDTH CHAR_BIT
#define UCHAR_WIDTH CHAR_BIT
#define USHRT_WIDTH __SHRT_WIDTH__
#define SHRT_WIDTH __SHRT_WIDTH__
#define UINT_WIDTH __INT_WIDTH__
#define INT_WIDTH __INT_WIDTH__
#define ULONG_WIDTH __LONG_WIDTH__
#define LONG_WIDTH __LONG_WIDTH__
#define ULLONG_WIDTH __LLONG_WIDTH__
#define LLONG_WIDTH __LLONG_WIDTH__
#define BITINT_MAXWIDTH __BITINT_MAXWIDTH__
#endif
#ifdef __CHAR_UNSIGNED__ /* -funsigned-char */
#define CHAR_MIN 0
#define CHAR_MAX UCHAR_MAX
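A small sketch of the new width macros added above; they are only exposed in C2x mode (e.g. clang -std=c2x), so portable code should guard for them:

#include <limits.h>
#include <stdio.h>

int main(void) {
#ifdef INT_WIDTH
  printf("int: %d bits, long long: %d bits\n", INT_WIDTH, LLONG_WIDTH);
#else
  puts("*_WIDTH macros require C2x");
#endif
  return 0;
}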

View File

@ -10,6 +10,10 @@
#ifndef __MMINTRIN_H
#define __MMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));

View File

@ -10,6 +10,10 @@
#ifndef __NMMINTRIN_H
#define __NMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
just include it now then. */
#include <smmintrin.h>

View File

@ -12,8 +12,8 @@
// Define extension macros
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// For SPIR all extensions are supported.
#if defined(__SPIR__)
// For SPIR and SPIR-V all extensions are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define cl_khr_subgroup_extended_types 1
#define cl_khr_subgroup_non_uniform_vote 1
#define cl_khr_subgroup_ballot 1
@ -25,12 +25,31 @@
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
#define cl_ext_float_atomics 1
#ifdef cl_khr_fp16
#define __opencl_c_ext_fp16_global_atomic_load_store 1
#define __opencl_c_ext_fp16_local_atomic_load_store 1
#define __opencl_c_ext_fp16_global_atomic_add 1
#define __opencl_c_ext_fp16_local_atomic_add 1
#define __opencl_c_ext_fp16_global_atomic_min_max 1
#define __opencl_c_ext_fp16_local_atomic_min_max 1
#endif
#ifdef cl_khr_fp64
#define __opencl_c_ext_fp64_global_atomic_add 1
#define __opencl_c_ext_fp64_local_atomic_add 1
#define __opencl_c_ext_fp64_global_atomic_min_max 1
#define __opencl_c_ext_fp64_local_atomic_min_max 1
#endif
#define __opencl_c_ext_fp32_global_atomic_add 1
#define __opencl_c_ext_fp32_local_atomic_add 1
#define __opencl_c_ext_fp32_global_atomic_min_max 1
#define __opencl_c_ext_fp32_local_atomic_min_max 1
#endif // defined(__SPIR__)
#endif // defined(__SPIR__) || defined(__SPIRV__)
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// Define feature macros for OpenCL C 2.0
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
#if (__OPENCL_CPP_VERSION__ == 100 || __OPENCL_C_VERSION__ == 200)
#define __opencl_c_pipes 1
#define __opencl_c_generic_address_space 1
#define __opencl_c_work_group_collective_functions 1
@ -45,12 +64,19 @@
#endif
// Define header-only feature macros for OpenCL C 3.0.
#if (__OPENCL_C_VERSION__ == 300)
// For the SPIR target all features are supported.
#if defined(__SPIR__)
#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
// For the SPIR and SPIR-V target all features are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__)
#endif // (__OPENCL_C_VERSION__ == 300)
#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
#if !defined(__opencl_c_generic_address_space)
// Internal feature macro to provide named (global, local, private) address
// space overloads for builtin functions that take a pointer argument.
#define __opencl_c_named_address_space_builtins 1
#endif // !defined(__opencl_c_generic_address_space)
// built-in scalar data types:
@ -329,11 +355,17 @@ typedef enum memory_scope {
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
#if defined(__opencl_c_atomic_scope_all_devices)
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
memory_scope_all_devices = memory_scope_all_svm_devices,
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
#endif // defined(__opencl_c_atomic_scope_all_devices)
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
/**
* Subgroups have different requirements on forward progress, so just test
* all the relevant macros.
* CL 3.0 sub-groups: "they are not guaranteed to make independent forward progress".
* KHR subgroups: "Subgroups within a workgroup are independent, make forward progress with respect to each other".
*/
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;
@ -473,12 +505,14 @@ typedef int clk_profiling_info;
#define MAX_WORK_DIM 3
#ifdef __opencl_c_device_enqueue
typedef struct {
unsigned int workDimension;
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
#endif // __opencl_c_device_enqueue
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@ -572,6 +606,28 @@ typedef struct {
#define as_intptr_t(x) __builtin_astype((x), intptr_t)
#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
// C++ for OpenCL - __remove_address_space
#if defined(__OPENCL_CPP_VERSION__)
template <typename _Tp> struct __remove_address_space { using type = _Tp; };
#if defined(__opencl_c_generic_address_space)
template <typename _Tp> struct __remove_address_space<__generic _Tp> {
using type = _Tp;
};
#endif
template <typename _Tp> struct __remove_address_space<__global _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__private _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__local _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__constant _Tp> {
using type = _Tp;
};
#endif
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
#define __kernel_exec(X, typen) __kernel \

lib/include/opencl-c.h vendored

File diff suppressed because it is too large

View File

@ -17,9 +17,18 @@
// We require std::math functions in the complex builtins below.
#include <cmath>
#ifdef __NVPTX__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
#endif // __NVPTX__
#ifdef __AMDGCN__
#define __OPENMP_AMDGCN__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_AMDGCN__
#endif // __AMDGCN__
#endif
// Grab the host header too.
@ -36,11 +45,11 @@
#ifndef _LIBCPP_STD_VER
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
device = {arch(amdgcn, nvptx, nvptx64)}, \
implementation = {extension(match_any, allow_templates)})
#include <complex_cmath.h>
#pragma omp end declare variant
#endif
#endif // _LIBCPP_STD_VER

View File

@ -17,10 +17,19 @@
// We require math functions in the complex builtins below.
#include <math.h>
#ifdef __NVPTX__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
#endif
#ifdef __AMDGCN__
#define __OPENMP_AMDGCN__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_AMDGCN__
#endif
#endif
// Grab the host header too.
#include_next <complex.h>
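A hedged sketch of what the two wrapper changes above enable: with the AMDGCN branch added, complex arithmetic inside a target region is lowered through the CUDA-style helpers (e.g. __muldc3) when offloading to an AMD GPU. Assumes an offload-capable clang, e.g. -fopenmp --offload-arch=gfx90a; the function name is illustrative:

#include <complex.h>

double _Complex square_on_device(double _Complex z) {
  double _Complex w;
#pragma omp target map(to : z) map(from : w)
  w = z * z;   /* complex multiply, lowered to __muldc3 on the device */
  return w;
}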

View File

@ -10,6 +10,10 @@
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <emmintrin.h>
/* Define the default attributes for the functions in this file. */

View File

@ -35,7 +35,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
@ -2319,6 +2319,7 @@ _mm_castsi128_pd(__m128i __A)
#else
#include_next <emmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* EMMINTRIN_H_ */

View File

@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <stdlib.h>

View File

@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
@ -1445,6 +1445,7 @@ extern __inline __m64
#else
#include_next <mmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _MMINTRIN_H_INCLUDED */

View File

@ -38,7 +38,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>
@ -145,6 +145,7 @@ _mm_lddqu_si128 (__m128i const *__P)
#else
#include_next <pmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* PMMINTRIN_H_ */

View File

@ -29,10 +29,10 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
#include <emmintrin.h>
#include <tmmintrin.h>
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -104,6 +104,7 @@ extern __inline __m128i
#else
#include_next <smmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _SMMINTRIN_H_ */

View File

@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
#include <altivec.h>
@ -490,6 +490,7 @@ _mm_mulhrs_pi16 (__m64 __A, __m64 __B)
#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* TMMINTRIN_H_ */

View File

@ -34,7 +34,7 @@
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED
#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
@ -1838,6 +1838,7 @@ do { \
#else
#include_next <xmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
#endif /* _XMMINTRIN_H_INCLUDED */

View File

@ -47,9 +47,12 @@ _m_prefetch(void *__P)
/// \param __P
/// A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(void *__P)
_m_prefetchw(volatile const void *__P)
{
__builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
__builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}
#endif /* __PRFCHWINTRIN_H */
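A hedged sketch of why the signature change above helps: with the new volatile const void * parameter, prefetch-for-write hints can be issued on read-only data without a cast (assumes a target with the PRFCHW feature enabled; names are illustrative):

#include <x86intrin.h>

static void prewarm(const double *table, int n) {
  for (int i = 0; i < n; i += 8)
    _m_prefetchw(&table[i]);   /* hint: these cache lines will be written soon */
}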

lib/include/riscv_vector.h vendored

File diff suppressed because it is too large

View File

@ -10,6 +10,10 @@
#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <tmmintrin.h>
/* Define the default attributes for the functions in this file. */
@ -231,7 +235,7 @@
/// 11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
(__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
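A minimal sketch: the extra outer parentheses added above only matter when the macro result is embedded in a larger expression; the rounding behaviour itself is unchanged (assumes SSE4.1 is enabled; the helper name is illustrative):

#include <smmintrin.h>

static __m128 floor_ps(__m128 v) {
  return _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}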
/// Copies three upper elements of the first 128-bit vector operand to
/// the corresponding three upper elements of the 128-bit result vector of
@ -272,8 +276,8 @@
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
/// values.
#define _mm_round_ss(X, Y, M) \
(__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M))
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
/// Rounds each element of the 128-bit vector of [2 x double] to an
/// integer value according to the rounding control specified by the second
@ -306,7 +310,7 @@
/// 11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
(__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
/// Copies the upper element of the first 128-bit vector operand to the
/// corresponding upper element of the 128-bit result vector of [2 x double].
@ -347,8 +351,8 @@
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
/// values.
#define _mm_round_sd(X, Y, M) \
(__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M))
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
@ -376,8 +380,8 @@
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
(__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M))
((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
(__v2df)(__m128d)(V2), (int)(M)))
/// Returns a 128-bit vector of [4 x float] where the values are selected
/// from either the first or second operand as specified by the third
@ -404,8 +408,8 @@
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
(__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M))
((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
(__v4sf)(__m128)(V2), (int)(M)))
/// Returns a 128-bit vector of [2 x double] where the values are
/// selected from either the first or second operand as specified by the
@ -513,8 +517,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
/// is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
(__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M))
((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
(__v8hi)(__m128i)(V2), (int)(M)))
/* SSE4 Dword Multiply Instructions. */
/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
@ -590,8 +594,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
(__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M))
((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
(__v4sf)(__m128)(Y), (M)))
/// Computes the dot product of the two 128-bit vectors of [2 x double]
/// and returns it in the elements of the 128-bit result vector of
@ -625,8 +629,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
/// each [2 x double] vector. If a bit is set, the dot product is returned in
/// the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) \
(__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M))
((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), (M)))
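A short usage sketch for the dot-product macro above: mask 0xF1 multiplies all four lanes and writes the sum only into lane 0, giving a horizontal dot product (assumes SSE4.1; the helper name is illustrative):

#include <smmintrin.h>

static float dot4(__m128 a, __m128 b) {
  return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}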
/* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a
@ -664,7 +668,7 @@ _mm_stream_load_si128 (__m128i const *__V)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -683,7 +687,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -702,7 +706,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -721,7 +725,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -740,7 +744,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -759,7 +763,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -778,7 +782,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
}
/// Compares the corresponding elements of two 128-bit vectors of
@ -797,7 +801,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
}
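A hedged sketch: the generic builtin the min/max intrinsics above now lower to also works directly on any vector type (assumes clang >= 14; the typedef and function are illustrative):

typedef int v4si __attribute__((vector_size(16)));

static v4si clamp_to_zero(v4si x) {
  return __builtin_elementwise_max(x, (v4si){0, 0, 0, 0});
}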
/* SSE4 Insertion and Extraction from XMM Register Instructions. */
@ -865,15 +869,13 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 10: Bits [95:64] of parameter \a X are returned. \n
/// 11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__ \
({ union { int __i; float __f; } __t; \
__t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
__t.__i;}))
#define _mm_extract_ps(X, N) \
__builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
{ (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
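A small sketch of why the do { ... } while (0) wrapper added above matters: the macro can now be used as a single statement, for example under an unbraced if/else (assumes SSE4.1; names are illustrative):

#include <smmintrin.h>

static float lane0_or_zero(__m128 v, int have_data) {
  float f;
  if (have_data)
    _MM_EXTRACT_FLOAT(f, v, 0);   /* expands to one statement */
  else
    f = 0.0f;
  return f;
}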
/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
an index suitable for _mm_insert_ps. */
@ -925,8 +927,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
(int)(I), (int)(N)))
/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
/// the 128-bit integer vector parameter, and then inserting the 32-bit
@ -957,8 +959,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
(int)(I), (int)(N)))
#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@ -988,8 +990,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
(__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N))
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
(long long)(I), (int)(N)))
#endif /* __x86_64__ */
/* Extract int from packed integer array at index. This returns the element
@ -1031,8 +1033,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 128-bit integer vector parameter and the remaining bits are assigned
/// zeros.
#define _mm_extract_epi8(X, N) \
(int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N))
((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N)))
/// Extracts a 32-bit element from the 128-bit integer vector of
/// [4 x i32], using the immediate value parameter \a N as a selector.
@ -1057,7 +1059,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) \
(int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
@ -1080,7 +1082,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// 1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
(long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64 */
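A minimal round-trip sketch for the insert/extract macros above; the lane index must be an integer constant expression (assumes SSE4.1; the function name is illustrative):

#include <smmintrin.h>

static int roundtrip_lane2(__m128i v, int x) {
  v = _mm_insert_epi32(v, x, 2);
  return _mm_extract_epi32(v, 2);
}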
/* SSE4 128-bit Packed Integer Comparisons. */
@ -1514,8 +1516,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \returns A 128-bit integer vector containing the sums of the sets of
/// absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
(__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M))
((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (M)))
/// Finds the minimum unsigned 16-bit element in the input 128-bit
/// vector of [8 x u16] and returns it and along with its index.
@ -1624,8 +1626,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpistrm(A, B, M) \
(__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1678,8 +1680,8 @@ _mm_minpos_epu16(__m128i __V)
/// 1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
(int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1738,9 +1740,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns a 128-bit integer vector representing the result mask of
/// the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
(__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -1797,9 +1799,9 @@ _mm_minpos_epu16(__m128i __V)
/// 1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Uses the immediate operand \a M to perform a comparison of string
@ -1849,8 +1851,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
(int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1898,8 +1900,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
(int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1946,8 +1948,8 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
(int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -1996,8 +1998,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
(int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with implicitly defined lengths that is contained in source operands
@ -2046,8 +2048,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
(int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M))
((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2100,9 +2102,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the bit mask is zero and the length of the string in
/// \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2154,9 +2156,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2207,9 +2209,9 @@ _mm_minpos_epu16(__m128i __V)
/// to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2262,9 +2264,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a A is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
/// Uses the immediate operand \a M to perform a comparison of string
/// data with explicitly defined lengths that is contained in source operands
@ -2316,9 +2318,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a B is less than the
/// maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
(int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
(__v16qi)(__m128i)(B), (int)(LB), \
(int)(M))
(int)(M)))
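A hedged sketch for the string-comparison macros above: this returns the index of the first byte of chunk that matches any byte in set, or 16 when nothing matches (assumes SSE4.2; names are illustrative):

#include <nmmintrin.h>

static int index_of_any(__m128i set, __m128i chunk) {
  return _mm_cmpistri(set, chunk,
                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                      _SIDD_LEAST_SIGNIFICANT);
}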
/* SSE4.2 Compare Packed Data -- Greater Than. */
/// Compares each of the corresponding 64-bit values of the 128-bit
@ -2340,91 +2342,10 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}
/* SSE4.2 Accumulate CRC32. */
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
return __builtin_ia32_crc32qi(__C, __D);
}
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
return __builtin_ia32_crc32hi(__C, __D);
}
/// Adds the first unsigned integer operand to the CRC-32C checksum of
/// the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
return __builtin_ia32_crc32si(__C, __D);
}
#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
/// unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
/// An unsigned integer operand to add to the CRC-32C checksum of operand
/// \a __D.
/// \param __D
/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
/// operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */
#undef __DEFAULT_FN_ATTRS
#include <popcntintrin.h>
#include <crc32intrin.h>
#endif /* __SMMINTRIN_H */

View File

@ -12,8 +12,12 @@
/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
* example, already has a Clang-compatible stdatomic.h header.
*
* Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
* explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
* to the clang resource header until that is fully supported.
*/
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>) && !defined(_MSC_VER)
# include_next <stdatomic.h>
#else
@ -40,6 +44,11 @@ extern "C" {
/* 7.17.2 Initialization */
#define ATOMIC_VAR_INIT(value) (value)
#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
#endif
#define atomic_init __c11_atomic_init
/* 7.17.3 Order and consistency */
@ -149,6 +158,10 @@ typedef _Atomic(uintmax_t) atomic_uintmax_t;
typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;
#define ATOMIC_FLAG_INIT { 0 }
#if __cplusplus >= 202002L && !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_FLAG_INIT was deprecated in C++20 but is not deprecated in C. */
#pragma clang deprecated(ATOMIC_FLAG_INIT)
#endif
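A small sketch of the preferred replacements now that the deprecation warnings above are in place: plain initialization works for objects with static storage, and atomic_init covers objects with dynamic lifetime.

#include <stdatomic.h>

static atomic_int counter = 0;        /* static initialization needs no macro */

void reset_counter(atomic_int *c) {
  atomic_init(c, 0);                  /* for dynamically created atomics */
}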
/* These should be provided by the libc implementation. */
#ifdef __cplusplus

lib/include/stdint.h vendored
View File

@ -461,6 +461,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT64_MAX INT64_C( 9223372036854775807)
# define INT64_MIN (-INT64_C( 9223372036854775807)-1)
# define UINT64_MAX UINT64_C(18446744073709551615)
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT64_WIDTH 64
# define INT64_WIDTH UINT64_WIDTH
# define __UINT_LEAST64_WIDTH UINT64_WIDTH
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __STDC_VERSION__ */
# define __INT_LEAST64_MIN INT64_MIN
# define __INT_LEAST64_MAX INT64_MAX
# define __UINT_LEAST64_MAX UINT64_MAX
@ -482,6 +494,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST64_MIN __INT_LEAST64_MIN
# define INT_FAST64_MAX __INT_LEAST64_MAX
# define UINT_FAST64_MAX __UINT_LEAST64_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_FAST64_WIDTH UINT_FAST64_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST64_MIN */
@ -495,6 +516,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MIN INT56_MIN
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX
# define __INT_LEAST32_MIN INT56_MIN
# define __INT_LEAST32_MAX INT56_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
@ -504,6 +526,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT56_MIN
# define __INT_LEAST8_MAX INT56_MAX
# define __UINT_LEAST8_MAX UINT56_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */
@ -517,6 +553,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MIN INT48_MIN
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX
# define __INT_LEAST32_MIN INT48_MIN
# define __INT_LEAST32_MAX INT48_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
@ -526,6 +563,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT48_MIN
# define __INT_LEAST8_MAX INT48_MAX
# define __UINT_LEAST8_MAX UINT48_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */
@ -539,6 +590,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MIN INT40_MIN
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX
# define __INT_LEAST32_MIN INT40_MIN
# define __INT_LEAST32_MAX INT40_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
@ -548,6 +600,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT40_MIN
# define __INT_LEAST8_MAX INT40_MAX
# define __UINT_LEAST8_MAX UINT40_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */
@ -555,6 +621,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MAX INT32_C(2147483647)
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)
# define __INT_LEAST32_MIN INT32_MIN
# define __INT_LEAST32_MAX INT32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
@ -564,6 +631,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT32_MIN
# define __INT_LEAST8_MAX INT32_MAX
# define __UINT_LEAST8_MAX UINT32_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */
#ifdef __INT_LEAST32_MIN
@ -573,6 +650,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST32_MIN __INT_LEAST32_MIN
# define INT_FAST32_MAX __INT_LEAST32_MAX
# define UINT_FAST32_MAX __UINT_LEAST32_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_FAST32_WIDTH UINT_FAST32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST32_MIN */
@ -586,12 +672,26 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST24_MIN INT24_MIN
# define INT_FAST24_MAX INT24_MAX
# define UINT_FAST24_MAX UINT24_MAX
# define __INT_LEAST16_MIN INT24_MIN
# define __INT_LEAST16_MAX INT24_MAX
# define __UINT_LEAST16_MAX UINT24_MAX
# define __INT_LEAST8_MIN INT24_MIN
# define __INT_LEAST8_MAX INT24_MAX
# define __UINT_LEAST8_MAX UINT24_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT24_WIDTH 24
# define INT24_WIDTH UINT24_WIDTH
# define UINT_LEAST24_WIDTH UINT24_WIDTH
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
# define UINT_FAST24_WIDTH UINT24_WIDTH
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT24_TYPE__ */
@ -599,12 +699,22 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INT16_MAX INT16_C(32767)
#define INT16_MIN (-INT16_C(32767)-1)
#define UINT16_MAX UINT16_C(65535)
# define __INT_LEAST16_MIN INT16_MIN
# define __INT_LEAST16_MAX INT16_MAX
# define __UINT_LEAST16_MAX UINT16_MAX
# define __INT_LEAST8_MIN INT16_MIN
# define __INT_LEAST8_MAX INT16_MAX
# define __UINT_LEAST8_MAX UINT16_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT16_WIDTH 16
# define INT16_WIDTH UINT16_WIDTH
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT16_TYPE__ */
#ifdef __INT_LEAST16_MIN
@ -614,6 +724,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST16_MIN __INT_LEAST16_MIN
# define INT_FAST16_MAX __INT_LEAST16_MAX
# define UINT_FAST16_MAX __UINT_LEAST16_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_FAST16_WIDTH UINT_FAST16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST16_MIN */
@ -621,9 +740,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT8_MAX INT8_C(127)
# define INT8_MIN (-INT8_C(127)-1)
# define UINT8_MAX UINT8_C(255)
# define __INT_LEAST8_MIN INT8_MIN
# define __INT_LEAST8_MAX INT8_MAX
# define __UINT_LEAST8_MAX UINT8_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT8_WIDTH 8
# define INT8_WIDTH UINT8_WIDTH
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT8_TYPE__ */
#ifdef __INT_LEAST8_MIN
@ -633,6 +761,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST8_MIN __INT_LEAST8_MIN
# define INT_FAST8_MAX __INT_LEAST8_MAX
# define UINT_FAST8_MAX __UINT_LEAST8_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_FAST8_WIDTH UINT_FAST8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST8_MIN */
/* Some utility macros */
@ -652,6 +789,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define PTRDIFF_MAX __PTRDIFF_MAX__
#define SIZE_MAX __SIZE_MAX__
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTPTR_WIDTH __INTPTR_WIDTH__
#define UINTPTR_WIDTH __UINTPTR_WIDTH__
#endif
/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
* is enabled. */
#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
@ -663,6 +810,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INTMAX_MAX __INTMAX_MAX__
#define UINTMAX_MAX __UINTMAX_MAX__
/* C2x 7.20.2.5 Width of greatest-width integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTMAX_WIDTH __INTMAX_WIDTH__
#define UINTMAX_WIDTH __UINTMAX_WIDTH__
#endif
/* C99 7.18.3 Limits of other integer types. */
#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
@ -689,5 +846,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
/* C2x 7.20.3.x Width of other integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
#define SIZE_WIDTH __SIZE_WIDTH__
#define WCHAR_WIDTH __WCHAR_WIDTH__
#define WINT_WIDTH __WINT_WIDTH__
#endif
#endif /* __STDC_HOSTED__ */
#endif /* __CLANG_STDINT_H */
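With the new C2x guards in place, the width macros become visible only under -std=c2x (or whatever value __STDC_VERSION__ takes once the standard is published). A minimal sketch of a consumer, assuming a hosted toolchain; the file name and compiler invocation are illustrative, not part of the commit:

/* widths.c - e.g. clang -std=c2x widths.c (invocation is an assumption) */
#include <stdint.h>
#include <stdio.h>

int main(void) {
#if __STDC_VERSION__ >= 202000L
  /* Only reachable in C2x mode, mirroring the guards added above. */
  printf("SIZE_WIDTH=%d INTPTR_WIDTH=%d INTMAX_WIDTH=%d\n",
         (int)SIZE_WIDTH, (int)INTPTR_WIDTH, (int)INTMAX_WIDTH);
  printf("INT_LEAST8_WIDTH=%d INT_FAST32_WIDTH=%d\n",
         (int)INT_LEAST8_WIDTH, (int)INT_FAST32_WIDTH);
#else
  puts("width macros are not exposed before C2x");
#endif
  return 0;
}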

View File

@ -10,6 +10,10 @@
#ifndef __TMMINTRIN_H
#define __TMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <pmmintrin.h>
/* Define the default attributes for the functions in this file. */
@ -49,7 +53,7 @@ _mm_abs_pi8(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
}
/// Computes the absolute value of each of the packed 16-bit signed
@ -85,7 +89,7 @@ _mm_abs_pi16(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
}
/// Computes the absolute value of each of the packed 32-bit signed
@ -121,7 +125,7 @@ _mm_abs_pi32(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)
{
return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
return (__m128i)__builtin_elementwise_abs((__v4si)__a);
}
/// Concatenates the two 128-bit integer vector operands, and
@ -145,8 +149,8 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) \
(__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n))
((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
(__v16qi)(__m128i)(b), (n)))
/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
@ -168,7 +172,7 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) \
(__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
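For reference, a minimal sketch of how the intrinsics touched above are typically exercised; the file name and the -mssse3 flag are assumptions, not part of the header:

/* ssse3_demo.c - e.g. clang -mssse3 ssse3_demo.c (invocation is an assumption) */
#include <tmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi8(-1, 2, -3, 4, -5, 6, -7, 8,
                            -9, 10, -11, 12, -13, 14, -15, 16);
  __m128i abs8 = _mm_abs_epi8(v);             /* per the change above, lowered via __builtin_elementwise_abs */
  __m128i cat  = _mm_alignr_epi8(abs8, v, 4); /* concatenate abs8:v, shift right by 4 bytes */
  signed char out[16];
  _mm_storeu_si128((__m128i *)out, cat);
  printf("%d %d\n", out[0], out[15]);
  return 0;
}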
/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].

View File

@ -172,7 +172,8 @@ typedef enum {
_UVRSC_CORE = 0, /* integer register */
_UVRSC_VFP = 1, /* vfp */
_UVRSC_WMMXD = 3, /* Intel WMMX data register */
_UVRSC_WMMXC = 4 /* Intel WMMX control register */
_UVRSC_WMMXC = 4, /* Intel WMMX control register */
_UVRSC_PSEUDO = 5 /* Special purpose pseudo register */
} _Unwind_VRS_RegClass;
typedef enum {

View File

@ -82,4 +82,4 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS_F
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
#endif
#endif // __VAESINTRIN_H

View File

@ -15,15 +15,15 @@
#define __VPCLMULQDQINTRIN_H
#define _mm256_clmulepi64_epi128(A, B, I) \
(__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
(__v4di)(__m256i)(B), \
(char)(I))
(char)(I)))
#ifdef __AVX512FINTRIN_H
#define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I))
(char)(I)))
#endif // __AVX512FINTRIN_H
#endif /* __VPCLMULQDQINTRIN_H */
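The extra outer parentheses here (and in the similar hunks in the x86 headers below) look like a macro-hygiene fix: each macro now expands to a single parenthesized expression, so a use such as sizeof cannot split the leading cast off from the builtin call. A contrived sketch under made-up names (widget and MACRO_NEW are not from the header):

#include <stdio.h>

static int widget(int x) { return x + 1; }

#define MACRO_NEW(x) ((long)widget(x))  /* mirrors the new, fully parenthesized style */

int main(void) {
  /* With the old style, sizeof (long)widget(1) would parse as sizeof(long)
     followed by a stray call and fail to compile; the wrapped form is fine. */
  printf("%zu\n", sizeof MACRO_NEW(1));
  return 0;
}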

View File

@ -276,12 +276,28 @@ wasm_i8x16_make(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3, int8_t __c4,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u8x16_make(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3,
uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7,
uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11,
uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15) {
return (v128_t)(__u8x16){__c0, __c1, __c2, __c3, __c4, __c5,
__c6, __c7, __c8, __c9, __c10, __c11,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i16x8_make(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7) {
return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u16x8_make(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3,
uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7) {
return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0,
int32_t __c1,
int32_t __c2,
@ -289,11 +305,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0,
return (v128_t)(__i32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_make(uint32_t __c0,
uint32_t __c1,
uint32_t __c2,
uint32_t __c3) {
return (v128_t)(__u32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_make(int64_t __c0,
int64_t __c1) {
return (v128_t)(__i64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_make(uint64_t __c0,
uint64_t __c1) {
return (v128_t)(__u64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_make(float __c0,
float __c1,
float __c2,
@ -324,6 +352,24 @@ wasm_i8x16_const(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u8x16_const(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3,
uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7,
uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11,
uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
__REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4)
__REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6)
__REQUIRE_CONSTANT(__c7) __REQUIRE_CONSTANT(__c8)
__REQUIRE_CONSTANT(__c9) __REQUIRE_CONSTANT(__c10)
__REQUIRE_CONSTANT(__c11) __REQUIRE_CONSTANT(__c12)
__REQUIRE_CONSTANT(__c13) __REQUIRE_CONSTANT(__c14)
__REQUIRE_CONSTANT(__c15) {
return (v128_t)(__u8x16){__c0, __c1, __c2, __c3, __c4, __c5,
__c6, __c7, __c8, __c9, __c10, __c11,
__c12, __c13, __c14, __c15};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7)
@ -334,6 +380,16 @@ wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u16x8_const(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3,
uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
__REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4)
__REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6)
__REQUIRE_CONSTANT(__c7) {
return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
@ -341,12 +397,25 @@ wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3)
return (v128_t)(__i32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u32x4_const(uint32_t __c0, uint32_t __c1, uint32_t __c2, uint32_t __c3)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
__REQUIRE_CONSTANT(__c3) {
return (v128_t)(__u32x4){__c0, __c1, __c2, __c3};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const(int64_t __c0,
int64_t __c1)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) {
return (v128_t)(__i64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const(uint64_t __c0,
uint64_t __c1)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) {
return (v128_t)(__u64x2){__c0, __c1};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_f32x4_const(float __c0, float __c1, float __c2, float __c3)
__REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
@ -366,21 +435,42 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_const_splat(int8_t __c)
__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_const_splat(uint8_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u8x16){__c, __c, __c, __c, __c, __c, __c, __c,
__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_const_splat(int16_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__i16x8){__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_const_splat(uint16_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u16x8){__c, __c, __c, __c, __c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_const_splat(int32_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__i32x4){__c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_const_splat(uint32_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u32x4){__c, __c, __c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const_splat(int64_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__i64x2){__c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const_splat(uint64_t __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__u64x2){__c, __c};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_const_splat(float __c)
__REQUIRE_CONSTANT(__c) {
return (v128_t)(__f32x4){__c, __c, __c, __c};
@ -396,6 +486,11 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) {
__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_splat(uint8_t __a) {
return (v128_t)(__u8x16){__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ int8_t __DEFAULT_FN_ATTRS wasm_i8x16_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
@ -417,10 +512,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_replace_lane(v128_t __a,
int __i,
uint8_t __b)
__REQUIRE_CONSTANT(__i) {
__u8x16 __v = (__u8x16)__a;
__v[__i] = __b;
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) {
return (v128_t)(__i16x8){__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_splat(uint16_t __a) {
return (v128_t)(__u16x8){__a, __a, __a, __a, __a, __a, __a, __a};
}
static __inline__ int16_t __DEFAULT_FN_ATTRS wasm_i16x8_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
@ -441,16 +549,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_replace_lane(
v128_t __a, int __i, uint16_t __b) __REQUIRE_CONSTANT(__i) {
__u16x8 __v = (__u16x8)__a;
__v[__i] = __b;
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_splat(int32_t __a) {
return (v128_t)(__i32x4){__a, __a, __a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_splat(uint32_t __a) {
return (v128_t)(__u32x4){__a, __a, __a, __a};
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
return ((__i32x4)__a)[__i];
}
static __inline__ uint32_t __DEFAULT_FN_ATTRS
wasm_u32x4_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) {
return ((__u32x4)__a)[__i];
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a,
int __i,
int32_t __b)
@ -460,16 +584,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_replace_lane(
v128_t __a, int __i, uint32_t __b) __REQUIRE_CONSTANT(__i) {
__u32x4 __v = (__u32x4)__a;
__v[__i] = __b;
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_splat(int64_t __a) {
return (v128_t)(__i64x2){__a, __a};
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_splat(uint64_t __a) {
return (v128_t)(__u64x2){__a, __a};
}
static __inline__ int64_t __DEFAULT_FN_ATTRS wasm_i64x2_extract_lane(v128_t __a,
int __i)
__REQUIRE_CONSTANT(__i) {
return ((__i64x2)__a)[__i];
}
static __inline__ uint64_t __DEFAULT_FN_ATTRS
wasm_u64x2_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) {
return ((__u64x2)__a)[__i];
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a,
int __i,
int64_t __b)
@ -479,6 +619,13 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a,
return (v128_t)__v;
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_replace_lane(
v128_t __a, int __i, uint64_t __b) __REQUIRE_CONSTANT(__i) {
__u64x2 __v = (__u64x2)__a;
__v[__i] = __b;
return (v128_t)__v;
}
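A short sketch of the newly added unsigned make/splat/lane-access helpers; the function name and the wasm32 target with -msimd128 are assumptions, not part of the header:

#include <stdint.h>
#include <wasm_simd128.h>

uint32_t u32_lane_demo(void) {
  v128_t v = wasm_u32x4_make(1u, 2u, 3u, 0xFFFFFFFFu);
  v = wasm_u32x4_replace_lane(v, 0, 42u);   /* lane index must be a compile-time constant */
  return wasm_u32x4_extract_lane(v, 0) + wasm_u32x4_extract_lane(v, 3);
}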
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_splat(float __a) {
return (v128_t)(__f32x4){__a, __a, __a, __a};
}
@ -804,7 +951,7 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i8x16_all_true(v128_t __a) {
return __builtin_wasm_all_true_i8x16((__i8x16)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i8x16((__i8x16)__a);
}
@ -813,17 +960,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) {
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i8x16)__a << __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i8x16)__a >> __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u8x16)__a >> __b);
}
@ -894,22 +1041,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i16x8_all_true(v128_t __a) {
return __builtin_wasm_all_true_i16x8((__i16x8)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i16x8((__i16x8)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i16x8)__a << __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i16x8)__a >> __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u16x8)__a >> __b);
}
@ -985,22 +1132,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i32x4_all_true(v128_t __a) {
return __builtin_wasm_all_true_i32x4((__i32x4)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i32x4((__i32x4)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i32x4)__a << __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i32x4)__a >> __b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u32x4)__a >> __b);
}
@ -1056,22 +1203,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i64x2_all_true(v128_t __a) {
return __builtin_wasm_all_true_i64x2((__i64x2)__a);
}
static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) {
return __builtin_wasm_bitmask_i64x2((__i64x2)__a);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i64x2)__a << (int64_t)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__i64x2)__a >> (int64_t)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_shr(v128_t __a,
int32_t __b) {
uint32_t __b) {
return (v128_t)((__u64x2)__a >> (int64_t)__b);
}
@ -1150,14 +1297,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_max(v128_t __a,
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a,
v128_t __b) {
__i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a);
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a,
v128_t __b) {
__i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b);
return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) {
@ -1220,14 +1365,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_max(v128_t __a,
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a,
v128_t __b) {
__i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a);
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b);
}
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a,
v128_t __b) {
__i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b);
return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b);
}
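For context, the removed mask-and-select sequences and the new __builtin_wasm_pmin/pmax calls compute the same lane-wise pseudo-minimum and pseudo-maximum (b < a ? b : a, and a < b ? b : a), which intentionally differ from IEEE min/max for NaN and signed zero; routing through the builtins presumably lets the backend emit the pmin/pmax instructions directly instead of pattern-matching the select. A scalar sketch of the per-lane rule (names are illustrative):

static inline float pmin_lane(float a, float b) { return b < a ? b : a; }
static inline float pmax_lane(float a, float b) { return a < b ? b : a; }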
static __inline__ v128_t __DEFAULT_FN_ATTRS

View File

@ -10,6 +10,10 @@
#ifndef __WMMINTRIN_H
#define __WMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <emmintrin.h>
#include <__wmmintrin_aes.h>

View File

@ -20,4 +20,16 @@
#include <uintrintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__CRC32__)
#include <crc32intrin.h>
#endif
#define __SSC_MARK(Tag) \
__asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
"mov {%0, %%ebx|ebx, %0}; " \
".byte 0x64, 0x67, 0x90; " \
"mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
: "%eax");
#endif /* __X86GPRINTRIN_H */
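The new __SSC_MARK macro emits the 0x64 0x67 0x90 marker sequence that simulation and profiling tools (Intel SDE, for instance) recognize as a region delimiter. A minimal sketch; the helper function is a placeholder, and the 0x111/0x222 tag values follow a common start/stop convention rather than anything mandated by the header:

#include <x86gprintrin.h>

void kernel_under_test(void);

void traced_region(void) {
  __SSC_MARK(0x111);   /* conventional "start" tag */
  kernel_under_test();
  __SSC_MARK(0x222);   /* conventional "stop" tag */
}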

View File

@ -10,6 +10,10 @@
#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
#include <mmintrin.h>
typedef int __v4si __attribute__((__vector_size__(16)));
@ -2181,7 +2185,7 @@ void _mm_sfence(void);
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
(int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
@ -2212,7 +2216,7 @@ void _mm_sfence(void);
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) \
(__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
/// Compares each of the corresponding packed 16-bit integer values of
/// the 64-bit integer vectors, and writes the greater value to the
@ -2359,7 +2363,7 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
(__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
@ -2601,8 +2605,8 @@ void _mm_setcsr(unsigned int __i);
/// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
(int)(mask))
((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
(int)(mask)))
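A minimal sketch of the _mm_shuffle_ps selector described above, using the _MM_SHUFFLE helper from this header; the file name and invocation are illustrative:

/* shuffle_demo.c - e.g. clang shuffle_demo.c (SSE is assumed available) */
#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
  __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
  /* Lower two result lanes come from a (indices 1 then 0),
     upper two from b (indices 2 then 3). */
  __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 0, 1));
  float out[4];
  _mm_storeu_ps(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 0 6 7 */
  return 0;
}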
/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].

View File

@ -225,16 +225,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
}
#define _mm_roti_epi8(A, N) \
(__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)))
#define _mm_roti_epi16(A, N) \
(__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)))
#define _mm_roti_epi32(A, N) \
(__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)))
#define _mm_roti_epi64(A, N) \
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))
((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)))
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shl_epi8(__m128i __A, __m128i __B)
@ -285,36 +285,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
}
#define _mm_com_epu8(A, B, N) \
(__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)))
#define _mm_com_epu16(A, B, N) \
(__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)))
#define _mm_com_epu32(A, B, N) \
(__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)))
#define _mm_com_epu64(A, B, N) \
(__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)))
#define _mm_com_epi8(A, B, N) \
(__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
(__v16qi)(__m128i)(B), (N)))
#define _mm_com_epi16(A, B, N) \
(__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
(__v8hi)(__m128i)(B), (N)))
#define _mm_com_epi32(A, B, N) \
(__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
(__v4si)(__m128i)(B), (N)))
#define _mm_com_epi64(A, B, N) \
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N))
((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
(__v2di)(__m128i)(B), (N)))
#define _MM_PCOMCTRL_LT 0
#define _MM_PCOMCTRL_LE 1
@ -710,23 +710,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
}
#define _mm_permute2_pd(X, Y, C, I) \
(__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
(__v2df)(__m128d)(Y), \
(__v2di)(__m128i)(C), (I))
(__v2di)(__m128i)(C), (I)))
#define _mm256_permute2_pd(X, Y, C, I) \
(__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
(__v4df)(__m256d)(Y), \
(__v4di)(__m256i)(C), (I))
(__v4di)(__m256i)(C), (I)))
#define _mm_permute2_ps(X, Y, C, I) \
(__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I))
((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(__v4si)(__m128i)(C), (I)))
#define _mm256_permute2_ps(X, Y, C, I) \
(__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
(__v8sf)(__m256)(Y), \
(__v8si)(__m256i)(C), (I))
(__v8si)(__m256i)(C), (I)))
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_frcz_ss(__m128 __A)