update C header files to clang 13 rc1

Andrew Kelley 2021-08-15 18:00:10 -07:00
parent 78ff2a148a
commit 21606339af
34 changed files with 156298 additions and 7363 deletions

View File

@ -166,6 +166,8 @@ __DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double log(double __a) { return __nv_log(__a); }
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
@ -270,8 +272,6 @@ __DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
__DEVICE__ float rnormf(int __dim, const float *__t) {
return __nv_rnormf(__dim, __t);
}
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }

View File

@ -349,9 +349,14 @@ extern "C" {
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
// __assertfail() used to have a `noreturn` attribute. Unfortunately that
// contributed to triggering the longstanding bug in ptxas when assert was used
// in sufficiently convoluted code. See
// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
__device__ void __assertfail(const char *__message, const char *__file,
unsigned __line, const char *__function,
size_t __charSize) __attribute__((noreturn));
size_t __charSize);
// In order for the standard assert() macro on Linux to work, we need to
// provide a device-side __assert_fail()
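To illustrate what this change affects (a sketch, not part of the commit; the function name is hypothetical, and <assert.h> is assumed to be included in a CUDA device compile):

// On the device, assert() lowers to a guarded call to __assertfail().
// Now that __assertfail() is no longer marked noreturn, ptxas treats the
// fall-through path after the assert as reachable, sidestepping the
// miscompile referenced in the comment above.
__device__ int checked_div(int a, int b) {
  assert(b != 0); // calls __assertfail() when the condition fails
  return a / b;
}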

View File

@ -14,6 +14,7 @@
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <limits>
#include <type_traits>
@ -21,6 +22,7 @@
#endif
#include <limits.h>
#include <stdint.h>
#endif // !defined(__HIPCC_RTC__)
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
@ -34,6 +36,9 @@ __DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float fma(float __x, float __y, float __z) {
return ::fmaf(__x, __y, __z);
}
#if !defined(__HIPCC_RTC__)
// The value returned by fpclassify is platform-dependent; therefore it is
// not supported by hipRTC.
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
@ -42,11 +47,51 @@ __DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
#endif // !defined(__HIPCC_RTC__)
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
#if defined(__OPENMP_AMDGCN__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ int isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ bool isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
#if defined(__OPENMP_AMDGCN__)
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
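As a usage sketch (not part of the commit; the function name is hypothetical): inside the variant region, an int-returning and a bool-returning flavor of each classifier coexist, and ordinary overloaded calls keep resolving as expected.

// Overload resolution picks the float and double classifiers declared
// above; the int-returning variants exist only to satisfy old system
// headers that declare isnan(float) returning int.
__DEVICE__ bool any_nan(float __a, double __b) {
  return isnan(__a) || isnan(__b);
}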
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
@ -59,8 +104,6 @@ __DEVICE__ bool isgreaterequal(float __x, float __y) {
__DEVICE__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
@ -79,8 +122,6 @@ __DEVICE__ bool islessgreater(float __x, float __y) {
__DEVICE__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isunordered(float __x, float __y) {
@ -207,11 +248,117 @@ template <bool __B, class __T = void> struct __hip_enable_if {};
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
namespace __hip {
template <class _Tp> struct is_integral {
enum { value = 0 };
};
template <> struct is_integral<bool> {
enum { value = 1 };
};
template <> struct is_integral<char> {
enum { value = 1 };
};
template <> struct is_integral<signed char> {
enum { value = 1 };
};
template <> struct is_integral<unsigned char> {
enum { value = 1 };
};
template <> struct is_integral<wchar_t> {
enum { value = 1 };
};
template <> struct is_integral<short> {
enum { value = 1 };
};
template <> struct is_integral<unsigned short> {
enum { value = 1 };
};
template <> struct is_integral<int> {
enum { value = 1 };
};
template <> struct is_integral<unsigned int> {
enum { value = 1 };
};
template <> struct is_integral<long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long> {
enum { value = 1 };
};
template <> struct is_integral<long long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long long> {
enum { value = 1 };
};
// TODO: specialize is_arithmetic<_Float16>
template <class _Tp> struct is_arithmetic {
enum { value = 0 };
};
template <> struct is_arithmetic<bool> {
enum { value = 1 };
};
template <> struct is_arithmetic<char> {
enum { value = 1 };
};
template <> struct is_arithmetic<signed char> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned char> {
enum { value = 1 };
};
template <> struct is_arithmetic<wchar_t> {
enum { value = 1 };
};
template <> struct is_arithmetic<short> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned short> {
enum { value = 1 };
};
template <> struct is_arithmetic<int> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned int> {
enum { value = 1 };
};
template <> struct is_arithmetic<long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long> {
enum { value = 1 };
};
template <> struct is_arithmetic<long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<float> {
enum { value = 1 };
};
template <> struct is_arithmetic<double> {
enum { value = 1 };
};
struct true_type {
static const __constant__ bool value = true;
};
struct false_type {
static const __constant__ bool value = false;
};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
template <typename __T> typename add_rvalue_reference<__T>::type declval();
// decltype is only available in C++11 and above.
#if __cplusplus >= 201103L
// __hip_promote
namespace __hip {
template <class _Tp> struct __numeric_type {
static void __test(...);
static _Float16 __test(_Float16);
@ -227,8 +374,8 @@ template <class _Tp> struct __numeric_type {
// No support for long double, use double instead.
static double __test(long double);
typedef decltype(__test(std::declval<_Tp>())) type;
static const bool value = !std::is_same<type, void>::value;
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <> struct __numeric_type<void> { static const bool value = true; };
@ -271,18 +418,17 @@ public:
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
} // namespace __hip
#endif //__cplusplus >= 201103L
} // namespace __hip
// __HIP_OVERLOAD1 is used to resolve function calls with an integer argument
// to avoid compilation errors due to ambiguity, e.g. floor(5) is resolved to
// floor(double).
#define __HIP_OVERLOAD1(__retty, __fn) \
template <typename __T> \
__DEVICE__ typename __hip_enable_if<std::numeric_limits<__T>::is_integer, \
__retty>::type \
__fn(__T __x) { \
__DEVICE__ \
typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
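For example (illustrative, not in the header; the function name is hypothetical), the generated overload is what makes an integral argument unambiguous:

// int satisfies __hip::is_integral, so this call instantiates the
// __HIP_OVERLOAD1 template and promotes the argument to double.
__DEVICE__ double floor_of_five() { return floor(5); }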
@ -293,8 +439,7 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ typename __hip_enable_if< \
std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \
typename __hip::__promote<__T1, __T2>::type>::type \
__fn(__T1 __x, __T2 __y) { \
typedef typename __hip::__promote<__T1, __T2>::type __result_type; \
@ -303,16 +448,14 @@ class __promote : public __promote_imp<_A1, _A2, _A3> {};
#else
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ \
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \
__hip::is_arithmetic<__T2>::value, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
#endif
__HIP_OVERLOAD1(double, abs)
__HIP_OVERLOAD1(double, acos)
__HIP_OVERLOAD1(double, acosh)
__HIP_OVERLOAD1(double, asin)
@ -336,7 +479,9 @@ __HIP_OVERLOAD1(double, floor)
__HIP_OVERLOAD2(double, fmax)
__HIP_OVERLOAD2(double, fmin)
__HIP_OVERLOAD2(double, fmod)
#if !defined(__HIPCC_RTC__)
__HIP_OVERLOAD1(int, fpclassify)
#endif // !defined(__HIPCC_RTC__)
__HIP_OVERLOAD2(double, hypot)
__HIP_OVERLOAD1(int, ilogb)
__HIP_OVERLOAD1(bool, isfinite)
@ -382,9 +527,8 @@ __HIP_OVERLOAD2(double, min)
#if __cplusplus >= 201103L
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ typename __hip_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
typename __hip::__promote<__T1, __T2, __T3>::type>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
@ -392,33 +536,32 @@ fma(__T1 __x, __T2 __y, __T3 __z) {
}
#else
template <typename __T1, typename __T2, typename __T3>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return ::fma((double)__x, (double)__y, (double)__z);
}
#endif
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
frexp(__T __x, int *__exp) {
return ::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
ldexp(__T __x, int __exp) {
return ::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
modf(__T __x, double *__exp) {
return ::modf((double)__x, __exp);
}
@ -426,8 +569,8 @@ __DEVICE__
#if __cplusplus >= 201103L
template <typename __T1, typename __T2>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
typename __hip::__promote<__T1, __T2>::type>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
@ -435,25 +578,24 @@ __DEVICE__
}
#else
template <typename __T1, typename __T2>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
__DEVICE__ typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return ::remquo((double)__x, (double)__y, __quo);
}
#endif
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbln(__T __x, long int __exp) {
return ::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__
typename __hip_enable_if<std::numeric_limits<__T>::is_integer, double>::type
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbn(__T __x, int __exp) {
return ::scalbn((double)__x, __exp);
}
@ -468,14 +610,15 @@ __DEVICE__
#endif // defined(__cplusplus)
// Define these overloads inside the namespace our standard library uses.
#if !defined(__HIPCC_RTC__)
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _LIBCPP_BEGIN_NAMESPACE_STD
// Pull the new overloads we defined above into namespace std.
// using ::abs; - This may be considered for C++.
@ -620,11 +763,13 @@ _LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
} // namespace std
#endif
#endif // _LIBCPP_END_NAMESPACE_STD
#endif // !defined(__HIPCC_RTC__)
// Define device-side math functions from <ymath.h> on MSVC.
#if !defined(__HIPCC_RTC__)
#if defined(_MSC_VER)
// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
@ -658,6 +803,7 @@ __DEVICE__ __attribute__((overloadable)) float _FSinh(float x, float y) {
}
#endif // defined(__cplusplus)
#endif // defined(_MSC_VER)
#endif // !defined(__HIPCC_RTC__)
#pragma pop_macro("__DEVICE__")

View File

@ -138,14 +138,22 @@ __device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
__device__ __attribute__((const)) float
__llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
__device__ __attribute__((const)) float
__llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
__device__ __attribute__((const)) float
__llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
__device__ __attribute__((const)) float
__llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
__device__ inline __attribute__((const)) float
__llvm_amdgcn_cos_f32(float __x) {
return __builtin_amdgcn_cosf(__x);
}
__device__ inline __attribute__((const)) float
__llvm_amdgcn_rcp_f32(float __x) {
return __builtin_amdgcn_rcpf(__x);
}
__device__ inline __attribute__((const)) float
__llvm_amdgcn_rsq_f32(float __x) {
return __builtin_amdgcn_rsqf(__x);
}
__device__ inline __attribute__((const)) float
__llvm_amdgcn_sin_f32(float __x) {
return __builtin_amdgcn_sinf(__x);
}
// END INTRINSICS
// END FLOAT
@ -269,10 +277,14 @@ __device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
double);
__device__ __attribute__((const)) double
__llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
__device__ __attribute__((const)) double
__llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
__device__ inline __attribute__((const)) double
__llvm_amdgcn_rcp_f64(double __x) {
return __builtin_amdgcn_rcp(__x);
}
__device__ inline __attribute__((const)) double
__llvm_amdgcn_rsq_f64(double __x) {
return __builtin_amdgcn_rsq(__x);
}
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);

View File

@ -13,11 +13,13 @@
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <algorithm>
#endif
#include <limits.h>
#include <stdint.h>
#endif // __HIPCC_RTC__
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
@ -36,7 +38,7 @@ template<bool>
struct __compare_result{};
template<>
struct __compare_result<true> {
static const bool valid;
static const __device__ bool valid;
};
__DEVICE__
@ -1260,6 +1262,7 @@ float min(float __x, float __y) { return fminf(__x, __y); }
__DEVICE__
double min(double __x, double __y) { return fmin(__x, __y); }
#if !defined(__HIPCC_RTC__)
__host__ inline static int min(int __arg1, int __arg2) {
return std::min(__arg1, __arg2);
}
@ -1267,6 +1270,7 @@ __host__ inline static int min(int __arg1, int __arg2) {
__host__ inline static int max(int __arg1, int __arg2) {
return std::max(__arg1, __arg2);
}
#endif // __HIPCC_RTC__
#endif
#pragma pop_macro("__DEVICE__")

View File

@ -18,52 +18,107 @@
#if __HIP__
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL;
#endif
#ifdef __cplusplus
extern "C" {
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_pure_virtual(void) {
__builtin_trap();
}
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_deleted_virtual(void) {
__builtin_trap();
}
}
#endif //__cplusplus
#if !defined(__HIPCC_RTC__)
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#else
typedef __SIZE_TYPE__ size_t;
// Define macros which are needed to declare HIP device APIs without standard
// C/C++ headers. This is for readability, so that these APIs can be written
// the same way as in the non-hipRTC use case. These macros need to be popped
// so that they do not pollute users' namespace.
#pragma push_macro("NULL")
#pragma push_macro("uint32_t")
#pragma push_macro("uint64_t")
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#define NULL (void *)0
#define uint32_t __UINT32_TYPE__
#define uint64_t __UINT64_TYPE__
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INTMAX_MAX__
#endif // __HIPCC_RTC__
typedef __SIZE_TYPE__ __hip_size_t;
#ifdef __cplusplus
extern "C" {
#endif //__cplusplus
#if __HIP_ENABLE_DEVICE_MALLOC__
extern "C" __device__ void *__hip_malloc(size_t __size);
extern "C" __device__ void *__hip_free(void *__ptr);
static inline __device__ void *malloc(size_t __size) {
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return __hip_malloc(__size);
}
static inline __device__ void *free(void *__ptr) { return __hip_free(__ptr); }
#else
static inline __device__ void *malloc(size_t __size) {
__builtin_trap();
return nullptr;
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
return __hip_free(__ptr);
}
static inline __device__ void *free(void *__ptr) {
#else
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
__builtin_trap();
return nullptr;
return (void *)0;
}
__attribute__((weak)) inline __device__ void *free(void *__ptr) {
__builtin_trap();
return (void *)0;
}
#endif
#ifdef __cplusplus
} // extern "C"
#endif //__cplusplus
#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
#if !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
#else
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_hip_cmath.h>
#include <__clang_cuda_complex_builtins.h>
#include <algorithm>
#include <complex>
#include <new>
#endif // !_OPENMP || __HIP_ENABLE_CUDA_WRAPPER_FOR_OPENMP__
#endif // __HIPCC_RTC__
#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
#if defined(__HIPCC_RTC__)
#pragma pop_macro("NULL")
#pragma pop_macro("uint32_t")
#pragma pop_macro("uint64_t")
#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
#endif // __HIPCC_RTC__
#endif // __HIP__
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__
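A usage sketch (not part of the commit; the function name is hypothetical), assuming the program is built with __HIP_ENABLE_DEVICE_MALLOC__ so the weak malloc/free shims above forward to __hip_malloc/__hip_free:

__device__ int *device_scratch(__hip_size_t n) {
  int *p = (int *)malloc(n * sizeof(int)); // forwards to __hip_malloc
  // ... use p; the caller must free(p) on the device ...
  return p;
}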

lib/include/altivec.h vendored (1584 changes): file diff suppressed because it is too large.

View File

@ -15,8 +15,13 @@
#define __AMXINTRIN_H
#ifdef __x86_64__
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS_TILE \
__attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@ -25,7 +30,7 @@
/// config and the tile data, and the tiles are zeroed. Any invalid
/// configurations will result in #GP fault.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
///
@ -41,7 +46,7 @@ _tile_loadconfig(const void *__config) {
/// palette, the number of bytes per row, and the number of rows. If tiles
/// are not configured, all zeroes will be stored to memory.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
///
@ -55,7 +60,7 @@ _tile_storeconfig(void *__config) {
/// Release the tile configuration to return to the init state, which
/// releases all storage it currently holds.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
@ -66,7 +71,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
@ -86,7 +91,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
@ -104,7 +109,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
@ -119,7 +124,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// Zero the tile specified by "tdest".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
@ -133,7 +138,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
@ -152,7 +157,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
@ -171,7 +176,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
@ -190,7 +195,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
@ -208,7 +213,7 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <x86intrin.h>
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
@ -221,10 +226,12 @@ static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
#define _tile_dpbf16ps(dst, src0, src1) \
__builtin_ia32_tdpbf16ps((dst), (src0), (src1))
#define __DEFAULT_FN_ATTRS_INT8 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
/// The AMX tile register size can be configured; the maximum size is
/// 16x64 = 1024 bytes. Since there is no 2D type in LLVM IR, we use a vector
/// type to represent the 2D tile, and its fixed size is the maximum AMX tile
/// register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
@ -232,12 +239,43 @@ _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
__SIZE_TYPE__ stride) {
return __builtin_ia32_tileloaddt164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
__SIZE_TYPE__ stride, _tile1024i tile) {
@ -245,34 +283,211 @@ _tile_stored_internal(unsigned short m, unsigned short n, void *base,
(__SIZE_TYPE__)(stride), tile);
}
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
/// This struct packs the shape and tile data together for the user. We suggest
/// initializing the struct as early as possible, because the compiler depends
/// on the shape information to do the tile configuration. Constant values are
/// preferred, as they enable compiler optimization.
typedef struct __tile1024i_str {
const unsigned short row;
const unsigned short col;
_tile1024i tile;
} __tile1024i;
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src1,
__tile1024i src2) {
dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile,
src1.tile, src2.tile);
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be loaded in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stream_loadd(__tile1024i *dst, const void *base,
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_INT8
static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param dst
/// A destination tile. Max size is 1024 Bytes.
/// \param base
/// A pointer to base address.
/// \param stride
/// The stride between the rows' data to be stored in memory.
__DEFAULT_FN_ATTRS_TILE
static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
_tile_stored_internal(src.row, src.col, base, stride, src.tile);
}
/// Zero the tile specified by "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param dst
/// The destination tile to be zero. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_TILE
static void __tile_zero(__tile1024i *dst) {
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
/// The destination tile. Max size is 1024 Bytes.
/// \param src0
/// The 1st source tile. Max size is 1024 Bytes.
/// \param src1
/// The 2nd source tile. Max size is 1024 Bytes.
__DEFAULT_FN_ATTRS_BF16
static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
__tile1024i src1) {
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
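A minimal usage sketch (not part of the commit; the function name and tile shapes are hypothetical), assuming an AMX-enabled compile (-mamx-tile -mamx-int8) and that the tile palette was configured beforehand via _tile_loadconfig():

// Multiply-accumulate one pair of int8 tiles into an int32 accumulator
// tile using the __tile1024i API declared above.
static void dot_tile_step(const void *a, const void *b, void *c,
                          __SIZE_TYPE__ stride) {
  __tile1024i ta = {16, 64}; // rows, colsb; constants help the compiler
  __tile1024i tb = {16, 64}; // configure the tiles
  __tile1024i tc = {16, 64};
  __tile_zero(&tc);             // TILEZERO: start the accumulator at zero
  __tile_loadd(&ta, a, stride); // TILELOADD
  __tile_loadd(&tb, b, stride);
  __tile_dpbssd(&tc, ta, tb);   // TDPBSSD: int8 x int8 -> int32 accumulate
  __tile_stored(c, stride, tc); // TILESTORED
}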

View File

@ -639,6 +639,49 @@ __jcvt(double __a) {
}
#endif
/* Armv8.5-A FP rounding intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint32zf(float __a) {
return __builtin_arm_frint32zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint32z(double __a) {
return __builtin_arm_frint32z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint64zf(float __a) {
return __builtin_arm_frint64zf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint64z(double __a) {
return __builtin_arm_frint64z(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint32xf(float __a) {
return __builtin_arm_frint32xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint32x(double __a) {
return __builtin_arm_frint32x(__a);
}
static __inline__ float __attribute__((__always_inline__, __nodebug__))
__frint64xf(float __a) {
return __builtin_arm_frint64xf(__a);
}
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__frint64x(double __a) {
return __builtin_arm_frint64x(__a);
}
#endif
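A small usage sketch (not part of the commit; the function name is hypothetical), assuming a target that defines __ARM_FEATURE_FRINT:

/* Round to the nearest integral value representable as a 32-bit integer,
   using the current rounding mode (FRINT32X). */
double round_to_int32_range(double x) { return __frint32x(x); }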
/* Armv8.7-A load/store 64-byte intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
typedef struct {
@ -709,6 +752,18 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#endif /* __ARM_FEATURE_TME */
/* Armv8.5-A Random number generation intrinsics */
#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__rndr(uint64_t *__p) {
return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__))
__rndrrs(uint64_t *__p) {
return __builtin_arm_rndrrs(__p);
}
#endif
#if defined(__cplusplus)
}
#endif
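A usage sketch (not part of the commit; the function name is hypothetical), assuming a target that defines __ARM_FEATURE_RNG:

/* __rndrrs() reseeds and reads the hardware random number generator;
   a nonzero return value means generation failed and should be retried. */
static inline uint64_t hw_random(void) {
  uint64_t value;
  while (__rndrrs(&value) != 0)
    ;
  return value;
}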

lib/include/arm_neon.h vendored (1242 changes): file diff suppressed because it is too large.

lib/include/arm_sve.h vendored (17687 changes): file diff suppressed because it is too large.

View File

@ -9300,8 +9300,11 @@ _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
* computations. In vector-reduction arithmetic, the evaluation order is
* independent of the order of the input elements of V.
* For floating point types, we always assume the elements are reassociable even
* if -fast-math is off.
* For floating-point intrinsics:
* 1. When using fadd/fmul intrinsics, the order of operations within the
* vector is unspecified (associative math).
* 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
* produce unspecified results.
* We use a bisection method: at each step, we partition the vector from the
* previous step in half, and the operation is performed on its two halves.
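An illustrative consequence for users (a sketch, not part of the commit; the function name is hypothetical):

// The reduce-add below may combine elements in any order, e.g. pairwise
// halves rather than left to right, so the result can differ in the last
// ulp from a serial sum. Likewise, fmin/fmax reductions give unspecified
// results when the vector contains NaN or -0.0.
static inline float sum16(__m512 v) { return _mm512_reduce_add_ps(v); }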
@ -9524,75 +9527,49 @@ _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
return __builtin_ia32_reduce_umin_d512((__v16si)__V);
}
#define _mm512_mask_reduce_operator(op) \
__m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
__m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
__m256d __t3 = _mm256_##op(__t1, __t2); \
__m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
__m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
__m128d __t6 = _mm_##op(__t4, __t5); \
__m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
__m128d __t8 = _mm_##op(__t6, __t7); \
return __t8[0]
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
_mm512_mask_reduce_operator(max_pd);
return __builtin_ia32_reduce_fmax_pd512(__V);
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_min_pd(__m512d __V) {
_mm512_mask_reduce_operator(min_pd);
return __builtin_ia32_reduce_fmin_pd512(__V);
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
__V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
_mm512_mask_reduce_operator(max_pd);
return __builtin_ia32_reduce_fmax_pd512(__V);
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
__V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
_mm512_mask_reduce_operator(min_pd);
return __builtin_ia32_reduce_fmin_pd512(__V);
}
#undef _mm512_mask_reduce_operator
#define _mm512_mask_reduce_operator(op) \
__m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
__m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
__m256 __t3 = _mm256_##op(__t1, __t2); \
__m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
__m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
__m128 __t6 = _mm_##op(__t4, __t5); \
__m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
__m128 __t8 = _mm_##op(__t6, __t7); \
__m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
__m128 __t10 = _mm_##op(__t8, __t9); \
return __t10[0]
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
_mm512_mask_reduce_operator(max_ps);
return __builtin_ia32_reduce_fmax_ps512(__V);
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ps(__m512 __V) {
_mm512_mask_reduce_operator(min_ps);
return __builtin_ia32_reduce_fmin_ps512(__V);
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
__V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
_mm512_mask_reduce_operator(max_ps);
return __builtin_ia32_reduce_fmax_ps512(__V);
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
__V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
_mm512_mask_reduce_operator(min_ps);
return __builtin_ia32_reduce_fmin_ps512(__V);
}
#undef _mm512_mask_reduce_operator
/// Moves the least significant 32 bits of a vector of [16 x i32] to a
/// 32-bit signed integer value.
@ -9611,6 +9588,169 @@ _mm512_cvtsi512_si32(__m512i __A) {
return __b[0];
}
/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
/// locations starting at location \a base_addr at packed 32-bit integer indices
/// stored in the lower half of \a vindex, scaled by \a scale, and stores them in dst.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_i32logather_pd(vindex, base_addr, scale) \
_mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale into dst using writemask
/// \a mask (elements are copied from \a src when the corresponding mask bit is
/// not set).
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ELSE
/// dst[i+63:i] := src[i+63:i]
/// FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
_mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
(base_addr), (scale))
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
_mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst using writemask \a mask (elements
/// are copied from \a src when the corresponding mask bit is not set).
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// dst[i+63:i] := MEM[addr+63:addr]
/// ELSE
/// dst[i+63:i] := src[i+63:i]
/// FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endoperation
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
_mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
(base_addr), (scale))
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// and to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endoperation
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
_mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale. Only those elements
/// whose corresponding mask bit is set in writemask \a mask are written to
/// memory.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// FI
/// ENDFOR
/// \endoperation
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
_mm512_mask_i32scatter_pd((base_addr), (mask), \
_mm512_castsi512_si256(vindex), (v1), (scale))
/// Stores the 8 packed 64-bit integer elements located in \a v1 to memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in \a vindex scaled by \a scale.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// ENDFOR
/// \endoperation
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
_mm512_i32scatter_epi64((base_addr), \
_mm512_castsi512_si256(vindex), (v1), (scale))
/// Stores the 8 packed 64-bit integer elements located in \a v1 to memory
/// locations starting at location \a base_addr at packed 32-bit integer
/// indices stored in \a vindex scaled by \a scale, using writemask \a mask
/// (elements whose corresponding mask bit is not set are not written to memory).
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// i := j*64
/// m := j*32
/// IF mask[j]
/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
/// MEM[addr+63:addr] := a[i+63:i]
/// FI
/// ENDFOR
/// \endoperation
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
_mm512_mask_i32scatter_epi64((base_addr), (mask), \
_mm512_castsi512_si256(vindex), (v1), (scale))
#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS
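A usage sketch of the new convenience macros (not part of the commit; the function name is hypothetical), assuming an AVX-512F compile:

// Gather eight doubles through 32-bit indices held in the low 256 bits of
// a 512-bit index vector, then scatter them back out. The scale of 8
// matches sizeof(double).
static inline void permute_doubles(const double *src, double *dst,
                                   __m512i vindex) {
  __m512d v = _mm512_i32logather_pd(vindex, src, 8);
  _mm512_i32loscatter_pd(dst, vindex, v, 8);
}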

lib/include/builtins.h vendored (new file, 16 lines)

View File

@ -0,0 +1,16 @@
/*===---- builtins.h - Standard header for extra builtins -----------------===*\
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
\*===----------------------------------------------------------------------===*/
/// Some legacy compilers have builtin definitions in a file named builtins.h.
/// This header file has been added to allow compatibility with code that was
/// written for those compilers. Code may include this file, so an empty file
/// with this name is provided here to avoid an error.
#ifndef __BUILTINS_H
#define __BUILTINS_H
#endif /* __BUILTINS_H */

View File

@ -72,8 +72,16 @@
#define _GLIBCXX_USE_C99_COMPLEX 0
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
// Work around a compatibility issue with libstdc++ 11.1.0
// https://bugs.llvm.org/show_bug.cgi?id=50383
#pragma push_macro("__failed_assertion")
#if _GLIBCXX_RELEASE == 11
#define __failed_assertion __cuda_failed_assertion
#endif
#include_next <complex>
#pragma pop_macro("__failed_assertion")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")

View File

@ -0,0 +1,298 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_
#define _HEXAGON_CIRC_BREV_INTRINSICS_H_ 1
#include <hexagon_protos.h>
#include <stdint.h>
/* Circular Load */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_D(Word64 dst, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_D(dest,ptr,incr,bufsize,K) \
{ ptr = (int64_t *) HEXAGON_circ_ldd (ptr, &(dest), ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_W(Word32 dst, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_W(dest,ptr,incr,bufsize,K) \
{ ptr = (int *) HEXAGON_circ_ldw (ptr, &(dest), (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_H(Word16 dst, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_H(dest,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_ldh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_UH( UWord16 dst, UWord16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_UH(dest,ptr,incr,bufsize,K) \
{ ptr = (uint16_t *) HEXAGON_circ_lduh (ptr, &(dest), ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_B(Word8 dst, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_B(dest,ptr,incr,bufsize,K) \
{ ptr = (int8_t *) HEXAGON_circ_ldb (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_load_update_UB(dest,ptr,incr,bufsize,K) \
{ ptr = (uint8_t *) HEXAGON_circ_ldub (ptr, &(dest), ((((K)-2)<<24)|(bufsize)), incr); }
/* Circular Store */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_D(Word64 *src, Word64 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_D(src,ptr,incr,bufsize,K) \
{ ptr = (int64_t *) HEXAGON_circ_std (ptr, src, ((((K)+1)<<24)|((bufsize)<<3)), ((incr)*8)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_W(Word32 *src, Word32 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_W(src,ptr,incr,bufsize,K) \
{ ptr = (int *) HEXAGON_circ_stw (ptr, src, (((K)<<24)|((bufsize)<<2)), ((incr)*4)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_HL(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_HL(src,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_sth (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_HH(Word16 *src, Word16 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_HH(src,ptr,incr,bufsize,K) \
{ ptr = (int16_t *) HEXAGON_circ_sthhi (ptr, src, ((((K)-1)<<24)|((bufsize)<<1)), ((incr)*2)); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_circ_store_update_B(Word8 *src, Word8 *ptr, UWord32 incr, UWord32 bufsize, UWord32 K)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_circ_store_update_B(src,ptr,incr,bufsize,K) \
{ ptr = (int8_t *) HEXAGON_circ_stb (ptr, src, ((((K)-2)<<24)|(bufsize)), incr); }
/* Bit Reverse Load */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_D(Word64 dst, Word64 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_D(dest,ptr,log2bufsize) \
{ ptr = (int64_t *) HEXAGON_brev_ldd (ptr, &(dest), (1<<(16-((log2bufsize) + 3)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_W(Word32 dst, Word32 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_W(dest,ptr,log2bufsize) \
{ ptr = (int *) HEXAGON_brev_ldw (ptr, &(dest), (1<<(16-((log2bufsize) + 2)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_H(Word16 dst, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_H(dest,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_ldh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_UH(UWord16 dst, UWord16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_UH(dest,ptr,log2bufsize) \
{ ptr = (uint16_t *) HEXAGON_brev_lduh (ptr, &(dest), (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_B(Word8 dst, Word8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_B(dest,ptr,log2bufsize) \
{ ptr = (int8_t *) HEXAGON_brev_ldb (ptr, &(dest), (1<<(16-((log2bufsize))))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_load_update_UB(UWord8 dst, UWord8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_load_update_UB(dest,ptr,log2bufsize) \
{ ptr = (uint8_t *) HEXAGON_brev_ldub (ptr, &(dest), (1<<(16-((log2bufsize))))); }
/* Bit Reverse Store */
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_D(Word64 *src, Word64 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_D(src,ptr,log2bufsize) \
{ ptr = (int64_t *) HEXAGON_brev_std (ptr, src, (1<<(16-((log2bufsize) + 3)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_W(Word32 *src, Word32 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_W(src,ptr,log2bufsize) \
{ ptr = (int *) HEXAGON_brev_stw (ptr, src, (1<<(16-((log2bufsize) + 2)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_HL(Word16 *src, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_HL(src,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_sth (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_HH(Word16 *src, Word16 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_HH(src,ptr,log2bufsize) \
{ ptr = (int16_t *) HEXAGON_brev_sthhi (ptr, src, (1<<(16-((log2bufsize) + 1)))); }
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: void Q6_bitrev_store_update_B(Word8 *src, Word8 *ptr, UWord32 Iu4)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#define Q6_bitrev_store_update_B(src,ptr,log2bufsize) \
{ ptr = (int8_t *) HEXAGON_brev_stb (ptr, src, (1<<(16-((log2bufsize))))); }
#define HEXAGON_circ_ldd __builtin_circ_ldd
#define HEXAGON_circ_ldw __builtin_circ_ldw
#define HEXAGON_circ_ldh __builtin_circ_ldh
#define HEXAGON_circ_lduh __builtin_circ_lduh
#define HEXAGON_circ_ldb __builtin_circ_ldb
#define HEXAGON_circ_ldub __builtin_circ_ldub
#define HEXAGON_circ_std __builtin_circ_std
#define HEXAGON_circ_stw __builtin_circ_stw
#define HEXAGON_circ_sth __builtin_circ_sth
#define HEXAGON_circ_sthhi __builtin_circ_sthhi
#define HEXAGON_circ_stb __builtin_circ_stb
#define HEXAGON_brev_ldd __builtin_brev_ldd
#define HEXAGON_brev_ldw __builtin_brev_ldw
#define HEXAGON_brev_ldh __builtin_brev_ldh
#define HEXAGON_brev_lduh __builtin_brev_lduh
#define HEXAGON_brev_ldb __builtin_brev_ldb
#define HEXAGON_brev_ldub __builtin_brev_ldub
#define HEXAGON_brev_std __builtin_brev_std
#define HEXAGON_brev_stw __builtin_brev_stw
#define HEXAGON_brev_sth __builtin_brev_sth
#define HEXAGON_brev_sthhi __builtin_brev_sthhi
#define HEXAGON_brev_stb __builtin_brev_stb
#ifdef __HVX__
/* ==========================================================================
Assembly Syntax: if (Qt) vmem(Rt+#0) = Vs
C Intrinsic Prototype: void Q6_vmaskedstoreq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstoreq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstoreq)
/* ==========================================================================
Assembly Syntax: if (!Qt) vmem(Rt+#0) = Vs
C Intrinsic Prototype: void Q6_vmaskedstorenq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorenq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorenq)
/* ==========================================================================
Assembly Syntax: if (Qt) vmem(Rt+#0):nt = Vs
C Intrinsic Prototype: void Q6_vmaskedstorentq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorentq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentq)
/* ==========================================================================
Assembly Syntax: if (!Qt) vmem(Rt+#0):nt = Vs
C Intrinsic Prototype: void Q6_vmaskedstorentnq_QAV(HVX_VectorPred Qt, HVX_VectorAddress A, HVX_Vector Vs)
Instruction Type: COPROC_VMEM
Execution Slots: SLOT0
========================================================================== */
#define Q6_vmaskedstorentnq_QAV __BUILTIN_VECTOR_WRAP(__builtin_HEXAGON_V6_vmaskedstorentnq)
#endif
#endif /* #ifndef _HEXAGON_CIRC_BREV_INTRINSICS_H_ */
#ifdef __NOT_DEFINED__
/*** comment block template ***/
/* ==========================================================================
Assembly Syntax: Return=instruction()
C Intrinsic Prototype: ReturnType Intrinsic(ParamType Rs, ParamType Rt)
Instruction Type: InstructionType
Execution Slots: SLOT0123
========================================================================== */
#endif /*** __NOT_DEFINED__ ***/
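/* A minimal usage sketch of the circular-load macros above, assuming a
   Hexagon target. The buffer length (8 halfwords) and the K value are
   illustrative only; bufsize is passed in halfwords (the modifier shifts
   it left by 1), and the exact K encoding is described in the Hexagon
   PRM. */
static int16_t ring[8];
static int sum64(void)
{
    int16_t *p = ring;   /* must point into the circular region */
    int16_t v;
    int sum = 0;
    for (int i = 0; i < 64; ++i) {
        /* v = *p, then p post-increments by one halfword, wrapping at
           the end of the 8-element buffer. */
        Q6_circ_load_update_H(v, p, 1, 8, 4);
        sum += v;
    }
    return sum;
}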

8450
lib/include/hexagon_protos.h vendored Normal file

File diff suppressed because it is too large

2653
lib/include/hexagon_types.h vendored Normal file

File diff suppressed because it is too large

4392
lib/include/hvx_hexagon_protos.h vendored Normal file

File diff suppressed because it is too large

View File

@ -72,11 +72,6 @@
#include <f16cintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
#endif
/* No feature check desired due to internal checks */
#include <bmiintrin.h>
@ -230,6 +225,11 @@
#include <pkuintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__VPCLMULQDQ__)
#include <vpclmulqdqintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__VAES__)
#include <vaesintrin.h>

49
lib/include/intrin.h vendored
View File

@ -451,24 +451,47 @@ unsigned char _InterlockedCompareExchange128_rel(__int64 volatile *_Destination,
static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
unsigned char const *__src,
size_t __n) {
__asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n)
: : "memory");
#if defined(__x86_64__)
__asm__ __volatile__("rep movsb"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
#endif
}
static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
unsigned long const *__src,
size_t __n) {
#if defined(__x86_64__)
__asm__ __volatile__("rep movsl"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
#endif
}
static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
unsigned short const *__src,
size_t __n) {
#if defined(__x86_64__)
__asm__ __volatile__("rep movsw"
: "+D"(__dst), "+S"(__src), "+c"(__n)
:
: "memory");
#else
__asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
: "+D"(__dst), "+r"(__src), "+c"(__n)
:
: "memory");
#endif
}
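/* The i386 branches above avoid the "S" (ESI) register constraint, since
   ESI can be reserved in some 32-bit configurations, and instead swap it
   in and out around the string instruction by hand. Usage is unchanged;
   a minimal hedged sketch (the function name is illustrative): */
static void copy_dwords(unsigned long *dst, const unsigned long *src) {
  __movsd(dst, src, 16); /* rep movsl with ECX = 16 */
}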
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
unsigned long __x,
@ -507,16 +530,26 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|* Misc
\*----------------------------------------------------------------------------*/
#if defined(__i386__) || defined(__x86_64__)
#if defined(__i386__)
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("cpuid" \
: "=a"(__eax), "=b"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#else
/* x86-64 uses %rbx as the base register, so preserve it. */
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
__asm("xchgq %%rbx,%q1\n" \
"cpuid\n" \
"xchgq %%rbx,%q1" \
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
: "0"(__leaf), "2"(__count))
#endif
static __inline__ void __DEFAULT_FN_ATTRS __cpuid(int __info[4], int __level) {
__asm__("cpuid"
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
: "a"(__level), "c"(0));
__cpuid_count(__level, 0, __info[0], __info[1], __info[2], __info[3]);
}
static __inline__ void __DEFAULT_FN_ATTRS __cpuidex(int __info[4], int __level,
int __ecx) {
__asm__("cpuid"
: "=a"(__info[0]), "=b"(__info[1]), "=c"(__info[2]), "=d"(__info[3])
: "a"(__level), "c"(__ecx));
__cpuid_count(__level, __ecx, __info[0], __info[1], __info[2], __info[3]);
}
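/* Routing __cpuid and __cpuidex through __cpuid_count means both now
   preserve RBX on x86-64, where it serves as the base register. A hedged
   sketch reading the 12-byte vendor string from leaf 0 (the helper name
   is ours, not part of this header): */
static void cpu_vendor(char out[13]) {
  int info[4];
  __cpuid(info, 0);                       /* vendor id in EBX,EDX,ECX */
  __builtin_memcpy(out + 0, &info[1], 4); /* EBX */
  __builtin_memcpy(out + 4, &info[3], 4); /* EDX */
  __builtin_memcpy(out + 8, &info[2], 4); /* ECX */
  out[12] = '\0';                         /* e.g. "GenuineIntel" */
}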
static __inline__ void __DEFAULT_FN_ATTRS __halt(void) {
__asm__ volatile("hlt");

View File

@ -230,10 +230,12 @@ _mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
@ -267,10 +269,12 @@ _mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
@ -304,10 +308,12 @@ _mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
/// IF (IllegalHandle)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF (Authentic == 0)
/// ZF := 1
/// MEM[__odata+127:__odata] := 0
/// ELSE
/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
/// ZF := 0
@ -354,10 +360,16 @@ _mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
@ -394,10 +406,16 @@ _mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
@ -434,10 +452,16 @@ _mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
@ -474,10 +498,16 @@ _mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void*
/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 )
/// IF (IllegalHandle)
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
/// IF Authentic == 0
/// ZF := 1
/// FOR i := 0 to 7
/// __odata[i] := 0
/// ENDFOR
/// ELSE
/// FOR i := 0 to 7
/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
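/// The added pseudocode lines document that a failed unwrap now zeroes
/// the output as well as setting ZF, rather than leaving it unspecified.
/// A hedged usage sketch; `h` is assumed to be a handle produced earlier
/// by _mm_encodekey128_u32, and the error path is illustrative:
///
///   __m128i ct;
///   if (_mm_aesenc128kl_u8(&ct, pt, h)) {
///     /* ZF = 1: illegal or inauthentic handle; ct has been zeroed. */
///     return -1;
///   }
///   /* ZF = 0: ct = AES128Encrypt(pt, unwrapped key). */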

View File

@ -21,9 +21,37 @@
#define cl_khr_subgroup_shuffle 1
#define cl_khr_subgroup_shuffle_relative 1
#define cl_khr_subgroup_clustered_reduce 1
#define cl_khr_extended_bit_ops 1
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
#endif // defined(__SPIR__)
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// Define feature macros for OpenCL C 2.0
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
#define __opencl_c_pipes 1
#define __opencl_c_generic_address_space 1
#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_acq_rel 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_device_enqueue 1
#define __opencl_c_read_write_images 1
#define __opencl_c_program_scope_global_variables 1
#define __opencl_c_images 1
#endif
// Define header-only feature macros for OpenCL C 3.0.
#if (__OPENCL_C_VERSION__ == 300)
// For the SPIR target all features are supported.
#if defined(__SPIR__)
#define __opencl_c_atomic_scope_all_devices 1
#endif // defined(__SPIR__)
#endif // (__OPENCL_C_VERSION__ == 300)
// built-in scalar data types:
/**
@ -141,7 +169,9 @@ typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#if defined(__OPENCL_CPP_VERSION__)
#define NULL nullptr
#elif defined(__OPENCL_C_VERSION__)
#define NULL ((void*)0)
#endif
@ -297,7 +327,12 @@ typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
#if defined(__opencl_c_atomic_scope_all_devices)
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
memory_scope_all_devices = memory_scope_all_svm_devices,
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
#endif // defined(__opencl_c_atomic_scope_all_devices)
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
@ -322,7 +357,9 @@ typedef enum memory_order
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
#if defined(__opencl_c_atomic_order_seq_cst)
memory_order_seq_cst = __ATOMIC_SEQ_CST
#endif
} memory_order;
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
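// Because seq_cst ordering is now feature-gated, portable kernels should
// test the macros before relying on it. A minimal hedged sketch (kernel
// name and flag layout are illustrative):
#if defined(__opencl_c_atomic_order_seq_cst) && \
    defined(__opencl_c_atomic_scope_device)
__kernel void publish(volatile __global atomic_int *flag) {
  atomic_store_explicit(flag, 1, memory_order_seq_cst,
                        memory_scope_device);
}
#endif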
@ -445,8 +482,113 @@ typedef struct {
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
* Reinterprets a data type as another data type of the same size
*/
#define as_char(x) __builtin_astype((x), char)
#define as_char2(x) __builtin_astype((x), char2)
#define as_char3(x) __builtin_astype((x), char3)
#define as_char4(x) __builtin_astype((x), char4)
#define as_char8(x) __builtin_astype((x), char8)
#define as_char16(x) __builtin_astype((x), char16)
#define as_uchar(x) __builtin_astype((x), uchar)
#define as_uchar2(x) __builtin_astype((x), uchar2)
#define as_uchar3(x) __builtin_astype((x), uchar3)
#define as_uchar4(x) __builtin_astype((x), uchar4)
#define as_uchar8(x) __builtin_astype((x), uchar8)
#define as_uchar16(x) __builtin_astype((x), uchar16)
#define as_short(x) __builtin_astype((x), short)
#define as_short2(x) __builtin_astype((x), short2)
#define as_short3(x) __builtin_astype((x), short3)
#define as_short4(x) __builtin_astype((x), short4)
#define as_short8(x) __builtin_astype((x), short8)
#define as_short16(x) __builtin_astype((x), short16)
#define as_ushort(x) __builtin_astype((x), ushort)
#define as_ushort2(x) __builtin_astype((x), ushort2)
#define as_ushort3(x) __builtin_astype((x), ushort3)
#define as_ushort4(x) __builtin_astype((x), ushort4)
#define as_ushort8(x) __builtin_astype((x), ushort8)
#define as_ushort16(x) __builtin_astype((x), ushort16)
#define as_int(x) __builtin_astype((x), int)
#define as_int2(x) __builtin_astype((x), int2)
#define as_int3(x) __builtin_astype((x), int3)
#define as_int4(x) __builtin_astype((x), int4)
#define as_int8(x) __builtin_astype((x), int8)
#define as_int16(x) __builtin_astype((x), int16)
#define as_uint(x) __builtin_astype((x), uint)
#define as_uint2(x) __builtin_astype((x), uint2)
#define as_uint3(x) __builtin_astype((x), uint3)
#define as_uint4(x) __builtin_astype((x), uint4)
#define as_uint8(x) __builtin_astype((x), uint8)
#define as_uint16(x) __builtin_astype((x), uint16)
#define as_long(x) __builtin_astype((x), long)
#define as_long2(x) __builtin_astype((x), long2)
#define as_long3(x) __builtin_astype((x), long3)
#define as_long4(x) __builtin_astype((x), long4)
#define as_long8(x) __builtin_astype((x), long8)
#define as_long16(x) __builtin_astype((x), long16)
#define as_ulong(x) __builtin_astype((x), ulong)
#define as_ulong2(x) __builtin_astype((x), ulong2)
#define as_ulong3(x) __builtin_astype((x), ulong3)
#define as_ulong4(x) __builtin_astype((x), ulong4)
#define as_ulong8(x) __builtin_astype((x), ulong8)
#define as_ulong16(x) __builtin_astype((x), ulong16)
#define as_float(x) __builtin_astype((x), float)
#define as_float2(x) __builtin_astype((x), float2)
#define as_float3(x) __builtin_astype((x), float3)
#define as_float4(x) __builtin_astype((x), float4)
#define as_float8(x) __builtin_astype((x), float8)
#define as_float16(x) __builtin_astype((x), float16)
#ifdef cl_khr_fp64
#define as_double(x) __builtin_astype((x), double)
#define as_double2(x) __builtin_astype((x), double2)
#define as_double3(x) __builtin_astype((x), double3)
#define as_double4(x) __builtin_astype((x), double4)
#define as_double8(x) __builtin_astype((x), double8)
#define as_double16(x) __builtin_astype((x), double16)
#endif // cl_khr_fp64
#ifdef cl_khr_fp16
#define as_half(x) __builtin_astype((x), half)
#define as_half2(x) __builtin_astype((x), half2)
#define as_half3(x) __builtin_astype((x), half3)
#define as_half4(x) __builtin_astype((x), half4)
#define as_half8(x) __builtin_astype((x), half8)
#define as_half16(x) __builtin_astype((x), half16)
#endif // cl_khr_fp16
#define as_size_t(x) __builtin_astype((x), size_t)
#define as_ptrdiff_t(x) __builtin_astype((x), ptrdiff_t)
#define as_intptr_t(x) __builtin_astype((x), intptr_t)
#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
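// The as_ macros above reinterpret bits at identical size; they are not
// value conversions. A small hedged sketch (kernel name illustrative):
__kernel void sign_bit(__global uint *out) {
  float f = -0.0f;
  out[0] = as_uint(f);      // 0x80000000u: same 32 bits, new type
  float4 v = (float4)(1.0f);
  uint4 u = as_uint4(v);    // element size and count must match
  out[1] = u.x;             // 0x3F800000u
}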
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
#define __kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
#define kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
#endif
#ifdef cl_intel_device_side_avc_motion_estimation
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
@ -580,7 +722,6 @@ typedef struct {
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
#endif // cl_intel_device_side_avc_motion_estimation
// Disable any extensions we may have enabled previously.

864
lib/include/opencl-c.h vendored

File diff suppressed because it is too large

View File

@ -39,4 +39,46 @@ extern "C" {
#pragma omp end declare variant
// Ensure we make `_ZdlPv`, a.k.a. `operator delete(void*)`, available
// without the need to `include <new>` in C++ mode.
#ifdef __cplusplus
// We require malloc/free.
#include <cstdlib>
#pragma push_macro("OPENMP_NOEXCEPT")
#if __cplusplus >= 201103L
#define OPENMP_NOEXCEPT noexcept
#else
#define OPENMP_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0)
size = 1;
return ::malloc(size);
}
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
inline void operator delete(void *ptr)OPENMP_NOEXCEPT { ::free(ptr); }
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
#endif
#pragma pop_macro("OPENMP_NOEXCEPT")
#endif
#endif
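// With these overrides visible, C++ device code can allocate inside a
// target region without including <new>. A hedged sketch (the function
// names are illustrative):
#pragma omp declare target
inline int *make_counter() { return new int(0); } // lowers to ::malloc
inline void drop(int *p) { delete p; }            // lowers to ::free
#pragma omp end declare target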

View File

@ -17,7 +17,6 @@
// We require std::math functions in the complex builtins below.
#include <cmath>
#define __CUDA__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
@ -26,9 +25,6 @@
// Grab the host header too.
#include_next <complex>
#ifdef __cplusplus
// If we are compiling against libc++, the macro _LIBCPP_STD_VER should be set
// after including <cmath> above. Since the complex header we use is a
// simplified version of the libc++, we don't need it in this case. If we
@ -48,5 +44,3 @@
#pragma omp end declare variant
#endif
#endif

View File

@ -17,7 +17,6 @@
// We require math functions in the complex builtins below.
#include <math.h>
#define __CUDA__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__

View File

@ -9,6 +9,8 @@
#ifndef __CLANG_OPENMP_WRAPPERS_NEW
#define __CLANG_OPENMP_WRAPPERS_NEW
// We need the system <new> for std::nothrow_t. The new/delete operators
// that do not use nothrow_t are provided without the <new> header.
#include_next <new>
#if defined(__NVPTX__) && defined(_OPENMP)
@ -22,48 +24,24 @@
#define OPENMP_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0)
size = 1;
return ::malloc(size);
}
inline void *operator new(__SIZE_TYPE__ size,
const std::nothrow_t &) OPENMP_NOEXCEPT {
return ::operator new(size);
}
inline void *operator new[](__SIZE_TYPE__ size) { return ::operator new(size); }
inline void *operator new[](__SIZE_TYPE__ size, const std::nothrow_t &) {
return ::operator new(size);
}
inline void operator delete(void *ptr)OPENMP_NOEXCEPT {
if (ptr)
::free(ptr);
}
inline void operator delete(void *ptr, const std::nothrow_t &)OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr,
const std::nothrow_t &) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
inline void operator delete(void *ptr, __SIZE_TYPE__ size)OPENMP_NOEXCEPT {
::operator delete(ptr);
}
inline void operator delete[](void *ptr, __SIZE_TYPE__ size) OPENMP_NOEXCEPT {
::operator delete(ptr);
}
#endif
#pragma pop_macro("OPENMP_NOEXCEPT")
#endif
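// Only the std::nothrow_t forms remain here; the plain forms now come
// from the device-functions wrapper shown earlier. A hedged sketch of a
// device-side allocation with a failure check (name illustrative):
#pragma omp declare target
inline int *try_alloc(__SIZE_TYPE__ n) {
  return new (std::nothrow) int[n]; // nullptr if device malloc fails
}
#pragma omp end declare target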

View File

@ -28,7 +28,7 @@
Most SSE scalar float intrinsic operations can be performed more
efficiently as C language float scalar operations or optimized to
use vector SIMD operations. We recommend this for new applications. */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
#ifndef _XMMINTRIN_H_INCLUDED
@ -62,14 +62,13 @@
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
typedef vector float __m128 __attribute__((__may_alias__));
/* Unaligned version of the same type. */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__,
__aligned__ (1)));
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
/* Internal data types for implementing the intrinsics. */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));
typedef vector float __v4sf;
/* Create an undefined vector. */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

123865
lib/include/riscv_vector.h vendored Normal file

File diff suppressed because it is too large

View File

@ -20,6 +20,13 @@
#ifdef __x86_64__
struct __uintr_frame
{
unsigned long long rip;
unsigned long long rflags;
unsigned long long rsp;
};
/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
/// user interrupt cannot be delivered on the instruction boundary following
/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in

View File

@ -28,13 +28,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdec_epi128(__m256i __A, __m256i __B)
{
@ -42,13 +35,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesenclast_epi128(__m256i __A, __m256i __B)
{
@ -56,13 +42,6 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
{
@ -70,13 +49,35 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS
(__v4di) __B);
}
#ifdef __AVX512FINTRIN_H
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenc_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdec_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesenclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
(__v8di) __B);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS_F
_mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
(__v8di) __B);
}
#endif // __AVX512FINTRIN_H
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F
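/* After this move the 256-bit VAES intrinsics remain available on their
   own, while the 512-bit forms additionally require the AVX-512 F header
   to have been included first. A hedged sketch of the 256-bit path
   (compile with -mvaes; the function name is illustrative): */
static __inline__ __m256i aes_round_x2(__m256i state, __m256i rk) {
  /* One AES encryption round on each 128-bit lane independently. */
  return _mm256_aesenc_epi128(state, rk);
}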

View File

@ -1016,64 +1016,84 @@ vec_scatter_element(__vector double __vec,
static inline __ATTRS_o_ai __vector signed char
vec_xl(long __offset, const signed char *__ptr) {
return *(const __vector signed char *)
((const char *)__ptr + __offset);
__vector signed char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed char));
return V;
}
static inline __ATTRS_o_ai __vector unsigned char
vec_xl(long __offset, const unsigned char *__ptr) {
return *(const __vector unsigned char *)
((const char *)__ptr + __offset);
__vector unsigned char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned char));
return V;
}
static inline __ATTRS_o_ai __vector signed short
vec_xl(long __offset, const signed short *__ptr) {
return *(const __vector signed short *)
((const char *)__ptr + __offset);
__vector signed short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed short));
return V;
}
static inline __ATTRS_o_ai __vector unsigned short
vec_xl(long __offset, const unsigned short *__ptr) {
return *(const __vector unsigned short *)
((const char *)__ptr + __offset);
__vector unsigned short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned short));
return V;
}
static inline __ATTRS_o_ai __vector signed int
vec_xl(long __offset, const signed int *__ptr) {
return *(const __vector signed int *)
((const char *)__ptr + __offset);
__vector signed int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed int));
return V;
}
static inline __ATTRS_o_ai __vector unsigned int
vec_xl(long __offset, const unsigned int *__ptr) {
return *(const __vector unsigned int *)
((const char *)__ptr + __offset);
__vector unsigned int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned int));
return V;
}
static inline __ATTRS_o_ai __vector signed long long
vec_xl(long __offset, const signed long long *__ptr) {
return *(const __vector signed long long *)
((const char *)__ptr + __offset);
__vector signed long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed long long));
return V;
}
static inline __ATTRS_o_ai __vector unsigned long long
vec_xl(long __offset, const unsigned long long *__ptr) {
return *(const __vector unsigned long long *)
((const char *)__ptr + __offset);
__vector unsigned long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned long long));
return V;
}
#if __ARCH__ >= 12
static inline __ATTRS_o_ai __vector float
vec_xl(long __offset, const float *__ptr) {
return *(const __vector float *)
((const char *)__ptr + __offset);
__vector float V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector float));
return V;
}
#endif
static inline __ATTRS_o_ai __vector double
vec_xl(long __offset, const double *__ptr) {
return *(const __vector double *)
((const char *)__ptr + __offset);
__vector double V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector double));
return V;
}
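/* The __builtin_memcpy bodies matter because dereferencing a
   (const __vector T *) would let the compiler assume full vector
   alignment, while vec_xl/vec_xst accept arbitrary byte offsets. A
   hedged sketch of an unaligned round trip (name illustrative): */
static inline void double_bytes(unsigned char *buf) {
  __vector unsigned char v = vec_xl(1, buf); /* buf+1 may be unaligned */
  v = v + v;
  vec_xst(v, 1, buf);
}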
/*-- vec_xld2 ---------------------------------------------------------------*/
@ -1081,64 +1101,82 @@ vec_xl(long __offset, const double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed char
vec_xld2(long __offset, const signed char *__ptr) {
return *(const __vector signed char *)
((const char *)__ptr + __offset);
__vector signed char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned char
vec_xld2(long __offset, const unsigned char *__ptr) {
return *(const __vector unsigned char *)
((const char *)__ptr + __offset);
__vector unsigned char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed short
vec_xld2(long __offset, const signed short *__ptr) {
return *(const __vector signed short *)
((const char *)__ptr + __offset);
__vector signed short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned short
vec_xld2(long __offset, const unsigned short *__ptr) {
return *(const __vector unsigned short *)
((const char *)__ptr + __offset);
__vector unsigned short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed int
vec_xld2(long __offset, const signed int *__ptr) {
return *(const __vector signed int *)
((const char *)__ptr + __offset);
__vector signed int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed int));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned int
vec_xld2(long __offset, const unsigned int *__ptr) {
return *(const __vector unsigned int *)
((const char *)__ptr + __offset);
__vector unsigned int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned int));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed long long
vec_xld2(long __offset, const signed long long *__ptr) {
return *(const __vector signed long long *)
((const char *)__ptr + __offset);
__vector signed long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed long long));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned long long
vec_xld2(long __offset, const unsigned long long *__ptr) {
return *(const __vector unsigned long long *)
((const char *)__ptr + __offset);
__vector unsigned long long V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned long long));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector double
vec_xld2(long __offset, const double *__ptr) {
return *(const __vector double *)
((const char *)__ptr + __offset);
__vector double V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector double));
return V;
}
/*-- vec_xlw4 ---------------------------------------------------------------*/
@ -1146,99 +1184,128 @@ vec_xld2(long __offset, const double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed char
vec_xlw4(long __offset, const signed char *__ptr) {
return *(const __vector signed char *)
((const char *)__ptr + __offset);
__vector signed char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned char
vec_xlw4(long __offset, const unsigned char *__ptr) {
return *(const __vector unsigned char *)
((const char *)__ptr + __offset);
__vector unsigned char V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned char));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed short
vec_xlw4(long __offset, const signed short *__ptr) {
return *(const __vector signed short *)
((const char *)__ptr + __offset);
__vector signed short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned short
vec_xlw4(long __offset, const unsigned short *__ptr) {
return *(const __vector unsigned short *)
((const char *)__ptr + __offset);
__vector unsigned short V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned short));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector signed int
vec_xlw4(long __offset, const signed int *__ptr) {
return *(const __vector signed int *)
((const char *)__ptr + __offset);
__vector signed int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector signed int));
return V;
}
// This prototype is deprecated.
static inline __ATTRS_o_ai __vector unsigned int
vec_xlw4(long __offset, const unsigned int *__ptr) {
return *(const __vector unsigned int *)
((const char *)__ptr + __offset);
__vector unsigned int V;
__builtin_memcpy(&V, ((const char *)__ptr + __offset),
sizeof(__vector unsigned int));
return V;
}
/*-- vec_xst ----------------------------------------------------------------*/
static inline __ATTRS_o_ai void
vec_xst(__vector signed char __vec, long __offset, signed char *__ptr) {
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
__vector signed char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed char));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
__vector unsigned char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned char));
}
static inline __ATTRS_o_ai void
vec_xst(__vector signed short __vec, long __offset, signed short *__ptr) {
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
__vector signed short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed short));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
__vector unsigned short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned short));
}
static inline __ATTRS_o_ai void
vec_xst(__vector signed int __vec, long __offset, signed int *__ptr) {
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
__vector signed int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
__vector unsigned int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned int));
}
static inline __ATTRS_o_ai void
vec_xst(__vector signed long long __vec, long __offset,
signed long long *__ptr) {
*(__vector signed long long *)((char *)__ptr + __offset) = __vec;
signed long long *__ptr) {
__vector signed long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed long long));
}
static inline __ATTRS_o_ai void
vec_xst(__vector unsigned long long __vec, long __offset,
unsigned long long *__ptr) {
*(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
unsigned long long *__ptr) {
__vector unsigned long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned long long));
}
#if __ARCH__ >= 12
static inline __ATTRS_o_ai void
vec_xst(__vector float __vec, long __offset, float *__ptr) {
*(__vector float *)((char *)__ptr + __offset) = __vec;
__vector float V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector float));
}
#endif
static inline __ATTRS_o_ai void
vec_xst(__vector double __vec, long __offset, double *__ptr) {
*(__vector double *)((char *)__ptr + __offset) = __vec;
__vector double V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
}
/*-- vec_xstd2 --------------------------------------------------------------*/
@ -1246,57 +1313,73 @@ vec_xst(__vector double __vec, long __offset, double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed char __vec, long __offset, signed char *__ptr) {
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
__vector signed char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
__vector unsigned char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed short __vec, long __offset, signed short *__ptr) {
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
__vector signed short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
__vector unsigned short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed int __vec, long __offset, signed int *__ptr) {
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
__vector signed int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
__vector unsigned int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned int));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector signed long long __vec, long __offset,
signed long long *__ptr) {
*(__vector signed long long *)((char *)__ptr + __offset) = __vec;
__vector signed long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed long long));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector unsigned long long __vec, long __offset,
unsigned long long *__ptr) {
*(__vector unsigned long long *)((char *)__ptr + __offset) = __vec;
__vector unsigned long long V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned long long));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
*(__vector double *)((char *)__ptr + __offset) = __vec;
__vector double V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector double));
}
/*-- vec_xstw4 --------------------------------------------------------------*/
@ -1304,37 +1387,48 @@ vec_xstd2(__vector double __vec, long __offset, double *__ptr) {
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector signed char __vec, long __offset, signed char *__ptr) {
*(__vector signed char *)((char *)__ptr + __offset) = __vec;
__vector signed char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector unsigned char __vec, long __offset, unsigned char *__ptr) {
*(__vector unsigned char *)((char *)__ptr + __offset) = __vec;
__vector unsigned char V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned char));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector signed short __vec, long __offset, signed short *__ptr) {
*(__vector signed short *)((char *)__ptr + __offset) = __vec;
__vector signed short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector signed short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector unsigned short __vec, long __offset, unsigned short *__ptr) {
*(__vector unsigned short *)((char *)__ptr + __offset) = __vec;
__vector unsigned short V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned short));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector signed int __vec, long __offset, signed int *__ptr) {
*(__vector signed int *)((char *)__ptr + __offset) = __vec;
__vector signed int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V, sizeof(__vector signed int));
}
// This prototype is deprecated.
static inline __ATTRS_o_ai void
vec_xstw4(__vector unsigned int __vec, long __offset, unsigned int *__ptr) {
*(__vector unsigned int *)((char *)__ptr + __offset) = __vec;
__vector unsigned int V = __vec;
__builtin_memcpy(((char *)__ptr + __offset), &V,
sizeof(__vector unsigned int));
}
/*-- vec_load_bndry ---------------------------------------------------------*/
@ -9259,6 +9353,41 @@ vec_fp_test_data_class(__vector double __a, int __b, int *__c)
__VEC_CLASS_FP_ZERO | \
__VEC_CLASS_FP_INFINITY)
/*-- vec_extend_to_fp32_hi --------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_extend_to_fp32_hi(X, W) \
((__vector float)__builtin_s390_vclfnhs((X), (W)))
#endif
/*-- vec_extend_to_fp32_lo --------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_extend_to_fp32_lo(X, W) \
((__vector float)__builtin_s390_vclfnls((X), (W)))
#endif
/*-- vec_round_from_fp32 ----------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_round_from_fp32(X, Y, W) \
((__vector unsigned short)__builtin_s390_vcrnfs((X), (Y), (W)))
#endif
/*-- vec_convert_to_fp16 ----------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_convert_to_fp16(X, W) \
((__vector unsigned short)__builtin_s390_vcfn((X), (W)))
#endif
/*-- vec_convert_from_fp16 --------------------------------------------------*/
#if __ARCH__ >= 14
#define vec_convert_from_fp16(X, W) \
((__vector unsigned short)__builtin_s390_vcnf((X), (W)))
#endif
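/* A hedged sketch of the arch14 helpers above, widening the high halves
   of eight 16-bit NNP values to four floats; the second argument selects
   the 16-bit data class, and the 0 here is purely illustrative: */
#if __ARCH__ >= 14
static inline __vector float widen_hi(__vector unsigned short x) {
  return vec_extend_to_fp32_hi(x, 0);
}
#endif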
/*-- vec_cp_until_zero ------------------------------------------------------*/
static inline __ATTRS_o_ai __vector signed char

View File

@ -19,10 +19,12 @@
(__v4di)(__m256i)(B), \
(char)(I))
#ifdef __AVX512FINTRIN_H
#define _mm512_clmulepi64_epi128(A, B, I) \
(__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
(__v8di)(__m512i)(B), \
(char)(I))
#endif // __AVX512FINTRIN_H
#endif /* __VPCLMULQDQINTRIN_H */
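/* As in vaesintrin.h, only the 512-bit form is gated; the 256-bit
   carry-less multiply stays available with -mvpclmulqdq. A hedged
   sketch (function name illustrative): */
static __inline__ __m256i clmul_lo_lanes(__m256i a, __m256i b) {
  /* I = 0x00: low qword of each 128-bit lane of A times the low qword
     of the corresponding lane of B. */
  return _mm256_clmulepi64_epi128(a, b, 0x00);
}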

File diff suppressed because it is too large