zig cc: Remove headers related to GPU offload.

Alex Rønne Petersen 2024-12-16 04:36:57 +01:00
parent 5f34224b2b
commit 181330bbd4
24 changed files with 0 additions and 27878 deletions


@@ -1,121 +0,0 @@
/*===---- cuda_builtin_vars.h - CUDA built-in variables ---------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CUDA_BUILTIN_VARS_H
#define __CUDA_BUILTIN_VARS_H
// Forward declares from vector_types.h.
struct uint3;
struct dim3;
// The file implements built-in CUDA variables using __declspec(property).
// https://msdn.microsoft.com/en-us/library/yhfk0thd.aspx
// All read accesses of built-in variable fields get converted into calls to a
// getter function which in turn calls the appropriate builtin to fetch the
// value.
//
// Example:
// int x = threadIdx.x;
// IR output:
// %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() #3
// PTX output:
// mov.u32 %r2, %tid.x;
#define __CUDA_DEVICE_BUILTIN(FIELD, INTRINSIC) \
__declspec(property(get = __fetch_builtin_##FIELD)) unsigned int FIELD; \
static inline __attribute__((always_inline)) \
__attribute__((device)) unsigned int __fetch_builtin_##FIELD(void) { \
return INTRINSIC; \
}
#if __cplusplus >= 201103L
#define __DELETE =delete
#else
#define __DELETE
#endif
// Make sure nobody can create instances of the special variable types. nvcc
// also disallows taking the address of special variables, so we disable the
// address-of operator as well.
#define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \
__attribute__((device)) TypeName() __DELETE; \
__attribute__((device)) TypeName(const TypeName &) __DELETE; \
__attribute__((device)) void operator=(const TypeName &) const __DELETE; \
__attribute__((device)) TypeName *operator&() const __DELETE
struct __cuda_builtin_threadIdx_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_tid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_tid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_tid_z());
// threadIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). These conversion operators are defined after we pull in
// vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_threadIdx_t);
};
struct __cuda_builtin_blockIdx_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ctaid_z());
// blockIdx should be convertible to uint3 (in fact in nvcc, it *is* a
// uint3). These conversion operators are defined after we pull in
// vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockIdx_t);
};
struct __cuda_builtin_blockDim_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_ntid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_ntid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_ntid_z());
// blockDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). These conversion operators are defined after we pull in
// vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_blockDim_t);
};
struct __cuda_builtin_gridDim_t {
__CUDA_DEVICE_BUILTIN(x,__nvvm_read_ptx_sreg_nctaid_x());
__CUDA_DEVICE_BUILTIN(y,__nvvm_read_ptx_sreg_nctaid_y());
__CUDA_DEVICE_BUILTIN(z,__nvvm_read_ptx_sreg_nctaid_z());
// gridDim should be convertible to dim3 (in fact in nvcc, it *is* a
// dim3). These conversion operators are defined after we pull in
// vector_types.h.
__attribute__((device)) operator dim3() const;
__attribute__((device)) operator uint3() const;
private:
__CUDA_DISALLOW_BUILTINVAR_ACCESS(__cuda_builtin_gridDim_t);
};
#define __CUDA_BUILTIN_VAR \
extern const __attribute__((device)) __attribute__((weak))
__CUDA_BUILTIN_VAR __cuda_builtin_threadIdx_t threadIdx;
__CUDA_BUILTIN_VAR __cuda_builtin_blockIdx_t blockIdx;
__CUDA_BUILTIN_VAR __cuda_builtin_blockDim_t blockDim;
__CUDA_BUILTIN_VAR __cuda_builtin_gridDim_t gridDim;
// warpSize should translate to read of %WARP_SZ but there's currently no
// builtin to do so. According to PTX v4.2 docs 'to date, all target
// architectures have a WARP_SZ value of 32'.
__attribute__((device)) const int warpSize = 32;
#undef __CUDA_DEVICE_BUILTIN
#undef __CUDA_BUILTIN_VAR
#undef __CUDA_DISALLOW_BUILTINVAR_ACCESS
#undef __DELETE
#endif /* __CUDA_BUILTIN_VARS_H */
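For context, a minimal kernel sketch showing how these built-in variables are used from device code (hypothetical user code, not part of the header):

__global__ void scale(float *data, float factor, int n) {
  // Reads of threadIdx.x, blockIdx.x and blockDim.x go through the
  // __declspec(property) getters defined above, which lower to the
  // corresponding llvm.nvvm.read.ptx.sreg.* intrinsics.
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] *= factor;
}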


@@ -1,512 +0,0 @@
/*===---- __clang_cuda_cmath.h - Device-side CUDA cmath support ------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_CMATH_H__
#define __CLANG_CUDA_CMATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
#ifndef __OPENMP_NVPTX__
#include <limits>
#endif
// CUDA lets us use various std math functions on the device side. This file
// works in concert with __clang_cuda_math_forward_declares.h to make this work.
//
// Specifically, the forward-declares header declares __device__ overloads for
// these functions in the global namespace, then pulls them into namespace std
// with 'using' statements. Then this file implements those functions, after
// their implementations have been pulled in.
//
// It's important that we declare the functions in the global namespace and pull
// them into namespace std with using statements, as opposed to simply declaring
// these functions in namespace std, because our device functions need to
// overload the standard library functions, which may be declared in the global
// namespace or in std, depending on the degree of conformance of the stdlib
// implementation. Declaring in the global namespace and pulling into namespace
// std covers all of the known knowns.
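// Schematically, the two headers cooperate like this (a simplified sketch;
// the real declarations carry device and inlining attributes):
//   __device__ float sin(float);   // declared at global scope, then...
//   namespace std { using ::sin; } // ...pulled into std
// This header later defines that overload, so std::sin(x) in device code
// resolves to it whether the host stdlib declares sin in :: or in std.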
#ifdef __OPENMP_NVPTX__
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ __inline__ __attribute__((always_inline))
#endif
__DEVICE__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ float acos(float __x) { return ::acosf(__x); }
__DEVICE__ float asin(float __x) { return ::asinf(__x); }
__DEVICE__ float atan(float __x) { return ::atanf(__x); }
__DEVICE__ float atan2(float __x, float __y) { return ::atan2f(__x, __y); }
__DEVICE__ float ceil(float __x) { return ::ceilf(__x); }
__DEVICE__ float cos(float __x) { return ::cosf(__x); }
__DEVICE__ float cosh(float __x) { return ::coshf(__x); }
__DEVICE__ float exp(float __x) { return ::expf(__x); }
__DEVICE__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ float floor(float __x) { return ::floorf(__x); }
__DEVICE__ float fmod(float __x, float __y) { return ::fmodf(__x, __y); }
__DEVICE__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
// For inscrutable reasons, the CUDA headers define these functions for us on
// Windows.
#if !defined(_MSC_VER) || defined(__OPENMP_NVPTX__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#if defined(__OPENMP_NVPTX__)
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
__DEVICE__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ int isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ int isnan(double __x) { return ::__isnan(__x); }
#pragma omp end declare variant
#endif
__DEVICE__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ bool isfinite(float __x) { return ::__finitef(__x); }
// For inscrutable reasons, __finite(), the double-precision version of
// __finitef, does not exist when compiling for MacOS. __isfinited is available
// everywhere and is just as good.
__DEVICE__ bool isfinite(double __x) { return ::__isfinited(__x); }
__DEVICE__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ bool isnan(double __x) { return ::__isnan(__x); }
#if defined(__OPENMP_NVPTX__)
#pragma omp end declare variant
#endif
#endif
__DEVICE__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ bool isnormal(float __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isnormal(double __x) { return __builtin_isnormal(__x); }
__DEVICE__ bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ float ldexp(float __arg, int __exp) {
return ::ldexpf(__arg, __exp);
}
__DEVICE__ float log(float __x) { return ::logf(__x); }
__DEVICE__ float log10(float __x) { return ::log10f(__x); }
__DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); }
__DEVICE__ float pow(float __base, float __exp) {
return ::powf(__base, __exp);
}
__DEVICE__ float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
__DEVICE__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ bool signbit(double __x) { return ::__signbitd(__x); }
__DEVICE__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ float sinh(float __x) { return ::sinhf(__x); }
__DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); }
__DEVICE__ float tan(float __x) { return ::tanf(__x); }
__DEVICE__ float tanh(float __x) { return ::tanhf(__x); }
// There was a redefinition error for this overload in CUDA mode. We restrict
// it to OpenMP mode for now, as that is where it is actually needed anyway.
#ifdef __OPENMP_NVPTX__
__DEVICE__ float remquo(float __n, float __d, int *__q) {
return ::remquof(__n, __d, __q);
}
#endif
// Notably missing above is nexttoward. We omit it because
// libdevice doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
#ifndef __OPENMP_NVPTX__
// Now we've defined everything we promised we'd define in
// __clang_cuda_math_forward_declares.h. We need to do two additional things to
// fix up our math functions.
//
// 1) Define __device__ overloads for e.g. sin(int). The CUDA headers define
// only sin(float) and sin(double), which means that e.g. sin(0) is
// ambiguous.
//
// 2) Pull the __device__ overloads of "foobarf" math functions into namespace
// std. These are defined in the CUDA headers in the global namespace,
// independent of everything else we've done here.
// We can't use std::enable_if, because we want to be pre-C++11 compatible. But
// we go ahead and unconditionally define functions that are only available when
// compiling for C++11 to match the behavior of the CUDA headers.
template<bool __B, class __T = void>
struct __clang_cuda_enable_if {};
template <class __T> struct __clang_cuda_enable_if<true, __T> {
typedef __T type;
};
// Defines an overload of __fn that accepts one integral argument, calls
// __fn((double)x), and returns __retty.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(__retty, __fn) \
template <typename __T> \
__DEVICE__ \
typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, \
__retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
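// For instance, __CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin) (used below)
// expands to roughly:
//   template <typename __T>
//   __DEVICE__ typename __clang_cuda_enable_if<
//       std::numeric_limits<__T>::is_integer, double>::type
//   sin(__T __x) { return ::sin((double)__x); }
// so a call like sin(0) unambiguously selects this overload and computes in
// double precision.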
// Defines an overload of __fn that accepts two arithmetic arguments, calls
// __fn((double)x, (double)y), and returns __retty.
//
// Note this is different from OVERLOAD_1, which generates an overload that
// accepts only *integral* arguments.
#define __CUDA_CLANG_FN_INTEGER_OVERLOAD_2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ typename __clang_cuda_enable_if< \
std::numeric_limits<__T1>::is_specialized && \
std::numeric_limits<__T2>::is_specialized, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, acosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, asinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, atan2);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, atanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cbrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, ceil)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, copysign);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cos)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, cosh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erf)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, erfc)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, exp2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, expm1)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, fabs)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fdim);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, floor)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmax);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmin);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, fmod);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, fpclassify)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, hypot);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(int, ilogb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isfinite)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isgreaterequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isinf);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isless);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessequal);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, islessgreater);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnan);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, isnormal)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(bool, isunordered);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, lgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log10)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log1p)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, log2)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, logb)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long long, llround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lrint)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(long, lround)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, nearbyint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, nextafter);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, pow);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_2(double, remainder);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, rint);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, round);
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(bool, signbit)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sin)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sinh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, sqrt)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tan)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tanh)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, tgamma)
__CUDA_CLANG_FN_INTEGER_OVERLOAD_1(double, trunc);
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_1
#undef __CUDA_CLANG_FN_INTEGER_OVERLOAD_2
// Overloads for functions that don't match the patterns expected by
// __CUDA_CLANG_FN_INTEGER_OVERLOAD_{1,2}.
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized &&
std::numeric_limits<__T3>::is_specialized,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return std::fma((double)__x, (double)__y, (double)__z);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
frexp(__T __x, int *__exp) {
return std::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
ldexp(__T __x, int __exp) {
return std::ldexp((double)__x, __exp);
}
template <typename __T1, typename __T2>
__DEVICE__ typename __clang_cuda_enable_if<
std::numeric_limits<__T1>::is_specialized &&
std::numeric_limits<__T2>::is_specialized,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return std::remquo((double)__x, (double)__y, __quo);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbln(__T __x, long __exp) {
return std::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer,
double>::type
scalbn(__T __x, int __exp) {
return std::scalbn((double)__x, __exp);
}
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
// Pull the new overloads we defined above into namespace std.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
// Well this is fun: We need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are different
// than its std::isinf and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif
// Finally, pull the "foobarf" functions that CUDA defines in its headers into
// namespace std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#endif // __OPENMP_NVPTX__
#undef __DEVICE__
#endif


@@ -1,285 +0,0 @@
/*===-- __clang_cuda_complex_builtins - CUDA impls of runtime complex fns ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_COMPLEX_BUILTINS
#define __CLANG_CUDA_COMPLEX_BUILTINS
// This header defines __muldc3, __mulsc3, __divdc3, and __divsc3. These are
// libgcc functions that clang assumes are available when compiling C99 complex
// operations. (These implementations come from libc++, and have been modified
// to work with CUDA and OpenMP target offloading [in C and C++ mode].)
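// As an illustrative reminder of when these are needed: for C99 _Complex
// arithmetic, clang lowers a full complex multiply such as
//   float _Complex __z = __x * __y;
// to a library call of the form
//   __mulsc3(__real__(__x), __imag__(__x), __real__(__y), __imag__(__y));
// (simplified; the exact lowering is up to the compiler).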
#pragma push_macro("__DEVICE__")
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp declare target
#define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
#else
#define __DEVICE__ __device__ inline
#endif
// To make the algorithms available for C and C++ in CUDA and OpenMP we select
// different but equivalent function versions. TODO: For OpenMP we currently
// select the native builtins as the overload support for templates is lacking.
#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
#define _ISNANd std::isnan
#define _ISNANf std::isnan
#define _ISINFd std::isinf
#define _ISINFf std::isinf
#define _ISFINITEd std::isfinite
#define _ISFINITEf std::isfinite
#define _COPYSIGNd std::copysign
#define _COPYSIGNf std::copysign
#define _SCALBNd std::scalbn
#define _SCALBNf std::scalbn
#define _ABSd std::abs
#define _ABSf std::abs
#define _LOGBd std::logb
#define _LOGBf std::logb
// Rather than pulling in std::max from <algorithm> every time, use the
// available ::max.
#define _fmaxd max
#define _fmaxf max
#else
#ifdef __AMDGCN__
#define _ISNANd __ocml_isnan_f64
#define _ISNANf __ocml_isnan_f32
#define _ISINFd __ocml_isinf_f64
#define _ISINFf __ocml_isinf_f32
#define _ISFINITEd __ocml_isfinite_f64
#define _ISFINITEf __ocml_isfinite_f32
#define _COPYSIGNd __ocml_copysign_f64
#define _COPYSIGNf __ocml_copysign_f32
#define _SCALBNd __ocml_scalbn_f64
#define _SCALBNf __ocml_scalbn_f32
#define _ABSd __ocml_fabs_f64
#define _ABSf __ocml_fabs_f32
#define _LOGBd __ocml_logb_f64
#define _LOGBf __ocml_logb_f32
#define _fmaxd __ocml_fmax_f64
#define _fmaxf __ocml_fmax_f32
#else
#define _ISNANd __nv_isnand
#define _ISNANf __nv_isnanf
#define _ISINFd __nv_isinfd
#define _ISINFf __nv_isinff
#define _ISFINITEd __nv_isfinited
#define _ISFINITEf __nv_finitef
#define _COPYSIGNd __nv_copysign
#define _COPYSIGNf __nv_copysignf
#define _SCALBNd __nv_scalbn
#define _SCALBNf __nv_scalbnf
#define _ABSd __nv_fabs
#define _ABSf __nv_fabsf
#define _LOGBd __nv_logb
#define _LOGBf __nv_logbf
#define _fmaxd __nv_fmax
#define _fmaxf __nv_fmaxf
#endif
#endif
#if defined(__cplusplus)
extern "C" {
#endif
__DEVICE__ double _Complex __muldc3(double __a, double __b, double __c,
double __d) {
double __ac = __a * __c;
double __bd = __b * __d;
double __ad = __a * __d;
double __bc = __b * __c;
double _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
int __recalc = 0;
if (_ISINFd(__a) || _ISINFd(__b)) {
__a = _COPYSIGNd(_ISINFd(__a) ? 1 : 0, __a);
__b = _COPYSIGNd(_ISINFd(__b) ? 1 : 0, __b);
if (_ISNANd(__c))
__c = _COPYSIGNd(0, __c);
if (_ISNANd(__d))
__d = _COPYSIGNd(0, __d);
__recalc = 1;
}
if (_ISINFd(__c) || _ISINFd(__d)) {
__c = _COPYSIGNd(_ISINFd(__c) ? 1 : 0, __c);
__d = _COPYSIGNd(_ISINFd(__d) ? 1 : 0, __d);
if (_ISNANd(__a))
__a = _COPYSIGNd(0, __a);
if (_ISNANd(__b))
__b = _COPYSIGNd(0, __b);
__recalc = 1;
}
if (!__recalc &&
(_ISINFd(__ac) || _ISINFd(__bd) || _ISINFd(__ad) || _ISINFd(__bc))) {
if (_ISNANd(__a))
__a = _COPYSIGNd(0, __a);
if (_ISNANd(__b))
__b = _COPYSIGNd(0, __b);
if (_ISNANd(__c))
__c = _COPYSIGNd(0, __c);
if (_ISNANd(__d))
__d = _COPYSIGNd(0, __d);
__recalc = 1;
}
if (__recalc) {
// Can't use std::numeric_limits<double>::infinity() -- that doesn't have
// a device overload (and isn't constexpr before C++11, naturally).
__real__(z) = __builtin_huge_val() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_val() * (__a * __d + __b * __c);
}
}
return z;
}
__DEVICE__ float _Complex __mulsc3(float __a, float __b, float __c, float __d) {
float __ac = __a * __c;
float __bd = __b * __d;
float __ad = __a * __d;
float __bc = __b * __c;
float _Complex z;
__real__(z) = __ac - __bd;
__imag__(z) = __ad + __bc;
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
int __recalc = 0;
if (_ISINFf(__a) || _ISINFf(__b)) {
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
if (_ISNANf(__c))
__c = _COPYSIGNf(0, __c);
if (_ISNANf(__d))
__d = _COPYSIGNf(0, __d);
__recalc = 1;
}
if (_ISINFf(__c) || _ISINFf(__d)) {
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
if (_ISNANf(__a))
__a = _COPYSIGNf(0, __a);
if (_ISNANf(__b))
__b = _COPYSIGNf(0, __b);
__recalc = 1;
}
if (!__recalc &&
(_ISINFf(__ac) || _ISINFf(__bd) || _ISINFf(__ad) || _ISINFf(__bc))) {
if (_ISNANf(__a))
__a = _COPYSIGNf(0, __a);
if (_ISNANf(__b))
__b = _COPYSIGNf(0, __b);
if (_ISNANf(__c))
__c = _COPYSIGNf(0, __c);
if (_ISNANf(__d))
__d = _COPYSIGNf(0, __d);
__recalc = 1;
}
if (__recalc) {
__real__(z) = __builtin_huge_valf() * (__a * __c - __b * __d);
__imag__(z) = __builtin_huge_valf() * (__a * __d + __b * __c);
}
}
return z;
}
__DEVICE__ double _Complex __divdc3(double __a, double __b, double __c,
double __d) {
int __ilogbw = 0;
// Can't use std::max, because that's defined in <algorithm>, and we don't
// want to pull that in for every compile. The CUDA headers define
// ::max(float, float) and ::max(double, double), which is sufficient for us.
double __logbw = _LOGBd(_fmaxd(_ABSd(__c), _ABSd(__d)));
if (_ISFINITEd(__logbw)) {
__ilogbw = (int)__logbw;
__c = _SCALBNd(__c, -__ilogbw);
__d = _SCALBNd(__d, -__ilogbw);
}
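// Scaling __c and __d by 2^-__ilogbw above normalizes the larger component
// toward 1 so that the denominator __c * __c + __d * __d below cannot
// prematurely overflow or underflow (a scaled complex division in the
// spirit of Smith's algorithm).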
double __denom = __c * __c + __d * __d;
double _Complex z;
__real__(z) = _SCALBNd((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = _SCALBNd((__b * __c - __a * __d) / __denom, -__ilogbw);
if (_ISNANd(__real__(z)) && _ISNANd(__imag__(z))) {
if ((__denom == 0.0) && (!_ISNANd(__a) || !_ISNANd(__b))) {
__real__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __a;
__imag__(z) = _COPYSIGNd(__builtin_huge_val(), __c) * __b;
} else if ((_ISINFd(__a) || _ISINFd(__b)) && _ISFINITEd(__c) &&
_ISFINITEd(__d)) {
__a = _COPYSIGNd(_ISINFd(__a) ? 1.0 : 0.0, __a);
__b = _COPYSIGNd(_ISINFd(__b) ? 1.0 : 0.0, __b);
__real__(z) = __builtin_huge_val() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_val() * (__b * __c - __a * __d);
} else if (_ISINFd(__logbw) && __logbw > 0.0 && _ISFINITEd(__a) &&
_ISFINITEd(__b)) {
__c = _COPYSIGNd(_ISINFd(__c) ? 1.0 : 0.0, __c);
__d = _COPYSIGNd(_ISINFd(__d) ? 1.0 : 0.0, __d);
__real__(z) = 0.0 * (__a * __c + __b * __d);
__imag__(z) = 0.0 * (__b * __c - __a * __d);
}
}
return z;
}
__DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
int __ilogbw = 0;
float __logbw = _LOGBf(_fmaxf(_ABSf(__c), _ABSf(__d)));
if (_ISFINITEf(__logbw)) {
__ilogbw = (int)__logbw;
__c = _SCALBNf(__c, -__ilogbw);
__d = _SCALBNf(__d, -__ilogbw);
}
float __denom = __c * __c + __d * __d;
float _Complex z;
__real__(z) = _SCALBNf((__a * __c + __b * __d) / __denom, -__ilogbw);
__imag__(z) = _SCALBNf((__b * __c - __a * __d) / __denom, -__ilogbw);
if (_ISNANf(__real__(z)) && _ISNANf(__imag__(z))) {
if ((__denom == 0) && (!_ISNANf(__a) || !_ISNANf(__b))) {
__real__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __a;
__imag__(z) = _COPYSIGNf(__builtin_huge_valf(), __c) * __b;
} else if ((_ISINFf(__a) || _ISINFf(__b)) && _ISFINITEf(__c) &&
_ISFINITEf(__d)) {
__a = _COPYSIGNf(_ISINFf(__a) ? 1 : 0, __a);
__b = _COPYSIGNf(_ISINFf(__b) ? 1 : 0, __b);
__real__(z) = __builtin_huge_valf() * (__a * __c + __b * __d);
__imag__(z) = __builtin_huge_valf() * (__b * __c - __a * __d);
} else if (_ISINFf(__logbw) && __logbw > 0 && _ISFINITEf(__a) &&
_ISFINITEf(__b)) {
__c = _COPYSIGNf(_ISINFf(__c) ? 1 : 0, __c);
__d = _COPYSIGNf(_ISINFf(__d) ? 1 : 0, __d);
__real__(z) = 0 * (__a * __c + __b * __d);
__imag__(z) = 0 * (__b * __c - __a * __d);
}
}
return z;
}
#if defined(__cplusplus)
} // extern "C"
#endif
#undef _ISNANd
#undef _ISNANf
#undef _ISINFd
#undef _ISINFf
#undef _COPYSIGNd
#undef _COPYSIGNf
#undef _ISFINITEd
#undef _ISFINITEf
#undef _SCALBNd
#undef _SCALBNf
#undef _ABSd
#undef _ABSf
#undef _LOGBd
#undef _LOGBf
#undef _fmaxd
#undef _fmaxf
#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
#pragma omp end declare target
#endif
#pragma pop_macro("__DEVICE__")
#endif // __CLANG_CUDA_COMPLEX_BUILTINS

File diff suppressed because it is too large.


@@ -1,707 +0,0 @@
/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// sm_30 intrinsics: __shfl_{up,down,xor}.
#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#pragma push_macro("__MAKE_SHUFFLES")
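// Per the PTX shfl operand encoding, the third operand of the __nvvm_shfl_*
// intrinsics packs two fields: the low bits carry the source-lane clamp
// (__Mask below), and bits 8 and up carry warpSize - __width, which
// subdivides the warp into independent segments of __width lanes.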
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
__Type) \
inline __device__ int __FnName(int __val, __Type __offset, \
int __width = warpSize) { \
return __IntIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(float __val, __Type __offset, \
int __width = warpSize) { \
return __FloatIntrinsic(__val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(long long __val, __Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ long __FnName(long __val, __Type __offset, \
int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ unsigned long long __FnName( \
unsigned long long __val, __Type __offset, int __width = warpSize) { \
return static_cast<unsigned long long>( \
::__FnName(static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(double __val, __Type __offset, \
int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
int);
#pragma pop_macro("__MAKE_SHUFFLES")
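// As an illustration, the __shfl_down overloads generated above support the
// classic warp-reduction idiom (hypothetical user code, pre-CUDA-9 style):
inline __device__ int __clang_cuda_example_warp_sum(int __v) {
  // Each step adds the value held by the lane __o positions above; after the
  // loop, lane 0 holds the sum over the whole warp.
  for (int __o = warpSize / 2; __o > 0; __o /= 2)
    __v += __shfl_down(__v, __o);
  return __v;
}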
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_sync_* variants available in CUDA-9
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
__Mask, __Type) \
inline __device__ int __FnName(unsigned int __mask, int __val, \
__Type __offset, int __width = warpSize) { \
return __IntIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ float __FnName(unsigned int __mask, float __val, \
__Type __offset, int __width = warpSize) { \
return __FloatIntrinsic(__mask, __val, __offset, \
((warpSize - __width) << 8) | (__Mask)); \
} \
inline __device__ unsigned int __FnName(unsigned int __mask, \
unsigned int __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned int>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
inline __device__ long long __FnName(unsigned int __mask, long long __val, \
__Type __offset, \
int __width = warpSize) { \
struct __Bits { \
int __a, __b; \
}; \
_Static_assert(sizeof(__val) == sizeof(__Bits)); \
_Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
__Bits __tmp; \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
__tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
long long __ret; \
memcpy(&__ret, &__tmp, sizeof(__tmp)); \
return __ret; \
} \
inline __device__ unsigned long long __FnName( \
unsigned int __mask, unsigned long long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long long>( \
::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
} \
inline __device__ long __FnName(unsigned int __mask, long __val, \
__Type __offset, int __width = warpSize) { \
_Static_assert(sizeof(long) == sizeof(long long) || \
sizeof(long) == sizeof(int)); \
if (sizeof(long) == sizeof(long long)) { \
return static_cast<long>(::__FnName( \
__mask, static_cast<long long>(__val), __offset, __width)); \
} else if (sizeof(long) == sizeof(int)) { \
return static_cast<long>( \
::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
} \
} \
inline __device__ unsigned long __FnName( \
unsigned int __mask, unsigned long __val, __Type __offset, \
int __width = warpSize) { \
return static_cast<unsigned long>( \
::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
} \
inline __device__ double __FnName(unsigned int __mask, double __val, \
__Type __offset, int __width = warpSize) { \
long long __tmp; \
_Static_assert(sizeof(__tmp) == sizeof(__val)); \
memcpy(&__tmp, &__val, sizeof(__val)); \
__tmp = ::__FnName(__mask, __tmp, __offset, __width); \
double __ret; \
memcpy(&__ret, &__tmp, sizeof(__ret)); \
return __ret; \
}
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
__nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
__nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
__nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
__nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
return __nvvm_bar_warp_sync(mask);
}
inline __device__ void __barrier_sync(unsigned int id) {
__nvvm_barrier_sync(id);
}
inline __device__ void __barrier_sync_count(unsigned int id,
unsigned int count) {
__nvvm_barrier_sync_cnt(id, count);
}
inline __device__ int __all_sync(unsigned int mask, int pred) {
return __nvvm_vote_all_sync(mask, pred);
}
inline __device__ int __any_sync(unsigned int mask, int pred) {
return __nvvm_vote_any_sync(mask, pred);
}
inline __device__ int __uni_sync(unsigned int mask, int pred) {
return __nvvm_vote_uni_sync(mask, pred);
}
inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
return __nvvm_vote_ballot_sync(mask, pred);
}
inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
return __nvvm_vote_ballot(1);
#else
return __nvvm_activemask();
#endif
}
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
return __nvvm_fns(mask, base, offset);
}
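// For example, these vote intrinsics combine into a warp-wide predicate
// count (hypothetical user code; __popc is the usual CUDA population-count
// device function):
inline __device__ int __clang_cuda_example_count_pred(int __pred) {
  unsigned int __mask = __activemask();                 // currently active lanes
  unsigned int __votes = __ballot_sync(__mask, __pred); // bit i = lane i's pred
  return __popc(__votes);                               // lanes voting true
}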
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
unsigned int value) {
return __nvvm_match_any_sync_i32(mask, value);
}
inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
return __nvvm_match_any_sync_i64(mask, value);
}
inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
return __nvvm_match_all_sync_i32p(mask, value, pred);
}
inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
return __nvvm_match_all_sync_i64p(mask, value, pred);
}
#include "crt/sm_70_rt.hpp"
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }
inline __device__ char2 __ldg(const char2 *ptr) {
typedef char c2 __attribute__((ext_vector_type(2)));
// We can assume that ptr is aligned at least to char2's alignment, but the
// load will assume that ptr is aligned to c2's alignment. This is only
// safe if alignof(c2) <= alignof(char2).
c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
char2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
typedef char c4 __attribute__((ext_vector_type(4)));
c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
char4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
typedef short s2 __attribute__((ext_vector_type(2)));
s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
short2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
typedef short s4 __attribute__((ext_vector_type(4)));
s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
short4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
typedef int i2 __attribute__((ext_vector_type(2)));
i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
int2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
typedef int i4 __attribute__((ext_vector_type(4)));
i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
int4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
typedef long long ll2 __attribute__((ext_vector_type(2)));
ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
longlong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
uchar2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
uchar4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
typedef unsigned short us2 __attribute__((ext_vector_type(2)));
us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
ushort2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
typedef unsigned short us4 __attribute__((ext_vector_type(4)));
us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
ushort4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
uint2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
uint4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
ulonglong2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float2 __ldg(const float2 *ptr) {
typedef float f2 __attribute__((ext_vector_type(2)));
f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
float2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
typedef float f4 __attribute__((ext_vector_type(4)));
f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
float4 ret;
ret.x = rv[0];
ret.y = rv[1];
ret.z = rv[2];
ret.w = rv[3];
return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
typedef double d2 __attribute__((ext_vector_type(2)));
d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
double2 ret;
ret.x = rv[0];
ret.y = rv[1];
return ret;
}
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.l.clamp.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned result;
asm("shf.r.wrap.b32 %0, %1, %2, %3;"
: "=r"(result)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
unsigned shiftWidth) {
unsigned ret;
asm("shf.r.clamp.b32 %0, %1, %2, %3;"
: "=r"(ret)
: "r"(low32), "r"(high32), "r"(shiftWidth));
return ret;
}
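// In other words, each funnel shift treats (high32:low32) as a single 64-bit
// value; for 0 < n < 32 (wrap semantics mask the shift by 31):
//   __funnelshift_l(lo, hi, n) == (hi << n) | (lo >> (32 - n))
//   __funnelshift_r(lo, hi, n) == (lo >> n) | (hi << (32 - n))
// The _lc/_rc variants clamp the shift to at most 32 instead of wrapping.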
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_add(__mask, __value);
}
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_umin(__mask, __value);
}
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_umax(__mask, __value);
}
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
return __nvvm_redux_sync_min(__mask, __value);
}
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
return __nvvm_redux_sync_max(__mask, __value);
}
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
return __nvvm_redux_sync_or(__mask, __value);
}
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_and(__mask, __value);
}
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
unsigned __value) {
return __nvvm_redux_sync_xor(__mask, __value);
}
__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
const void *__src,
unsigned __src_size) {
__nvvm_cp_async_ca_shared_global_4(
(void __attribute__((address_space(3))) *)__dst,
(const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
const void *__src,
unsigned __src_size) {
__nvvm_cp_async_ca_shared_global_8(
(void __attribute__((address_space(3))) *)__dst,
(const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
const void *__src,
unsigned __src_size) {
__nvvm_cp_async_ca_shared_global_16(
(void __attribute__((address_space(3))) *)__dst,
(const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void *
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
// TODO: it appears to provide the compiler with some sort of hint. We do not
// know what exactly it is supposed to do. However, the CUDA headers suggest
// that just passing __ptr through should not affect correctness; they do so
// on pre-sm80 GPUs, where this builtin is not available.
return (void*)__ptr;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
__device__ inline unsigned __isCtaShared(const void *ptr) {
return __isShared(ptr);
}
__device__ inline unsigned __isClusterShared(const void *__ptr) {
return __nvvm_isspacep_shared_cluster(__ptr);
}
__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
unsigned __rank) {
return __nvvm_mapa((void *)__ptr, __rank);
}
__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
return __nvvm_getctarank((void *)__ptr);
}
__device__ inline uint2
__cluster_map_shared_multicast(const void *__ptr,
unsigned int __cluster_cta_mask) {
return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
__cluster_cta_mask);
}
__device__ inline unsigned __clusterDimIsSpecified() {
return __nvvm_is_explicit_cluster();
}
__device__ inline dim3 __clusterDim() {
return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
__nvvm_read_ptx_sreg_cluster_nctaid_y(),
__nvvm_read_ptx_sreg_cluster_nctaid_z());
}
__device__ inline dim3 __clusterRelativeBlockIdx() {
return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
__nvvm_read_ptx_sreg_cluster_ctaid_y(),
__nvvm_read_ptx_sreg_cluster_ctaid_z());
}
__device__ inline dim3 __clusterGridDimInClusters() {
return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
__nvvm_read_ptx_sreg_nclusterid_y(),
__nvvm_read_ptx_sreg_nclusterid_z());
}
__device__ inline dim3 __clusterIdx() {
return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
__nvvm_read_ptx_sreg_clusterid_y(),
__nvvm_read_ptx_sreg_clusterid_z());
}
__device__ inline unsigned __clusterRelativeBlockRank() {
return __nvvm_read_ptx_sreg_cluster_ctarank();
}
__device__ inline unsigned __clusterSizeInBlocks() {
return __nvvm_read_ptx_sreg_cluster_nctarank();
}
__device__ inline void __cluster_barrier_arrive() {
__nvvm_barrier_cluster_arrive();
}
__device__ inline void __cluster_barrier_arrive_relaxed() {
__nvvm_barrier_cluster_arrive_relaxed();
}
__device__ inline void __cluster_barrier_wait() {
__nvvm_barrier_cluster_wait();
}
__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }
__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
float2 __ret;
__asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
: "=f"(__ret.x), "=f"(__ret.y)
: "l"(__ptr), "f"(__val.x), "f"(__val.y));
return __ret;
}
__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
float2 __ret;
__asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
: "=f"(__ret.x), "=f"(__ret.y)
: "l"(__ptr), "f"(__val.x), "f"(__val.y));
return __ret;
}
__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
float2 __ret;
__asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
: "=f"(__ret.x), "=f"(__ret.y)
: "l"(__ptr), "f"(__val.x), "f"(__val.y));
return __ret;
}
__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
float4 __ret;
__asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
: "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
: "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
return __ret;
}
__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
float4 __ret;
__asm__(
"atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
: "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
: "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
return __ret;
}
__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
float4 __ret;
__asm__(
"atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
: "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
: "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
:);
return __ret;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
#endif // CUDA_VERSION >= 11000
#endif // defined(__CLANG_CUDA_INTRINSICS_H__)


@@ -1,468 +0,0 @@
/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
#if defined(__cplusplus)
extern "C" {
#endif
#if defined(__OPENMP_NVPTX__)
#define __DEVICE__
#pragma omp begin assumes ext_spmd_amenable no_openmp
#elif defined(__CUDA__)
#define __DEVICE__ __device__
#endif
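// These declarations correspond to NVIDIA's libdevice bitcode library, which
// clang links into the device compilation. Wrapper headers then map the
// public math names onto them; schematically (simplified):
//   __DEVICE__ float cosf(float __a) { return __nv_cosf(__a); }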
__DEVICE__ int __nv_abs(int __a);
__DEVICE__ double __nv_acos(double __a);
__DEVICE__ float __nv_acosf(float __a);
__DEVICE__ double __nv_acosh(double __a);
__DEVICE__ float __nv_acoshf(float __a);
__DEVICE__ double __nv_asin(double __a);
__DEVICE__ float __nv_asinf(float __a);
__DEVICE__ double __nv_asinh(double __a);
__DEVICE__ float __nv_asinhf(float __a);
__DEVICE__ double __nv_atan2(double __a, double __b);
__DEVICE__ float __nv_atan2f(float __a, float __b);
__DEVICE__ double __nv_atan(double __a);
__DEVICE__ float __nv_atanf(float __a);
__DEVICE__ double __nv_atanh(double __a);
__DEVICE__ float __nv_atanhf(float __a);
__DEVICE__ int __nv_brev(int __a);
__DEVICE__ long long __nv_brevll(long long __a);
__DEVICE__ int __nv_byte_perm(int __a, int __b, int __c);
__DEVICE__ double __nv_cbrt(double __a);
__DEVICE__ float __nv_cbrtf(float __a);
__DEVICE__ double __nv_ceil(double __a);
__DEVICE__ float __nv_ceilf(float __a);
__DEVICE__ int __nv_clz(int __a);
__DEVICE__ int __nv_clzll(long long __a);
__DEVICE__ double __nv_copysign(double __a, double __b);
__DEVICE__ float __nv_copysignf(float __a, float __b);
__DEVICE__ double __nv_cos(double __a);
__DEVICE__ float __nv_cosf(float __a);
__DEVICE__ double __nv_cosh(double __a);
__DEVICE__ float __nv_coshf(float __a);
__DEVICE__ double __nv_cospi(double __a);
__DEVICE__ float __nv_cospif(float __a);
__DEVICE__ double __nv_cyl_bessel_i0(double __a);
__DEVICE__ float __nv_cyl_bessel_i0f(float __a);
__DEVICE__ double __nv_cyl_bessel_i1(double __a);
__DEVICE__ float __nv_cyl_bessel_i1f(float __a);
__DEVICE__ double __nv_dadd_rd(double __a, double __b);
__DEVICE__ double __nv_dadd_rn(double __a, double __b);
__DEVICE__ double __nv_dadd_ru(double __a, double __b);
__DEVICE__ double __nv_dadd_rz(double __a, double __b);
__DEVICE__ double __nv_ddiv_rd(double __a, double __b);
__DEVICE__ double __nv_ddiv_rn(double __a, double __b);
__DEVICE__ double __nv_ddiv_ru(double __a, double __b);
__DEVICE__ double __nv_ddiv_rz(double __a, double __b);
__DEVICE__ double __nv_dmul_rd(double __a, double __b);
__DEVICE__ double __nv_dmul_rn(double __a, double __b);
__DEVICE__ double __nv_dmul_ru(double __a, double __b);
__DEVICE__ double __nv_dmul_rz(double __a, double __b);
__DEVICE__ float __nv_double2float_rd(double __a);
__DEVICE__ float __nv_double2float_rn(double __a);
__DEVICE__ float __nv_double2float_ru(double __a);
__DEVICE__ float __nv_double2float_rz(double __a);
__DEVICE__ int __nv_double2hiint(double __a);
__DEVICE__ int __nv_double2int_rd(double __a);
__DEVICE__ int __nv_double2int_rn(double __a);
__DEVICE__ int __nv_double2int_ru(double __a);
__DEVICE__ int __nv_double2int_rz(double __a);
__DEVICE__ long long __nv_double2ll_rd(double __a);
__DEVICE__ long long __nv_double2ll_rn(double __a);
__DEVICE__ long long __nv_double2ll_ru(double __a);
__DEVICE__ long long __nv_double2ll_rz(double __a);
__DEVICE__ int __nv_double2loint(double __a);
__DEVICE__ unsigned int __nv_double2uint_rd(double __a);
__DEVICE__ unsigned int __nv_double2uint_rn(double __a);
__DEVICE__ unsigned int __nv_double2uint_ru(double __a);
__DEVICE__ unsigned int __nv_double2uint_rz(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rd(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rn(double __a);
__DEVICE__ unsigned long long __nv_double2ull_ru(double __a);
__DEVICE__ unsigned long long __nv_double2ull_rz(double __a);
__DEVICE__ unsigned long long __nv_double_as_longlong(double __a);
__DEVICE__ double __nv_drcp_rd(double __a);
__DEVICE__ double __nv_drcp_rn(double __a);
__DEVICE__ double __nv_drcp_ru(double __a);
__DEVICE__ double __nv_drcp_rz(double __a);
__DEVICE__ double __nv_dsqrt_rd(double __a);
__DEVICE__ double __nv_dsqrt_rn(double __a);
__DEVICE__ double __nv_dsqrt_ru(double __a);
__DEVICE__ double __nv_dsqrt_rz(double __a);
__DEVICE__ double __nv_dsub_rd(double __a, double __b);
__DEVICE__ double __nv_dsub_rn(double __a, double __b);
__DEVICE__ double __nv_dsub_ru(double __a, double __b);
__DEVICE__ double __nv_dsub_rz(double __a, double __b);
__DEVICE__ double __nv_erfc(double __a);
__DEVICE__ float __nv_erfcf(float __a);
__DEVICE__ double __nv_erfcinv(double __a);
__DEVICE__ float __nv_erfcinvf(float __a);
__DEVICE__ double __nv_erfcx(double __a);
__DEVICE__ float __nv_erfcxf(float __a);
__DEVICE__ double __nv_erf(double __a);
__DEVICE__ float __nv_erff(float __a);
__DEVICE__ double __nv_erfinv(double __a);
__DEVICE__ float __nv_erfinvf(float __a);
__DEVICE__ double __nv_exp10(double __a);
__DEVICE__ float __nv_exp10f(float __a);
__DEVICE__ double __nv_exp2(double __a);
__DEVICE__ float __nv_exp2f(float __a);
__DEVICE__ double __nv_exp(double __a);
__DEVICE__ float __nv_expf(float __a);
__DEVICE__ double __nv_expm1(double __a);
__DEVICE__ float __nv_expm1f(float __a);
__DEVICE__ double __nv_fabs(double __a);
__DEVICE__ float __nv_fabsf(float __a);
__DEVICE__ float __nv_fadd_rd(float __a, float __b);
__DEVICE__ float __nv_fadd_rn(float __a, float __b);
__DEVICE__ float __nv_fadd_ru(float __a, float __b);
__DEVICE__ float __nv_fadd_rz(float __a, float __b);
__DEVICE__ float __nv_fast_cosf(float __a);
__DEVICE__ float __nv_fast_exp10f(float __a);
__DEVICE__ float __nv_fast_expf(float __a);
__DEVICE__ float __nv_fast_fdividef(float __a, float __b);
__DEVICE__ float __nv_fast_log10f(float __a);
__DEVICE__ float __nv_fast_log2f(float __a);
__DEVICE__ float __nv_fast_logf(float __a);
__DEVICE__ float __nv_fast_powf(float __a, float __b);
__DEVICE__ void __nv_fast_sincosf(float __a, float *__s, float *__c);
__DEVICE__ float __nv_fast_sinf(float __a);
__DEVICE__ float __nv_fast_tanf(float __a);
__DEVICE__ double __nv_fdim(double __a, double __b);
__DEVICE__ float __nv_fdimf(float __a, float __b);
__DEVICE__ float __nv_fdiv_rd(float __a, float __b);
__DEVICE__ float __nv_fdiv_rn(float __a, float __b);
__DEVICE__ float __nv_fdiv_ru(float __a, float __b);
__DEVICE__ float __nv_fdiv_rz(float __a, float __b);
__DEVICE__ int __nv_ffs(int __a);
__DEVICE__ int __nv_ffsll(long long __a);
__DEVICE__ int __nv_finitef(float __a);
__DEVICE__ unsigned short __nv_float2half_rn(float __a);
__DEVICE__ int __nv_float2int_rd(float __a);
__DEVICE__ int __nv_float2int_rn(float __a);
__DEVICE__ int __nv_float2int_ru(float __a);
__DEVICE__ int __nv_float2int_rz(float __a);
__DEVICE__ long long __nv_float2ll_rd(float __a);
__DEVICE__ long long __nv_float2ll_rn(float __a);
__DEVICE__ long long __nv_float2ll_ru(float __a);
__DEVICE__ long long __nv_float2ll_rz(float __a);
__DEVICE__ unsigned int __nv_float2uint_rd(float __a);
__DEVICE__ unsigned int __nv_float2uint_rn(float __a);
__DEVICE__ unsigned int __nv_float2uint_ru(float __a);
__DEVICE__ unsigned int __nv_float2uint_rz(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rd(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rn(float __a);
__DEVICE__ unsigned long long __nv_float2ull_ru(float __a);
__DEVICE__ unsigned long long __nv_float2ull_rz(float __a);
__DEVICE__ int __nv_float_as_int(float __a);
__DEVICE__ unsigned int __nv_float_as_uint(float __a);
__DEVICE__ double __nv_floor(double __a);
__DEVICE__ float __nv_floorf(float __a);
__DEVICE__ double __nv_fma(double __a, double __b, double __c);
__DEVICE__ float __nv_fmaf(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rd(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rn(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_ru(float __a, float __b, float __c);
__DEVICE__ float __nv_fmaf_rz(float __a, float __b, float __c);
__DEVICE__ double __nv_fma_rd(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rn(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_ru(double __a, double __b, double __c);
__DEVICE__ double __nv_fma_rz(double __a, double __b, double __c);
__DEVICE__ double __nv_fmax(double __a, double __b);
__DEVICE__ float __nv_fmaxf(float __a, float __b);
__DEVICE__ double __nv_fmin(double __a, double __b);
__DEVICE__ float __nv_fminf(float __a, float __b);
__DEVICE__ double __nv_fmod(double __a, double __b);
__DEVICE__ float __nv_fmodf(float __a, float __b);
__DEVICE__ float __nv_fmul_rd(float __a, float __b);
__DEVICE__ float __nv_fmul_rn(float __a, float __b);
__DEVICE__ float __nv_fmul_ru(float __a, float __b);
__DEVICE__ float __nv_fmul_rz(float __a, float __b);
__DEVICE__ float __nv_frcp_rd(float __a);
__DEVICE__ float __nv_frcp_rn(float __a);
__DEVICE__ float __nv_frcp_ru(float __a);
__DEVICE__ float __nv_frcp_rz(float __a);
__DEVICE__ double __nv_frexp(double __a, int *__b);
__DEVICE__ float __nv_frexpf(float __a, int *__b);
__DEVICE__ float __nv_frsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_rd(float __a);
__DEVICE__ float __nv_fsqrt_rn(float __a);
__DEVICE__ float __nv_fsqrt_ru(float __a);
__DEVICE__ float __nv_fsqrt_rz(float __a);
__DEVICE__ float __nv_fsub_rd(float __a, float __b);
__DEVICE__ float __nv_fsub_rn(float __a, float __b);
__DEVICE__ float __nv_fsub_ru(float __a, float __b);
__DEVICE__ float __nv_fsub_rz(float __a, float __b);
__DEVICE__ int __nv_hadd(int __a, int __b);
__DEVICE__ float __nv_half2float(unsigned short __h);
__DEVICE__ double __nv_hiloint2double(int __a, int __b);
__DEVICE__ double __nv_hypot(double __a, double __b);
__DEVICE__ float __nv_hypotf(float __a, float __b);
__DEVICE__ int __nv_ilogb(double __a);
__DEVICE__ int __nv_ilogbf(float __a);
__DEVICE__ double __nv_int2double_rn(int __a);
__DEVICE__ float __nv_int2float_rd(int __a);
__DEVICE__ float __nv_int2float_rn(int __a);
__DEVICE__ float __nv_int2float_ru(int __a);
__DEVICE__ float __nv_int2float_rz(int __a);
__DEVICE__ float __nv_int_as_float(int __a);
__DEVICE__ int __nv_isfinited(double __a);
__DEVICE__ int __nv_isinfd(double __a);
__DEVICE__ int __nv_isinff(float __a);
__DEVICE__ int __nv_isnand(double __a);
__DEVICE__ int __nv_isnanf(float __a);
__DEVICE__ double __nv_j0(double __a);
__DEVICE__ float __nv_j0f(float __a);
__DEVICE__ double __nv_j1(double __a);
__DEVICE__ float __nv_j1f(float __a);
__DEVICE__ float __nv_jnf(int __a, float __b);
__DEVICE__ double __nv_jn(int __a, double __b);
__DEVICE__ double __nv_ldexp(double __a, int __b);
__DEVICE__ float __nv_ldexpf(float __a, int __b);
__DEVICE__ double __nv_lgamma(double __a);
__DEVICE__ float __nv_lgammaf(float __a);
__DEVICE__ double __nv_ll2double_rd(long long __a);
__DEVICE__ double __nv_ll2double_rn(long long __a);
__DEVICE__ double __nv_ll2double_ru(long long __a);
__DEVICE__ double __nv_ll2double_rz(long long __a);
__DEVICE__ float __nv_ll2float_rd(long long __a);
__DEVICE__ float __nv_ll2float_rn(long long __a);
__DEVICE__ float __nv_ll2float_ru(long long __a);
__DEVICE__ float __nv_ll2float_rz(long long __a);
__DEVICE__ long long __nv_llabs(long long __a);
__DEVICE__ long long __nv_llmax(long long __a, long long __b);
__DEVICE__ long long __nv_llmin(long long __a, long long __b);
__DEVICE__ long long __nv_llrint(double __a);
__DEVICE__ long long __nv_llrintf(float __a);
__DEVICE__ long long __nv_llround(double __a);
__DEVICE__ long long __nv_llroundf(float __a);
__DEVICE__ double __nv_log10(double __a);
__DEVICE__ float __nv_log10f(float __a);
__DEVICE__ double __nv_log1p(double __a);
__DEVICE__ float __nv_log1pf(float __a);
__DEVICE__ double __nv_log2(double __a);
__DEVICE__ float __nv_log2f(float __a);
__DEVICE__ double __nv_logb(double __a);
__DEVICE__ float __nv_logbf(float __a);
__DEVICE__ double __nv_log(double __a);
__DEVICE__ float __nv_logf(float __a);
__DEVICE__ double __nv_longlong_as_double(long long __a);
__DEVICE__ int __nv_max(int __a, int __b);
__DEVICE__ int __nv_min(int __a, int __b);
__DEVICE__ double __nv_modf(double __a, double *__b);
__DEVICE__ float __nv_modff(float __a, float *__b);
__DEVICE__ int __nv_mul24(int __a, int __b);
__DEVICE__ long long __nv_mul64hi(long long __a, long long __b);
__DEVICE__ int __nv_mulhi(int __a, int __b);
__DEVICE__ double __nv_nan(const signed char *__a);
__DEVICE__ float __nv_nanf(const signed char *__a);
__DEVICE__ double __nv_nearbyint(double __a);
__DEVICE__ float __nv_nearbyintf(float __a);
__DEVICE__ double __nv_nextafter(double __a, double __b);
__DEVICE__ float __nv_nextafterf(float __a, float __b);
__DEVICE__ double __nv_norm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_norm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_norm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_norm4df(float __a, float __b, float __c, float __d);
__DEVICE__ double __nv_normcdf(double __a);
__DEVICE__ float __nv_normcdff(float __a);
__DEVICE__ double __nv_normcdfinv(double __a);
__DEVICE__ float __nv_normcdfinvf(float __a);
__DEVICE__ float __nv_normf(int __a, const float *__b);
__DEVICE__ double __nv_norm(int __a, const double *__b);
__DEVICE__ int __nv_popc(unsigned int __a);
__DEVICE__ int __nv_popcll(unsigned long long __a);
__DEVICE__ double __nv_pow(double __a, double __b);
__DEVICE__ float __nv_powf(float __a, float __b);
__DEVICE__ double __nv_powi(double __a, int __b);
__DEVICE__ float __nv_powif(float __a, int __b);
__DEVICE__ double __nv_rcbrt(double __a);
__DEVICE__ float __nv_rcbrtf(float __a);
__DEVICE__ double __nv_rcp64h(double __a);
__DEVICE__ double __nv_remainder(double __a, double __b);
__DEVICE__ float __nv_remainderf(float __a, float __b);
__DEVICE__ double __nv_remquo(double __a, double __b, int *__c);
__DEVICE__ float __nv_remquof(float __a, float __b, int *__c);
__DEVICE__ int __nv_rhadd(int __a, int __b);
__DEVICE__ double __nv_rhypot(double __a, double __b);
__DEVICE__ float __nv_rhypotf(float __a, float __b);
__DEVICE__ double __nv_rint(double __a);
__DEVICE__ float __nv_rintf(float __a);
__DEVICE__ double __nv_rnorm3d(double __a, double __b, double __c);
__DEVICE__ float __nv_rnorm3df(float __a, float __b, float __c);
__DEVICE__ double __nv_rnorm4d(double __a, double __b, double __c, double __d);
__DEVICE__ float __nv_rnorm4df(float __a, float __b, float __c, float __d);
__DEVICE__ float __nv_rnormf(int __a, const float *__b);
__DEVICE__ double __nv_rnorm(int __a, const double *__b);
__DEVICE__ double __nv_round(double __a);
__DEVICE__ float __nv_roundf(float __a);
__DEVICE__ double __nv_rsqrt(double __a);
__DEVICE__ float __nv_rsqrtf(float __a);
__DEVICE__ int __nv_sad(int __a, int __b, int __c);
__DEVICE__ float __nv_saturatef(float __a);
__DEVICE__ double __nv_scalbn(double __a, int __b);
__DEVICE__ float __nv_scalbnf(float __a, int __b);
__DEVICE__ int __nv_signbitd(double __a);
__DEVICE__ int __nv_signbitf(float __a);
__DEVICE__ void __nv_sincos(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincosf(float __a, float *__b, float *__c);
__DEVICE__ void __nv_sincospi(double __a, double *__b, double *__c);
__DEVICE__ void __nv_sincospif(float __a, float *__b, float *__c);
__DEVICE__ double __nv_sin(double __a);
__DEVICE__ float __nv_sinf(float __a);
__DEVICE__ double __nv_sinh(double __a);
__DEVICE__ float __nv_sinhf(float __a);
__DEVICE__ double __nv_sinpi(double __a);
__DEVICE__ float __nv_sinpif(float __a);
__DEVICE__ double __nv_sqrt(double __a);
__DEVICE__ float __nv_sqrtf(float __a);
__DEVICE__ double __nv_tan(double __a);
__DEVICE__ float __nv_tanf(float __a);
__DEVICE__ double __nv_tanh(double __a);
__DEVICE__ float __nv_tanhf(float __a);
__DEVICE__ double __nv_tgamma(double __a);
__DEVICE__ float __nv_tgammaf(float __a);
__DEVICE__ double __nv_trunc(double __a);
__DEVICE__ float __nv_truncf(float __a);
__DEVICE__ int __nv_uhadd(unsigned int __a, unsigned int __b);
__DEVICE__ double __nv_uint2double_rn(unsigned int __i);
__DEVICE__ float __nv_uint2float_rd(unsigned int __a);
__DEVICE__ float __nv_uint2float_rn(unsigned int __a);
__DEVICE__ float __nv_uint2float_ru(unsigned int __a);
__DEVICE__ float __nv_uint2float_rz(unsigned int __a);
__DEVICE__ float __nv_uint_as_float(unsigned int __a);
__DEVICE__ double __nv_ull2double_rd(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rn(unsigned long long __a);
__DEVICE__ double __nv_ull2double_ru(unsigned long long __a);
__DEVICE__ double __nv_ull2double_rz(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rd(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rn(unsigned long long __a);
__DEVICE__ float __nv_ull2float_ru(unsigned long long __a);
__DEVICE__ float __nv_ull2float_rz(unsigned long long __a);
__DEVICE__ unsigned long long __nv_ullmax(unsigned long long __a,
unsigned long long __b);
__DEVICE__ unsigned long long __nv_ullmin(unsigned long long __a,
unsigned long long __b);
__DEVICE__ unsigned int __nv_umax(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umin(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned long long __nv_umul64hi(unsigned long long __a,
unsigned long long __b);
__DEVICE__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b);
__DEVICE__ unsigned int __nv_usad(unsigned int __a, unsigned int __b,
unsigned int __c);
#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020
__DEVICE__ int __nv_vabs2(int __a);
__DEVICE__ int __nv_vabs4(int __a);
__DEVICE__ int __nv_vabsdiffs2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffs4(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu2(int __a, int __b);
__DEVICE__ int __nv_vabsdiffu4(int __a, int __b);
__DEVICE__ int __nv_vabsss2(int __a);
__DEVICE__ int __nv_vabsss4(int __a);
__DEVICE__ int __nv_vadd2(int __a, int __b);
__DEVICE__ int __nv_vadd4(int __a, int __b);
__DEVICE__ int __nv_vaddss2(int __a, int __b);
__DEVICE__ int __nv_vaddss4(int __a, int __b);
__DEVICE__ int __nv_vaddus2(int __a, int __b);
__DEVICE__ int __nv_vaddus4(int __a, int __b);
__DEVICE__ int __nv_vavgs2(int __a, int __b);
__DEVICE__ int __nv_vavgs4(int __a, int __b);
__DEVICE__ int __nv_vavgu2(int __a, int __b);
__DEVICE__ int __nv_vavgu4(int __a, int __b);
__DEVICE__ int __nv_vcmpeq2(int __a, int __b);
__DEVICE__ int __nv_vcmpeq4(int __a, int __b);
__DEVICE__ int __nv_vcmpges2(int __a, int __b);
__DEVICE__ int __nv_vcmpges4(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgeu4(int __a, int __b);
__DEVICE__ int __nv_vcmpgts2(int __a, int __b);
__DEVICE__ int __nv_vcmpgts4(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu2(int __a, int __b);
__DEVICE__ int __nv_vcmpgtu4(int __a, int __b);
__DEVICE__ int __nv_vcmples2(int __a, int __b);
__DEVICE__ int __nv_vcmples4(int __a, int __b);
__DEVICE__ int __nv_vcmpleu2(int __a, int __b);
__DEVICE__ int __nv_vcmpleu4(int __a, int __b);
__DEVICE__ int __nv_vcmplts2(int __a, int __b);
__DEVICE__ int __nv_vcmplts4(int __a, int __b);
__DEVICE__ int __nv_vcmpltu2(int __a, int __b);
__DEVICE__ int __nv_vcmpltu4(int __a, int __b);
__DEVICE__ int __nv_vcmpne2(int __a, int __b);
__DEVICE__ int __nv_vcmpne4(int __a, int __b);
__DEVICE__ int __nv_vhaddu2(int __a, int __b);
__DEVICE__ int __nv_vhaddu4(int __a, int __b);
__DEVICE__ int __nv_vmaxs2(int __a, int __b);
__DEVICE__ int __nv_vmaxs4(int __a, int __b);
__DEVICE__ int __nv_vmaxu2(int __a, int __b);
__DEVICE__ int __nv_vmaxu4(int __a, int __b);
__DEVICE__ int __nv_vmins2(int __a, int __b);
__DEVICE__ int __nv_vmins4(int __a, int __b);
__DEVICE__ int __nv_vminu2(int __a, int __b);
__DEVICE__ int __nv_vminu4(int __a, int __b);
__DEVICE__ int __nv_vneg2(int __a);
__DEVICE__ int __nv_vneg4(int __a);
__DEVICE__ int __nv_vnegss2(int __a);
__DEVICE__ int __nv_vnegss4(int __a);
__DEVICE__ int __nv_vsads2(int __a, int __b);
__DEVICE__ int __nv_vsads4(int __a, int __b);
__DEVICE__ int __nv_vsadu2(int __a, int __b);
__DEVICE__ int __nv_vsadu4(int __a, int __b);
__DEVICE__ int __nv_vseteq2(int __a, int __b);
__DEVICE__ int __nv_vseteq4(int __a, int __b);
__DEVICE__ int __nv_vsetges2(int __a, int __b);
__DEVICE__ int __nv_vsetges4(int __a, int __b);
__DEVICE__ int __nv_vsetgeu2(int __a, int __b);
__DEVICE__ int __nv_vsetgeu4(int __a, int __b);
__DEVICE__ int __nv_vsetgts2(int __a, int __b);
__DEVICE__ int __nv_vsetgts4(int __a, int __b);
__DEVICE__ int __nv_vsetgtu2(int __a, int __b);
__DEVICE__ int __nv_vsetgtu4(int __a, int __b);
__DEVICE__ int __nv_vsetles2(int __a, int __b);
__DEVICE__ int __nv_vsetles4(int __a, int __b);
__DEVICE__ int __nv_vsetleu2(int __a, int __b);
__DEVICE__ int __nv_vsetleu4(int __a, int __b);
__DEVICE__ int __nv_vsetlts2(int __a, int __b);
__DEVICE__ int __nv_vsetlts4(int __a, int __b);
__DEVICE__ int __nv_vsetltu2(int __a, int __b);
__DEVICE__ int __nv_vsetltu4(int __a, int __b);
__DEVICE__ int __nv_vsetne2(int __a, int __b);
__DEVICE__ int __nv_vsetne4(int __a, int __b);
__DEVICE__ int __nv_vsub2(int __a, int __b);
__DEVICE__ int __nv_vsub4(int __a, int __b);
__DEVICE__ int __nv_vsubss2(int __a, int __b);
__DEVICE__ int __nv_vsubss4(int __a, int __b);
__DEVICE__ int __nv_vsubus2(int __a, int __b);
__DEVICE__ int __nv_vsubus4(int __a, int __b);
#endif // CUDA_VERSION
__DEVICE__ double __nv_y0(double __a);
__DEVICE__ float __nv_y0f(float __a);
__DEVICE__ double __nv_y1(double __a);
__DEVICE__ float __nv_y1f(float __a);
__DEVICE__ float __nv_ynf(int __a, float __b);
__DEVICE__ double __nv_yn(int __a, double __b);
#if defined(__OPENMP_NVPTX__)
#pragma omp end assumes ext_spmd_amenable no_openmp
#endif
#if defined(__cplusplus)
} // extern "C"
#endif
#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__
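// Illustration (hypothetical helper, not from the original header, assumes
// CUDA compilation): the __nv_* declarations above resolve against NVIDIA's
// libdevice bitcode at link time, so device code may call them directly:
__device__ float example_rcp_hypot(float x, float y) {
  // 1/hypot(x, y) built from two libdevice functions declared above.
  return __nv_frcp_rn(__nv_hypotf(x, y));
}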

@ -1,348 +0,0 @@
/*===---- __clang_cuda_math.h - Device-side CUDA math support --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_MATH_H__
#define __CLANG_CUDA_MATH_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
#ifndef __OPENMP_NVPTX__
#if CUDA_VERSION < 9000
#error This file is intended to be used with CUDA-9+ only.
#endif
#endif
// __DEVICE__ is a helper macro with a common set of attributes for the
// wrappers we implement in this file. We need static in order to avoid
// emitting unused functions, and __forceinline__ helps inlining these
// wrappers at -O1.
#pragma push_macro("__DEVICE__")
#ifdef __OPENMP_NVPTX__
#if defined(__cplusplus)
#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#endif
#else
#define __DEVICE__ static __device__ __forceinline__
#endif
// Specialized version of __DEVICE__ for functions with void return type. Needed
// because the OpenMP overlay requires constexpr functions here, but prior to
// C++14 functions returning void could not be constexpr.
#pragma push_macro("__DEVICE_VOID__")
#if defined(__OPENMP_NVPTX__) && defined(__cplusplus) && __cplusplus < 201402L
#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow))
#else
#define __DEVICE_VOID__ __DEVICE__
#endif
// libdevice provides fast low-precision and slow full-precision
// implementations for some functions. Which one gets selected depends on
// __CLANG_GPU_APPROX_TRANSCENDENTALS__, which clang defines when
// -ffast-math or -fgpu-approx-transcendentals is in effect.
#pragma push_macro("__FAST_OR_SLOW")
#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
#define __FAST_OR_SLOW(fast, slow) fast
#else
#define __FAST_OR_SLOW(fast, slow) slow
#endif
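// Illustration (editorial, mirrors the wrappers below): with approximate
// transcendentals enabled, __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(x)
// expands to __nv_fast_cosf(x); otherwise it expands to __nv_cosf(x).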
__DEVICE__ int abs(int __a) { return __nv_abs(__a); }
__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); }
__DEVICE__ double acos(double __a) { return __nv_acos(__a); }
__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); }
__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); }
__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); }
__DEVICE__ double asin(double __a) { return __nv_asin(__a); }
__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); }
__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); }
__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); }
__DEVICE__ double atan(double __a) { return __nv_atan(__a); }
__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); }
__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); }
__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); }
__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); }
__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); }
__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); }
__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); }
__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); }
__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); }
__DEVICE__ double copysign(double __a, double __b) {
return __nv_copysign(__a, __b);
}
__DEVICE__ float copysignf(float __a, float __b) {
return __nv_copysignf(__a, __b);
}
__DEVICE__ double cos(double __a) { return __nv_cos(__a); }
__DEVICE__ float cosf(float __a) {
return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
}
__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); }
__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); }
__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); }
__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); }
__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); }
__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); }
__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); }
__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); }
__DEVICE__ double erf(double __a) { return __nv_erf(__a); }
__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); }
__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); }
__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); }
__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); }
__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); }
__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); }
__DEVICE__ float erff(float __a) { return __nv_erff(__a); }
__DEVICE__ double erfinv(double __a) { return __nv_erfinv(__a); }
__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); }
__DEVICE__ double exp(double __a) { return __nv_exp(__a); }
__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); }
__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); }
__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); }
__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); }
__DEVICE__ float expf(float __a) { return __nv_expf(__a); }
__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); }
__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); }
__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); }
__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); }
__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); }
__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; }
__DEVICE__ float fdividef(float __a, float __b) {
#if __FAST_MATH__ && !__CUDA_PREC_DIV
return __nv_fast_fdividef(__a, __b);
#else
return __a / __b;
#endif
}
__DEVICE__ double floor(double __f) { return __nv_floor(__f); }
__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); }
__DEVICE__ double fma(double __a, double __b, double __c) {
return __nv_fma(__a, __b, __c);
}
__DEVICE__ float fmaf(float __a, float __b, float __c) {
return __nv_fmaf(__a, __b, __c);
}
__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); }
__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); }
__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); }
__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); }
__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); }
__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); }
__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); }
__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); }
__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); }
__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); }
__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); }
__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); }
__DEVICE__ double j0(double __a) { return __nv_j0(__a); }
__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); }
__DEVICE__ double j1(double __a) { return __nv_j1(__a); }
__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); }
__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); }
__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); }
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long labs(long __a) { return __nv_llabs(__a); }
#else
__DEVICE__ long labs(long __a) { return __nv_abs(__a); }
#endif
__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); }
__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); }
__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); }
__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); }
__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); }
__DEVICE__ long long llmax(long long __a, long long __b) {
return __nv_llmax(__a, __b);
}
__DEVICE__ long long llmin(long long __a, long long __b) {
return __nv_llmin(__a, __b);
}
__DEVICE__ long long llrint(double __a) { return __nv_llrint(__a); }
__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); }
__DEVICE__ long long llround(double __a) { return __nv_llround(__a); }
__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); }
__DEVICE__ double round(double __a) { return __nv_round(__a); }
__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); }
__DEVICE__ double log(double __a) { return __nv_log(__a); }
__DEVICE__ double log10(double __a) { return __nv_log10(__a); }
__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); }
__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); }
__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); }
__DEVICE__ double log2(double __a) { return __nv_log2(__a); }
__DEVICE__ float log2f(float __a) {
return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a);
}
__DEVICE__ double logb(double __a) { return __nv_logb(__a); }
__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); }
__DEVICE__ float logf(float __a) {
return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a);
}
#if defined(__LP64__) || defined(_WIN64)
__DEVICE__ long lrint(double __a) { return llrint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); }
__DEVICE__ long lround(double __a) { return llround(__a); }
__DEVICE__ long lroundf(float __a) { return llroundf(__a); }
#else
__DEVICE__ long lrint(double __a) { return (long)rint(__a); }
__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); }
__DEVICE__ long lround(double __a) { return round(__a); }
__DEVICE__ long lroundf(float __a) { return roundf(__a); }
#endif
__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); }
__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); }
__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); }
__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); }
__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); }
__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); }
__DEVICE__ double nextafter(double __a, double __b) {
return __nv_nextafter(__a, __b);
}
__DEVICE__ float nextafterf(float __a, float __b) {
return __nv_nextafterf(__a, __b);
}
__DEVICE__ double norm(int __dim, const double *__t) {
return __nv_norm(__dim, __t);
}
__DEVICE__ double norm3d(double __a, double __b, double __c) {
return __nv_norm3d(__a, __b, __c);
}
__DEVICE__ float norm3df(float __a, float __b, float __c) {
return __nv_norm3df(__a, __b, __c);
}
__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) {
return __nv_norm4d(__a, __b, __c, __d);
}
__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) {
return __nv_norm4df(__a, __b, __c, __d);
}
__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); }
__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); }
__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); }
__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); }
__DEVICE__ float normf(int __dim, const float *__t) {
return __nv_normf(__dim, __t);
}
__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); }
__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); }
__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); }
__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); }
__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); }
__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); }
__DEVICE__ double remainder(double __a, double __b) {
return __nv_remainder(__a, __b);
}
__DEVICE__ float remainderf(float __a, float __b) {
return __nv_remainderf(__a, __b);
}
__DEVICE__ double remquo(double __a, double __b, int *__c) {
return __nv_remquo(__a, __b, __c);
}
__DEVICE__ float remquof(float __a, float __b, int *__c) {
return __nv_remquof(__a, __b, __c);
}
__DEVICE__ double rhypot(double __a, double __b) {
return __nv_rhypot(__a, __b);
}
__DEVICE__ float rhypotf(float __a, float __b) {
return __nv_rhypotf(__a, __b);
}
// __nv_rint* in libdevice is buggy and produces incorrect results.
__DEVICE__ double rint(double __a) { return __builtin_rint(__a); }
__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); }
__DEVICE__ double rnorm(int __a, const double *__b) {
return __nv_rnorm(__a, __b);
}
__DEVICE__ double rnorm3d(double __a, double __b, double __c) {
return __nv_rnorm3d(__a, __b, __c);
}
__DEVICE__ float rnorm3df(float __a, float __b, float __c) {
return __nv_rnorm3df(__a, __b, __c);
}
__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) {
return __nv_rnorm4d(__a, __b, __c, __d);
}
__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) {
return __nv_rnorm4df(__a, __b, __c, __d);
}
__DEVICE__ float rnormf(int __dim, const float *__t) {
return __nv_rnormf(__dim, __t);
}
__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); }
__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); }
__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); }
__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); }
__DEVICE__ double scalbln(double __a, long __b) {
if (__b > INT_MAX)
return __a > 0 ? HUGE_VAL : -HUGE_VAL;
if (__b < INT_MIN)
return __a > 0 ? 0.0 : -0.0;
return scalbn(__a, (int)__b);
}
__DEVICE__ float scalblnf(float __a, long __b) {
if (__b > INT_MAX)
return __a > 0 ? HUGE_VALF : -HUGE_VALF;
if (__b < INT_MIN)
return __a > 0 ? 0.f : -0.f;
return scalbnf(__a, (int)__b);
}
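// Worked example (illustrative): on an LP64 target, scalbln(2.0, 3000000000L)
// has an exponent above INT_MAX, so the clamp above returns HUGE_VAL directly
// instead of passing a truncated exponent to scalbn.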
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) {
return __nv_sincos(__a, __s, __c);
}
__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) {
return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __s, __c);
}
__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) {
return __nv_sincospi(__a, __s, __c);
}
__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) {
return __nv_sincospif(__a, __s, __c);
}
__DEVICE__ float sinf(float __a) {
return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a);
}
__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); }
__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); }
__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); }
__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); }
__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); }
__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); }
__DEVICE__ double tan(double __a) { return __nv_tan(__a); }
__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); }
__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); }
__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); }
__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); }
__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); }
__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); }
__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); }
__DEVICE__ unsigned long long ullmax(unsigned long long __a,
unsigned long long __b) {
return __nv_ullmax(__a, __b);
}
__DEVICE__ unsigned long long ullmin(unsigned long long __a,
unsigned long long __b) {
return __nv_ullmin(__a, __b);
}
__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) {
return __nv_umax(__a, __b);
}
__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) {
return __nv_umin(__a, __b);
}
__DEVICE__ double y0(double __a) { return __nv_y0(__a); }
__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); }
__DEVICE__ double y1(double __a) { return __nv_y1(__a); }
__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); }
__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); }
__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__DEVICE_VOID__")
#pragma pop_macro("__FAST_OR_SLOW")
#endif // __CLANG_CUDA_MATH_H__

@ -1,284 +0,0 @@
/*===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#define __CLANG__CUDA_MATH_FORWARD_DECLARES_H__
#if !defined(__CUDA__) && !__HIP__
#error "This file is for CUDA/HIP compilation only."
#endif
// This file forward-declares some math functions we (or the CUDA headers)
// will define later. We need to do this, and do it before cmath is included,
// because the standard library may have constexpr math functions. In the
// absence of a prior __device__ decl, those constexpr functions may become
// implicitly host+device. host+device functions can't be overloaded, so that
// would preclude the use of our own __device__ overloads for these functions.
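// Illustrative failure mode (editorial sketch; `floor` chosen arbitrarily):
// if <cmath> were included first and declared a constexpr float floor(float),
// that function would be treated as implicitly host+device, and the
// __device__ floor declared below could then no longer overload against it.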
#pragma push_macro("__DEVICE__")
#define __DEVICE__ \
static __inline__ __attribute__((always_inline)) __attribute__((device))
__DEVICE__ long abs(long);
__DEVICE__ long long abs(long long);
__DEVICE__ double abs(double);
__DEVICE__ float abs(float);
__DEVICE__ int abs(int);
__DEVICE__ double acos(double);
__DEVICE__ float acos(float);
__DEVICE__ double acosh(double);
__DEVICE__ float acosh(float);
__DEVICE__ double asin(double);
__DEVICE__ float asin(float);
__DEVICE__ double asinh(double);
__DEVICE__ float asinh(float);
__DEVICE__ double atan2(double, double);
__DEVICE__ float atan2(float, float);
__DEVICE__ double atan(double);
__DEVICE__ float atan(float);
__DEVICE__ double atanh(double);
__DEVICE__ float atanh(float);
__DEVICE__ double cbrt(double);
__DEVICE__ float cbrt(float);
__DEVICE__ double ceil(double);
__DEVICE__ float ceil(float);
__DEVICE__ double copysign(double, double);
__DEVICE__ float copysign(float, float);
__DEVICE__ double cos(double);
__DEVICE__ float cos(float);
__DEVICE__ double cosh(double);
__DEVICE__ float cosh(float);
__DEVICE__ double erfc(double);
__DEVICE__ float erfc(float);
__DEVICE__ double erf(double);
__DEVICE__ float erf(float);
__DEVICE__ double exp2(double);
__DEVICE__ float exp2(float);
__DEVICE__ double exp(double);
__DEVICE__ float exp(float);
__DEVICE__ double expm1(double);
__DEVICE__ float expm1(float);
__DEVICE__ double fabs(double);
__DEVICE__ float fabs(float);
__DEVICE__ double fdim(double, double);
__DEVICE__ float fdim(float, float);
__DEVICE__ double floor(double);
__DEVICE__ float floor(float);
__DEVICE__ double fma(double, double, double);
__DEVICE__ float fma(float, float, float);
__DEVICE__ double fmax(double, double);
__DEVICE__ float fmax(float, float);
__DEVICE__ double fmin(double, double);
__DEVICE__ float fmin(float, float);
__DEVICE__ double fmod(double, double);
__DEVICE__ float fmod(float, float);
__DEVICE__ int fpclassify(double);
__DEVICE__ int fpclassify(float);
__DEVICE__ double frexp(double, int *);
__DEVICE__ float frexp(float, int *);
__DEVICE__ double hypot(double, double);
__DEVICE__ float hypot(float, float);
__DEVICE__ int ilogb(double);
__DEVICE__ int ilogb(float);
#ifdef _MSC_VER
__DEVICE__ bool isfinite(long double);
#endif
__DEVICE__ bool isfinite(double);
__DEVICE__ bool isfinite(float);
__DEVICE__ bool isgreater(double, double);
__DEVICE__ bool isgreaterequal(double, double);
__DEVICE__ bool isgreaterequal(float, float);
__DEVICE__ bool isgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isinf(long double);
#endif
__DEVICE__ bool isinf(double);
__DEVICE__ bool isinf(float);
__DEVICE__ bool isless(double, double);
__DEVICE__ bool islessequal(double, double);
__DEVICE__ bool islessequal(float, float);
__DEVICE__ bool isless(float, float);
__DEVICE__ bool islessgreater(double, double);
__DEVICE__ bool islessgreater(float, float);
#ifdef _MSC_VER
__DEVICE__ bool isnan(long double);
#endif
__DEVICE__ bool isnan(double);
__DEVICE__ bool isnan(float);
__DEVICE__ bool isnormal(double);
__DEVICE__ bool isnormal(float);
__DEVICE__ bool isunordered(double, double);
__DEVICE__ bool isunordered(float, float);
__DEVICE__ long labs(long);
__DEVICE__ double ldexp(double, int);
__DEVICE__ float ldexp(float, int);
__DEVICE__ double lgamma(double);
__DEVICE__ float lgamma(float);
__DEVICE__ long long llabs(long long);
__DEVICE__ long long llrint(double);
__DEVICE__ long long llrint(float);
__DEVICE__ double log10(double);
__DEVICE__ float log10(float);
__DEVICE__ double log1p(double);
__DEVICE__ float log1p(float);
__DEVICE__ double log2(double);
__DEVICE__ float log2(float);
__DEVICE__ double logb(double);
__DEVICE__ float logb(float);
__DEVICE__ double log(double);
__DEVICE__ float log(float);
__DEVICE__ long lrint(double);
__DEVICE__ long lrint(float);
__DEVICE__ long lround(double);
__DEVICE__ long lround(float);
__DEVICE__ long long llround(float); // No llround(double).
__DEVICE__ double modf(double, double *);
__DEVICE__ float modf(float, float *);
__DEVICE__ double nan(const char *);
__DEVICE__ float nanf(const char *);
__DEVICE__ double nearbyint(double);
__DEVICE__ float nearbyint(float);
__DEVICE__ double nextafter(double, double);
__DEVICE__ float nextafter(float, float);
__DEVICE__ double pow(double, double);
__DEVICE__ double pow(double, int);
__DEVICE__ float pow(float, float);
__DEVICE__ float pow(float, int);
__DEVICE__ double remainder(double, double);
__DEVICE__ float remainder(float, float);
__DEVICE__ double remquo(double, double, int *);
__DEVICE__ float remquo(float, float, int *);
__DEVICE__ double rint(double);
__DEVICE__ float rint(float);
__DEVICE__ double round(double);
__DEVICE__ float round(float);
__DEVICE__ double scalbln(double, long);
__DEVICE__ float scalbln(float, long);
__DEVICE__ double scalbn(double, int);
__DEVICE__ float scalbn(float, int);
#ifdef _MSC_VER
__DEVICE__ bool signbit(long double);
#endif
__DEVICE__ bool signbit(double);
__DEVICE__ bool signbit(float);
__DEVICE__ double sin(double);
__DEVICE__ float sin(float);
__DEVICE__ double sinh(double);
__DEVICE__ float sinh(float);
__DEVICE__ double sqrt(double);
__DEVICE__ float sqrt(float);
__DEVICE__ double tan(double);
__DEVICE__ float tan(float);
__DEVICE__ double tanh(double);
__DEVICE__ float tanh(float);
__DEVICE__ double tgamma(double);
__DEVICE__ float tgamma(float);
__DEVICE__ double trunc(double);
__DEVICE__ float trunc(float);
// Notably missing above is nexttoward, which we don't define on
// the device side because libdevice doesn't give us an implementation, and we
// don't want to be in the business of writing one ourselves.
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
using ::abs;
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isinf;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnan;
using ::isnormal;
using ::isunordered;
using ::labs;
using ::ldexp;
using ::lgamma;
using ::llabs;
using ::llrint;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::llround;
using ::modf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nextafter;
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#pragma pop_macro("__DEVICE__")
#endif

@ -1,503 +0,0 @@
/*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/*
* WARNING: This header is intended to be directly -include'd by
* the compiler and is not supposed to be included by users.
*
 * CUDA headers are implemented in a way that currently makes it
 * impossible for user code to #include them directly when compiling with
 * Clang. They present a different view of CUDA-supplied functions
 * depending on where in NVCC's compilation pipeline the headers are
 * included. Neither of these modes provides function definitions with
 * correct attributes, so we use the preprocessor to force the headers
 * into a form that Clang can use.
*
* Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
* this file during every CUDA compilation.
*/
#ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
#define __CLANG_CUDA_RUNTIME_WRAPPER_H__
#if defined(__CUDA__) && defined(__clang__)
// Include some forward declares that must come before cmath.
#include <__clang_cuda_math_forward_declares.h>
// Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
// enabled depend on it to avoid using __float128, which is unsupported in
// CUDA.
#define __CUDACC__
// Include some standard headers to avoid CUDA headers including them
// while some required macros (like __THROW) are in a weird state.
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#include <string.h>
#undef __CUDACC__
// Preserve common macros that will be changed below by us or by CUDA
// headers.
#pragma push_macro("__THROW")
#pragma push_macro("__CUDA_ARCH__")
// WARNING: Preprocessor hacks below are based on specific details of
// CUDA-7.x headers and are not expected to work with any other
// version of CUDA headers.
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
#elif CUDA_VERSION < 7000
#error "Unsupported CUDA version!"
#endif
#pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
#if CUDA_VERSION >= 10000
#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
#endif
// Make largest subset of device functions available during host
// compilation.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 9999
#endif
#include "__clang_cuda_builtin_vars.h"
// No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
// has taken care of the builtin variables declared in that file.
#define __DEVICE_LAUNCH_PARAMETERS_H__
// {math,device}_functions.h only have declarations of the
// functions. We don't need them as we're going to pull in their
// definitions from .hpp files.
#define __DEVICE_FUNCTIONS_H__
#define __MATH_FUNCTIONS_H__
#define __COMMON_FUNCTIONS_H__
// device_functions_decls is replaced by __clang_cuda_device_functions.h
// included below.
#define __DEVICE_FUNCTIONS_DECLS_H__
#undef __CUDACC__
#if CUDA_VERSION < 9000
#define __CUDABE__
#else
#define __CUDACC__
#define __CUDA_LIBDEVICE__
#endif
// Disables definitions of device-side runtime support stubs in
// cuda_device_runtime_api.h
#include "host_defines.h"
#undef __CUDACC__
#include "driver_types.h"
#include "host_config.h"
// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
// functional equivalent of what we need.
#pragma push_macro("nv_weak")
#define nv_weak weak
#undef __CUDABE__
#undef __CUDA_LIBDEVICE__
#define __CUDACC__
#include "cuda_runtime.h"
#pragma pop_macro("nv_weak")
#undef __CUDACC__
#define __CUDABE__
// CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
// not have at the moment. Emulate them with a builtin memcpy/memset.
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
#if CUDA_VERSION < 9000
#include "crt/device_runtime.h"
#endif
#include "crt/host_runtime.h"
// device_runtime.h defines __cxa_* macros that will conflict with
// cxxabi.h.
// FIXME: redefine these as __device__ functions.
#undef __cxa_vec_ctor
#undef __cxa_vec_cctor
#undef __cxa_vec_dtor
#undef __cxa_vec_new
#undef __cxa_vec_new2
#undef __cxa_vec_new3
#undef __cxa_vec_delete2
#undef __cxa_vec_delete
#undef __cxa_vec_delete3
#undef __cxa_pure_virtual
// math_functions.hpp expects this host function to be defined on macOS, but it
// ends up not being there because of the games we play here. Just define it
// ourselves; it's simple enough.
#ifdef __APPLE__
inline __host__ double __signbitd(double x) {
return std::signbit(x);
}
#endif
// CUDA 9.1 no longer provides declarations for libdevice functions, so we need
// to provide our own.
#include <__clang_cuda_libdevice_declares.h>
// Wrappers for many device-side standard library functions, incl. math
// functions, became compiler builtins in CUDA-9 and have been removed from the
// CUDA headers. Clang now provides its own implementation of the wrappers.
#if CUDA_VERSION >= 9000
#include <__clang_cuda_device_functions.h>
#include <__clang_cuda_math.h>
#endif
// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
// counterpart does not do it, so we need to make it empty here to keep the
// following CUDA includes happy.
#undef __THROW
#define __THROW
// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
// Previous versions used to check whether they are defined or not.
// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
// here to detect the switch.
#if defined(CU_DEVICE_INVALID)
#if !defined(__USE_FAST_MATH__)
#define __USE_FAST_MATH__ 0
#endif
#if !defined(__CUDA_PREC_DIV)
#define __CUDA_PREC_DIV 0
#endif
#endif
// Temporarily poison __host__ macro to ensure it's not used by any of
// the headers we're about to include.
#pragma push_macro("__host__")
#define __host__ UNEXPECTED_HOST_ATTRIBUTE
// device_functions.hpp and math_functions*.hpp use 'static
// __forceinline__' (with no __device__) for definitions of device
// functions. Temporarily redefine __forceinline__ to include
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
#if CUDA_VERSION < 9000
#include "device_functions.hpp"
#endif
// math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
// sin and exp. This is controlled in clang by -fgpu-approx-transcendentals.
//
// device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
// slow divides), so we need to scope our define carefully here.
#pragma push_macro("__USE_FAST_MATH__")
#if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__)
#define __USE_FAST_MATH__ 1
#endif
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("__USE_FAST_MATH__")
#if CUDA_VERSION < 9000
#include "math_functions_dbl_ptx3.hpp"
#endif
#pragma pop_macro("__forceinline__")
// Pull in host-only functions that are only available when neither
// __CUDACC__ nor __CUDABE__ are defined.
#undef __MATH_FUNCTIONS_HPP__
#undef __CUDABE__
#if CUDA_VERSION < 9000
#include "math_functions.hpp"
#endif
// Alas, additional overloads for these functions are hard to get to.
// Considering that we only need these overloads for a few functions,
// we can provide them here.
static inline float rsqrt(float __a) { return rsqrtf(__a); }
static inline float rcbrt(float __a) { return rcbrtf(__a); }
static inline float sinpi(float __a) { return sinpif(__a); }
static inline float cospi(float __a) { return cospif(__a); }
static inline void sincospi(float __a, float *__b, float *__c) {
return sincospif(__a, __b, __c);
}
static inline float erfcinv(float __a) { return erfcinvf(__a); }
static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
static inline float normcdf(float __a) { return normcdff(__a); }
static inline float erfcx(float __a) { return erfcxf(__a); }
#if CUDA_VERSION < 9000
// For some reason the single-argument variant is not always declared by the
// CUDA headers. Alas, device_functions.hpp included below needs it.
static inline __device__ void __brkpt(int __c) { __brkpt(); }
#endif
// Now include *.hpp with definitions of various GPU functions. Alas,
// a lot of things get declared/defined with the __host__ attribute, which
// we don't want, so we have to define it away. We also have to include
// {device,math}_functions.hpp again in order to extract the other
// branch of the #if/else inside.
#define __host__
#undef __CUDABE__
#define __CUDACC__
#if CUDA_VERSION >= 9000
// Some atomic functions became compiler builtins in CUDA-9, so we need their
// declarations.
#include "device_atomic_functions.h"
#endif
#undef __DEVICE_FUNCTIONS_HPP__
#include "device_atomic_functions.hpp"
#if CUDA_VERSION >= 9000
#include "crt/device_functions.hpp"
#include "crt/device_double_functions.hpp"
#else
#include "device_functions.hpp"
#define __CUDABE__
#include "device_double_functions.h"
#undef __CUDABE__
#endif
#include "sm_20_atomic_functions.hpp"
// Predicate functions used in `__builtin_assume` need to have no side effect.
// However, sm_20_intrinsics.hpp defines them with neither the pure nor the
// const attribute. Rename the definitions from sm_20_intrinsics.hpp and
// re-define them as const ones.
#pragma push_macro("__isGlobal")
#pragma push_macro("__isShared")
#pragma push_macro("__isConstant")
#pragma push_macro("__isLocal")
#define __isGlobal __ignored_cuda___isGlobal
#define __isShared __ignored_cuda___isShared
#define __isConstant __ignored_cuda___isConstant
#define __isLocal __ignored_cuda___isLocal
#include "sm_20_intrinsics.hpp"
#pragma pop_macro("__isGlobal")
#pragma pop_macro("__isShared")
#pragma pop_macro("__isConstant")
#pragma pop_macro("__isLocal")
#pragma push_macro("__DEVICE__")
#define __DEVICE__ static __device__ __forceinline__ __attribute__((const))
__DEVICE__ unsigned int __isGlobal(const void *p) {
return __nvvm_isspacep_global(p);
}
__DEVICE__ unsigned int __isShared(const void *p) {
return __nvvm_isspacep_shared(p);
}
__DEVICE__ unsigned int __isConstant(const void *p) {
return __nvvm_isspacep_const(p);
}
__DEVICE__ unsigned int __isLocal(const void *p) {
return __nvvm_isspacep_local(p);
}
#pragma pop_macro("__DEVICE__")
#include "sm_32_atomic_functions.hpp"
// Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
// __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
// define them using builtins so that the optimizer can reason about and across
// these instructions. In particular, using intrinsics for ldg gets us the
// [addr+imm] addressing mode, which, although it doesn't actually exist in the
// hardware, seems to generate faster machine code because ptxas can more easily
// reason about our code.
#if CUDA_VERSION >= 8000
#pragma push_macro("__CUDA_ARCH__")
#undef __CUDA_ARCH__
#include "sm_60_atomic_functions.hpp"
#include "sm_61_intrinsics.hpp"
#pragma pop_macro("__CUDA_ARCH__")
#endif
#undef __MATH_FUNCTIONS_HPP__
// math_functions.hpp defines ::signbit as a __host__ __device__ function. This
// conflicts with libstdc++'s constexpr ::signbit, so we have to rename
// math_functions.hpp's ::signbit. It's guarded by #undef signbit, but that's
// conditional on __GNUC__. :)
#pragma push_macro("signbit")
#pragma push_macro("__GNUC__")
#undef __GNUC__
#define signbit __ignored_cuda_signbit
// CUDA-9 omits device-side definitions of some math functions if it sees the
// include guard from libstdc++'s math.h wrapper. We have to undo the header
// guard temporarily to get the definitions we need.
#pragma push_macro("_GLIBCXX_MATH_H")
#pragma push_macro("_LIBCPP_VERSION")
#if CUDA_VERSION >= 9000
#undef _GLIBCXX_MATH_H
// We also need to undo another guard that checks for libc++ 3.8+
#ifdef _LIBCPP_VERSION
#define _LIBCPP_VERSION 3700
#endif
#endif
#if CUDA_VERSION >= 9000
#include "crt/math_functions.hpp"
#else
#include "math_functions.hpp"
#endif
#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("_LIBCPP_VERSION")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
#pragma pop_macro("__host__")
// __clang_cuda_texture_intrinsics.h must be included first in order to provide
// the implementation of __nv_tex_surf_handler that CUDA's headers depend on.
// The implementation requires C++11 and only works with CUDA-9 or newer.
#if __cplusplus >= 201103L && CUDA_VERSION >= 9000
// clang-format off
#include <__clang_cuda_texture_intrinsics.h>
// clang-format on
#else
#if CUDA_VERSION >= 9000
// Provide a hint that texture support needs C++11.
template <typename T> struct __nv_tex_needs_cxx11 {
const static bool value = false;
};
template <class T>
__host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr,
cudaTextureObject_t obj,
float x) {
_Static_assert(__nv_tex_needs_cxx11<T>::value,
"Texture support requires C++11");
}
#else
// Textures in CUDA-8 and older are not supported by clang. There's no
// convenient way to intercept texture use in these versions, so we can't
// produce a meaningful error. The source code that attempts to use textures
// will continue to fail as it does now.
#endif // CUDA_VERSION
#endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000
#include "texture_fetch_functions.h"
#include "texture_indirect_functions.h"
// Restore state of __CUDA_ARCH__ and __THROW we had on entry.
#pragma pop_macro("__CUDA_ARCH__")
#pragma pop_macro("__THROW")
// Set up compiler macros expected to be seen during compilation.
#undef __CUDABE__
#define __CUDACC__
extern "C" {
// Device-side CUDA system calls.
// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
// We need these declarations and wrappers for device-side
// malloc/free/printf calls to work without relying on
// -fcuda-disable-target-call-checks option.
__device__ int vprintf(const char *, const char *);
__device__ void free(void *) __attribute((nothrow));
__device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
// __assertfail() used to have a `noreturn` attribute. Unfortunately that
// contributed to triggering the longstanding bug in ptxas when assert was used
// in sufficiently convoluted code. See
// https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
__device__ void __assertfail(const char *__message, const char *__file,
unsigned __line, const char *__function,
size_t __charSize);
// In order for standard assert() macro on linux to work we need to
// provide device-side __assert_fail()
__device__ static inline void __assert_fail(const char *__message,
const char *__file, unsigned __line,
const char *__function) {
__assertfail(__message, __file, __line, __function, sizeof(char));
}
// Clang will convert printf into vprintf, but we still need a
// device-side declaration for it.
__device__ int printf(const char *, ...);
} // extern "C"
// We also need device-side std::malloc and std::free.
namespace std {
__device__ static inline void free(void *__ptr) { ::free(__ptr); }
__device__ static inline void *malloc(size_t __size) {
return ::malloc(__size);
}
} // namespace std
// Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
// come after we've pulled in the definition of uint3 and dim3.
__device__ inline __cuda_builtin_threadIdx_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
return {x, y, z};
}
__device__ inline __cuda_builtin_blockIdx_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
return {x, y, z};
}
__device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_blockDim_t::operator uint3() const {
return {x, y, z};
}
__device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
return dim3(x, y, z);
}
__device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
return {x, y, z};
}
#include <__clang_cuda_cmath.h>
#include <__clang_cuda_intrinsics.h>
#include <__clang_cuda_complex_builtins.h>
// curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
// mode, giving them their "proper" types of dim3 and uint3. This is
// incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
// hack, force-include the header (nvcc doesn't include it by default) but
// redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
// only used here for the redeclarations of blockDim and threadIdx.)
#pragma push_macro("dim3")
#pragma push_macro("uint3")
#define dim3 __cuda_builtin_blockDim_t
#define uint3 __cuda_builtin_threadIdx_t
#include "curand_mtgp32_kernel.h"
#pragma pop_macro("dim3")
#pragma pop_macro("uint3")
#pragma pop_macro("__USE_FAST_MATH__")
#pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
// CUDA runtime uses this undocumented function to access kernel launch
// configuration. The declaration is in crt/device_functions.h but that file
// includes a lot of other stuff we don't want. Instead, we'll provide our own
// declaration for it here.
#if CUDA_VERSION >= 9020
extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
size_t sharedMem = 0,
void *stream = 0);
#endif
#endif // __CUDA__
#endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__


@ -1,742 +0,0 @@
/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*
 * This header provides in-header implementations for NVCC's built-in
* __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
* built-in is unusual as it's actually a set of function overloads that use the
* first string literal argument as one of the overload parameters.
*/
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
// __nv_tex_surf_handler() provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...) \
::__cuda_tex::__tex_fetch< \
::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
__VA_ARGS__)
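// For example (illustrative expansion; __ret, __obj, __x, __y are placeholder
// names), a call such as
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// expands to
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);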
#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")
// Put all functions into an anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations if they are used from files compiled with -fgpu-rdc. E.g. a
// library and an app using it may be built with different versions of this
// header file.
namespace {
// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {
// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use string literals for that, as that
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the
// function has a collision for any of the texture operations, the compilation
// will fail due to an attempt to redefine a tag with the same value. If the
// header compiles, then the hash function is good enough for the job.
constexpr int __tex_len(const char *s) {
return (s[0] == 0) ? 0
: (s[1] == 0) ? 1
: (s[2] == 0) ? 2
: (s[3] == 0) ? 3
: (s[4] == 0) ? 4
: (s[5] == 0) ? 5
: (s[6] == 0) ? 6
: (s[7] == 0) ? 7
: (s[8] == 0) ? 8
: (s[9] == 0) ? 9
: (s[10] == 0) ? 10
: (s[11] == 0) ? 11
: (s[12] == 0) ? 12
: (s[13] == 0) ? 13
: (s[14] == 0) ? 14
: (s[15] == 0) ? 15
: (s[16] == 0) ? 16
: (s[17] == 0) ? 17
: (s[18] == 0) ? 18
: (s[19] == 0) ? 19
: (s[20] == 0) ? 20
: (s[21] == 0) ? 21
: (s[22] == 0) ? 22
: (s[23] == 0) ? 23
: (s[24] == 0) ? 24
: (s[25] == 0) ? 25
: (s[26] == 0) ? 26
: (s[27] == 0) ? 27
: (s[28] == 0) ? 28
: (s[29] == 0) ? 29
: (s[30] == 0) ? 30
: (s[31] == 0) ? 31
: 32;
}
constexpr int __tex_hash_map(int c) {
return (c == 49) ? 10
: (c == 50) ? 0
: (c == 51) ? 100
: (c == 52) ? 30
: (c == 67) ? 10
: (c == 68) ? 0
: (c == 69) ? 25
: (c == 72) ? 70
: (c == 77) ? 0
: (c == 96) ? 44
: (c == 99) ? 10
: (c == 100) ? 5
: (c == 101) ? 60
: (c == 102) ? 40
: (c == 103) ? 70
: (c == 104) ? 25
: (c == 112) ? 0
: (c == 114) ? 45
: (c == 117) ? 5
: (c == 118) ? 85
: (c == 120) ? 20
: 225;
}
constexpr int __tex_op_hash(const char *str) {
return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
__tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
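// Worked example: for "__tex2D_v2", __tex_len() is 10; str[7] + 1 == '_' + 1
// == 96 maps to 44, str[6] == 'D' maps to 0, str[5] == '2' maps to 0, and the
// last character '2' maps to 0, so __tex_op_hash("__tex2D_v2") == 10 + 44 ==
// 54.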
// Tag type to identify particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of a particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant) \
__Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = char4
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
using __base_t = float;
using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
using __base_t = char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
using __base_t = signed char;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
using __base_t = unsigned char;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
using __base_t = short;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
using __base_t = unsigned short;
using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
using __base_t = int;
using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
using __base_t = unsigned int;
using __fetch_t = uint4;
};
// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
using __base_t = decltype(__T::x);
using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;
// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
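// For example, __L((float __x, float __y)) expands first to
// __STRIP_PARENS(__Args (float __x, float __y)) and then to the bare token
// list `float __x, float __y`, which is how the macros below pass
// parenthesized argument lists through as individual tokens.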
// Construct inline assembly output args.
// Results are stored in a temp var __r; the isResident bool is pointed to by
// __ir.
// Asm args for return values: a 4-element vector.
#define __ASM_OUT(__t) \
("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
template <> \
__device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
__rt __r; \
asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
return __r; \
}
// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
__asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to above, but returns a boolean 'isPresent' value in addition
// to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
__ASM_OUTP("r"), __asm_args) \
__IMPL_F1(float4, float4, __args, \
__asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
__asm_args)
// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
__ASM_OUT("r"), __asm_args) \
__IMPL_F1(float4, uint4, __args, \
__asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
__asm_args)
// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Same, but for sparse ops. Only available on sm_60+.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif
// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
template <> struct __tex_fetch_v4<__op> { \
template <class T> \
__device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
__IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
}
// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
__IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
// Convenience macros which convert a string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
__IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
__asm_args) \
__IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
__asm_args)
// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit the implementation instead of redefining
// a copy.
#define __IMPL_ALIASI(__op, __opn) \
template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
"s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");
__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
(float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");
__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
(float __x, int __layer), "tex.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");
__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
(float __x, int __layer, float __dPdx, float __dPdy),
"tex.grad.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");
__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
(float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");
__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
"tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");
__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
" selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y)));
__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
"tex.grad.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");
__IMPL_S3S("__itex2DGrad_sparse",
(float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
"f"(__dPdy->y)));
__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
(float __x, float __y, int __layer), "tex.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");
__IMPL_S3S("__itex2DLayered_sparse",
(float __x, float __y, int __layer, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy),
"tex.grad.a2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");
__IMPL_S3S(
"__itex2DLayeredGrad_sparse",
(float __x, float __y, int __layer, const float2 *__dPdx,
const float2 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
(float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");
__IMPL_S3S("__itex2DLayeredLod_sparse",
(float __x, float __y, int __layer, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.a2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
(float __x, float __y, float __level), "tex.level.2d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");
__IMPL_S3S("__itex2DLod_sparse",
(float __x, float __y, float __level, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.2d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__level)));
// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr) \
__IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
__IDV("__tex2Dgather_rmnf_v2", variant), \
(float __x, float __y, int __comp), instr, "f32", \
"{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
__IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
__IDV("__tex2Dgather_v2", variant)); \
__IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
(float __x, float __y, unsigned char *__ir, int __comp), \
"{.reg .pred %%p0;\n\t" instr, "f32", \
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
"selp.u16 %4, 1, 0, %%p0; }", \
("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
// Umbrella dispatcher -- calls into specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
template <class __T>
__device__ static float4 __run(cudaTextureObject_t __obj, float __x,
float __y, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
__obj, __x, __y, __comp);
case 1:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
__obj, __x, __y, __comp);
case 2:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
__obj, __x, __y, __comp);
case 3:
return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
__obj, __x, __y, __comp);
}
}
};
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
template <class __T>
__device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
unsigned char *__ir, int __comp) {
switch (__comp) {
case 0:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 1:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 2:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
case 3:
return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
__obj, __x, __y, __ir, __comp);
}
}
};
#endif
// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
"tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");
__IMPL_S3S("__itex3D_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.3d.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");
__IMPL_S3S("__itex3DGrad_sparse",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.grad.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.3d.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");
__IMPL_S3S("__itex3DLod_sparse",
(float __x, float __y, float __z, float __level,
unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.level.3d.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
(float __x, float __y, float __z), "tex.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");
__IMPL_S3S("__itexCubemap_sparse",
(float __x, float __y, float __z, unsigned char *__ir),
"{.reg .pred %%p0;\n\t"
"tex.cube.v4",
"f32",
"{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
"selp.u16 %4, 1, 0, %%p0; }",
("f"(__x), "f"(__y), "f"(__z)));
__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
(float __x, float __y, float __z, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.cube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
"{%8, %9, %10, %10}, {%11, %12, %13, %13};",
("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
"f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");
__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
(float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");
__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
(float __x, float __y, float __z, int __layer, const float4 *__dPdx,
const float4 *__dPdy),
"tex.grad.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
"{%9, %10, %11, %11}, {%12, %13, %14, %14};",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
"f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
"f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");
__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
(float __x, float __y, float __z, int __layer, float __level),
"tex.level.acube.v4", "f32",
"{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");
__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
(float __x, float __y, float __z, float __level), "tex.level.cube.v4",
"f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
// Helper class for extracting a slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
template <int __NElements = sizeof(__DestT) /
sizeof(typename __TypeInfoT<__DestT>::__base_t)>
__device__ static __DestT __run(__SrcT __v);
template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
template <> __device__ static __DestT __run<2>(__SrcT __v) {
return {__v.x, __v.y};
}
template <> __device__ static __DestT __run<3>(__SrcT __v) {
return {__v.x, __v.y, __v.z};
}
template <> __device__ static __DestT __run<4>(__SrcT __v) {
return {__v.x, __v.y, __v.z, __v.w};
}
};
// These are the top-level function overloads the __nv_tex_surf_handler macro
// expands to. Each overload deals with one of the several ways
// __nv_tex_surf_handler is called by CUDA headers. In the end, each of the
// overloads does the same job -- it figures out which `__tex_fetch_v4::__run`
// variant should be used to fetch texture data and which `__convert::__run`
// is needed to convert it into the appropriate return type.
// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
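// Worked example (types only): when fetching into a char2, __T == char2, so
// __FetchT == int4; the texture is read as an int4 and
// __convert<char2, int4>::__run() narrows its first two components into the
// char2 result.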
#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
// and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
cudaTextureObject_t __obj;
asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
return __obj;
}
// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
*__ptr = __convert<__T, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, float4>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType, the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
texture<__DataT, __TexT, cudaReadModeElementType> __handle,
__Args... __args) {
using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
*__ptr = __convert<__RetT, __FetchT>::__run(
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__


@ -1,842 +0,0 @@
/*===---- __clang_hip_cmath.h - HIP cmath decls -----------------------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_CMATH_H__
#define __CLANG_HIP_CMATH_H__
#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__HIPCC_RTC__)
#if defined(__cplusplus)
#include <limits>
#include <type_traits>
#include <utility>
#endif
#include <limits.h>
#include <stdint.h>
#endif // !defined(__HIPCC_RTC__)
#pragma push_macro("__DEVICE__")
#pragma push_macro("__CONSTEXPR__")
#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static __attribute__((always_inline, nothrow))
#define __CONSTEXPR__ constexpr
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#define __CONSTEXPR__
#endif // __OPENMP_AMDGCN__
// Start with functions that cannot be defined by DEF macros below.
#if defined(__cplusplus)
#if defined __OPENMP_AMDGCN__
__DEVICE__ __CONSTEXPR__ float fabs(float __x) { return ::fabsf(__x); }
__DEVICE__ __CONSTEXPR__ float sin(float __x) { return ::sinf(__x); }
__DEVICE__ __CONSTEXPR__ float cos(float __x) { return ::cosf(__x); }
#endif
__DEVICE__ __CONSTEXPR__ double abs(double __x) { return ::fabs(__x); }
__DEVICE__ __CONSTEXPR__ float abs(float __x) { return ::fabsf(__x); }
__DEVICE__ __CONSTEXPR__ long long abs(long long __n) { return ::llabs(__n); }
__DEVICE__ __CONSTEXPR__ long abs(long __n) { return ::labs(__n); }
__DEVICE__ __CONSTEXPR__ float fma(float __x, float __y, float __z) {
return ::fmaf(__x, __y, __z);
}
#if !defined(__HIPCC_RTC__)
// The value returned by fpclassify is platform dependent; therefore, it is
// not supported by hipRTC.
__DEVICE__ __CONSTEXPR__ int fpclassify(float __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
__DEVICE__ __CONSTEXPR__ int fpclassify(double __x) {
return __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL,
FP_ZERO, __x);
}
#endif // !defined(__HIPCC_RTC__)
__DEVICE__ __CONSTEXPR__ float frexp(float __arg, int *__exp) {
return ::frexpf(__arg, __exp);
}
#if defined(__OPENMP_AMDGCN__)
// For OpenMP we work around some old system headers that have non-conforming
// `isinf(float)` and `isnan(float)` implementations that return an `int`. We do
// this by providing two versions of these functions, differing only in the
// return type. To avoid conflicting definitions we disable implicit base
// function generation. That means we will end up with two specializations, one
// per type, but only one has a base function defined by the system header.
#pragma omp begin declare variant match( \
implementation = {extension(disable_implicit_base)})
// FIXME: We lack an extension to customize the mangling of the variants, e.g.,
// add a suffix. This means we would clash with the names of the variants
// (note that we do not create implicit base functions here). To avoid
// this clash we add a new trait to some of them that is always true
// (this is LLVM after all ;)). It will only influence the mangled name
// of the variants inside the inner region and avoid the clash.
#pragma omp begin declare variant match(implementation = {vendor(llvm)})
__DEVICE__ __CONSTEXPR__ int isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ __CONSTEXPR__ int isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ __CONSTEXPR__ int isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ __CONSTEXPR__ int isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ __CONSTEXPR__ int isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ __CONSTEXPR__ int isnan(double __x) { return ::__isnan(__x); }
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
__DEVICE__ __CONSTEXPR__ bool isinf(float __x) { return ::__isinff(__x); }
__DEVICE__ __CONSTEXPR__ bool isinf(double __x) { return ::__isinf(__x); }
__DEVICE__ __CONSTEXPR__ bool isfinite(float __x) { return ::__finitef(__x); }
__DEVICE__ __CONSTEXPR__ bool isfinite(double __x) { return ::__finite(__x); }
__DEVICE__ __CONSTEXPR__ bool isnan(float __x) { return ::__isnanf(__x); }
__DEVICE__ __CONSTEXPR__ bool isnan(double __x) { return ::__isnan(__x); }
#if defined(__OPENMP_AMDGCN__)
#pragma omp end declare variant
#endif // defined(__OPENMP_AMDGCN__)
__DEVICE__ __CONSTEXPR__ bool isgreater(float __x, float __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreater(double __x, double __y) {
return __builtin_isgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreaterequal(float __x, float __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isgreaterequal(double __x, double __y) {
return __builtin_isgreaterequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isless(float __x, float __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isless(double __x, double __y) {
return __builtin_isless(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessequal(float __x, float __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessequal(double __x, double __y) {
return __builtin_islessequal(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessgreater(float __x, float __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool islessgreater(double __x, double __y) {
return __builtin_islessgreater(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isnormal(float __x) {
return __builtin_isnormal(__x);
}
__DEVICE__ __CONSTEXPR__ bool isnormal(double __x) {
return __builtin_isnormal(__x);
}
__DEVICE__ __CONSTEXPR__ bool isunordered(float __x, float __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ __CONSTEXPR__ bool isunordered(double __x, double __y) {
return __builtin_isunordered(__x, __y);
}
__DEVICE__ __CONSTEXPR__ float modf(float __x, float *__iptr) {
return ::modff(__x, __iptr);
}
__DEVICE__ __CONSTEXPR__ float pow(float __base, int __iexp) {
return ::powif(__base, __iexp);
}
__DEVICE__ __CONSTEXPR__ double pow(double __base, int __iexp) {
return ::powi(__base, __iexp);
}
__DEVICE__ __CONSTEXPR__ float remquo(float __x, float __y, int *__quo) {
return ::remquof(__x, __y, __quo);
}
__DEVICE__ __CONSTEXPR__ float scalbln(float __x, long int __n) {
return ::scalblnf(__x, __n);
}
__DEVICE__ __CONSTEXPR__ bool signbit(float __x) { return ::__signbitf(__x); }
__DEVICE__ __CONSTEXPR__ bool signbit(double __x) { return ::__signbit(__x); }
// Notably missing above is nexttoward. We omit it because
// ocml doesn't provide an implementation, and we don't want to be in the
// business of implementing tricky libm functions in this header.
// Other functions.
__DEVICE__ __CONSTEXPR__ _Float16 fma(_Float16 __x, _Float16 __y,
_Float16 __z) {
return __builtin_fmaf16(__x, __y, __z);
}
__DEVICE__ __CONSTEXPR__ _Float16 pow(_Float16 __base, int __iexp) {
return __ocml_pown_f16(__base, __iexp);
}
#ifndef __OPENMP_AMDGCN__
// BEGIN DEF_FUN and HIP_OVERLOAD
// BEGIN DEF_FUN
#pragma push_macro("__DEF_FUN1")
#pragma push_macro("__DEF_FUN2")
#pragma push_macro("__DEF_FUN2_FI")
// Define cmath functions with float argument and returns __retty.
#define __DEF_FUN1(__retty, __func) \
__DEVICE__ __CONSTEXPR__ __retty __func(float __x) { return __func##f(__x); }
// Define cmath functions with two float arguments and returns __retty.
#define __DEF_FUN2(__retty, __func) \
__DEVICE__ __CONSTEXPR__ __retty __func(float __x, float __y) { \
return __func##f(__x, __y); \
}
// Define cmath functions with a float and an int argument and returns __retty.
#define __DEF_FUN2_FI(__retty, __func) \
__DEVICE__ __CONSTEXPR__ __retty __func(float __x, int __y) { \
return __func##f(__x, __y); \
}
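// For example, the first use below, __DEF_FUN1(float, acos), expands to:
//   __DEVICE__ __CONSTEXPR__ float acos(float __x) { return acosf(__x); }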
__DEF_FUN1(float, acos)
__DEF_FUN1(float, acosh)
__DEF_FUN1(float, asin)
__DEF_FUN1(float, asinh)
__DEF_FUN1(float, atan)
__DEF_FUN2(float, atan2)
__DEF_FUN1(float, atanh)
__DEF_FUN1(float, cbrt)
__DEF_FUN1(float, ceil)
__DEF_FUN2(float, copysign)
__DEF_FUN1(float, cos)
__DEF_FUN1(float, cosh)
__DEF_FUN1(float, erf)
__DEF_FUN1(float, erfc)
__DEF_FUN1(float, exp)
__DEF_FUN1(float, exp2)
__DEF_FUN1(float, expm1)
__DEF_FUN1(float, fabs)
__DEF_FUN2(float, fdim)
__DEF_FUN1(float, floor)
__DEF_FUN2(float, fmax)
__DEF_FUN2(float, fmin)
__DEF_FUN2(float, fmod)
__DEF_FUN2(float, hypot)
__DEF_FUN1(int, ilogb)
__DEF_FUN2_FI(float, ldexp)
__DEF_FUN1(float, lgamma)
__DEF_FUN1(float, log)
__DEF_FUN1(float, log10)
__DEF_FUN1(float, log1p)
__DEF_FUN1(float, log2)
__DEF_FUN1(float, logb)
__DEF_FUN1(long long, llrint)
__DEF_FUN1(long long, llround)
__DEF_FUN1(long, lrint)
__DEF_FUN1(long, lround)
__DEF_FUN1(float, nearbyint)
__DEF_FUN2(float, nextafter)
__DEF_FUN2(float, pow)
__DEF_FUN2(float, remainder)
__DEF_FUN1(float, rint)
__DEF_FUN1(float, round)
__DEF_FUN2_FI(float, scalbn)
__DEF_FUN1(float, sin)
__DEF_FUN1(float, sinh)
__DEF_FUN1(float, sqrt)
__DEF_FUN1(float, tan)
__DEF_FUN1(float, tanh)
__DEF_FUN1(float, tgamma)
__DEF_FUN1(float, trunc)
#pragma pop_macro("__DEF_FUN1")
#pragma pop_macro("__DEF_FUN2")
#pragma pop_macro("__DEF_FUN2_FI")
// END DEF_FUN
// BEGIN HIP_OVERLOAD
#pragma push_macro("__HIP_OVERLOAD1")
#pragma push_macro("__HIP_OVERLOAD2")
// __hip_enable_if::type is a type function which returns __T if __B is true.
template <bool __B, class __T = void> struct __hip_enable_if {};
template <class __T> struct __hip_enable_if<true, __T> { typedef __T type; };
namespace __hip {
template <class _Tp> struct is_integral {
enum { value = 0 };
};
template <> struct is_integral<bool> {
enum { value = 1 };
};
template <> struct is_integral<char> {
enum { value = 1 };
};
template <> struct is_integral<signed char> {
enum { value = 1 };
};
template <> struct is_integral<unsigned char> {
enum { value = 1 };
};
template <> struct is_integral<wchar_t> {
enum { value = 1 };
};
template <> struct is_integral<short> {
enum { value = 1 };
};
template <> struct is_integral<unsigned short> {
enum { value = 1 };
};
template <> struct is_integral<int> {
enum { value = 1 };
};
template <> struct is_integral<unsigned int> {
enum { value = 1 };
};
template <> struct is_integral<long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long> {
enum { value = 1 };
};
template <> struct is_integral<long long> {
enum { value = 1 };
};
template <> struct is_integral<unsigned long long> {
enum { value = 1 };
};
// TODO: specialize is_arithmetic<_Float16>.
template <class _Tp> struct is_arithmetic {
enum { value = 0 };
};
template <> struct is_arithmetic<bool> {
enum { value = 1 };
};
template <> struct is_arithmetic<char> {
enum { value = 1 };
};
template <> struct is_arithmetic<signed char> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned char> {
enum { value = 1 };
};
template <> struct is_arithmetic<wchar_t> {
enum { value = 1 };
};
template <> struct is_arithmetic<short> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned short> {
enum { value = 1 };
};
template <> struct is_arithmetic<int> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned int> {
enum { value = 1 };
};
template <> struct is_arithmetic<long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long> {
enum { value = 1 };
};
template <> struct is_arithmetic<long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<unsigned long long> {
enum { value = 1 };
};
template <> struct is_arithmetic<float> {
enum { value = 1 };
};
template <> struct is_arithmetic<double> {
enum { value = 1 };
};
struct true_type {
static const __constant__ bool value = true;
};
struct false_type {
static const __constant__ bool value = false;
};
template <typename __T, typename __U> struct is_same : public false_type {};
template <typename __T> struct is_same<__T, __T> : public true_type {};
template <typename __T> struct add_rvalue_reference { typedef __T &&type; };
template <typename __T> typename add_rvalue_reference<__T>::type declval();
// decltype is only available in C++11 and above.
#if __cplusplus >= 201103L
// __hip_promote
template <class _Tp> struct __numeric_type {
static void __test(...);
static _Float16 __test(_Float16);
static float __test(float);
static double __test(char);
static double __test(int);
static double __test(unsigned);
static double __test(long);
static double __test(unsigned long);
static double __test(long long);
static double __test(unsigned long long);
static double __test(double);
// No support for long double, use double instead.
static double __test(long double);
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <> struct __numeric_type<void> { static const bool value = true; };
template <class _A1, class _A2 = void, class _A3 = void,
bool = __numeric_type<_A1>::value &&__numeric_type<_A2>::value
&&__numeric_type<_A3>::value>
class __promote_imp {
public:
static const bool value = false;
};
template <class _A1, class _A2, class _A3>
class __promote_imp<_A1, _A2, _A3, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
typedef typename __promote_imp<_A3>::type __type3;
public:
typedef decltype(__type1() + __type2() + __type3()) type;
static const bool value = true;
};
template <class _A1, class _A2> class __promote_imp<_A1, _A2, void, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
public:
typedef decltype(__type1() + __type2()) type;
static const bool value = true;
};
template <class _A1> class __promote_imp<_A1, void, void, true> {
public:
typedef typename __numeric_type<_A1>::type type;
static const bool value = true;
};
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
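// Worked example: __promote<float, int>::type is double, because
// __numeric_type<float>::type is float, __numeric_type<int>::type is double,
// and decltype(float() + double()) is double.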
#endif //__cplusplus >= 201103L
} // namespace __hip
// __HIP_OVERLOAD1 is used to resolve function calls with an integer argument
// to avoid a compilation error due to ambiguity. E.g. floor(5) is resolved
// with floor(double).
#define __HIP_OVERLOAD1(__retty, __fn) \
template <typename __T> \
__DEVICE__ __CONSTEXPR__ \
typename __hip_enable_if<__hip::is_integral<__T>::value, __retty>::type \
__fn(__T __x) { \
return ::__fn((double)__x); \
}
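// For example, __HIP_OVERLOAD1(double, floor) defines a template that accepts
// any integral __T and forwards to ::floor((double)__x), so floor(5) resolves
// to exactly one overload instead of being ambiguous.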
// __HIP_OVERLOAD2 is used to resolve function calls with mixed float/double
// or integer arguments to avoid a compilation error due to ambiguity. E.g.
// max(5.0f, 6.0) is resolved with max(double, double).
#if __cplusplus >= 201103L
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ __CONSTEXPR__ typename __hip_enable_if< \
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value, \
typename __hip::__promote<__T1, __T2>::type>::type \
__fn(__T1 __x, __T2 __y) { \
typedef typename __hip::__promote<__T1, __T2>::type __result_type; \
return __fn((__result_type)__x, (__result_type)__y); \
}
#else
#define __HIP_OVERLOAD2(__retty, __fn) \
template <typename __T1, typename __T2> \
__DEVICE__ __CONSTEXPR__ \
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value && \
__hip::is_arithmetic<__T2>::value, \
__retty>::type \
__fn(__T1 __x, __T2 __y) { \
return __fn((double)__x, (double)__y); \
}
#endif
__HIP_OVERLOAD1(double, acos)
__HIP_OVERLOAD1(double, acosh)
__HIP_OVERLOAD1(double, asin)
__HIP_OVERLOAD1(double, asinh)
__HIP_OVERLOAD1(double, atan)
__HIP_OVERLOAD2(double, atan2)
__HIP_OVERLOAD1(double, atanh)
__HIP_OVERLOAD1(double, cbrt)
__HIP_OVERLOAD1(double, ceil)
__HIP_OVERLOAD2(double, copysign)
__HIP_OVERLOAD1(double, cos)
__HIP_OVERLOAD1(double, cosh)
__HIP_OVERLOAD1(double, erf)
__HIP_OVERLOAD1(double, erfc)
__HIP_OVERLOAD1(double, exp)
__HIP_OVERLOAD1(double, exp2)
__HIP_OVERLOAD1(double, expm1)
__HIP_OVERLOAD1(double, fabs)
__HIP_OVERLOAD2(double, fdim)
__HIP_OVERLOAD1(double, floor)
__HIP_OVERLOAD2(double, fmax)
__HIP_OVERLOAD2(double, fmin)
__HIP_OVERLOAD2(double, fmod)
#if !defined(__HIPCC_RTC__)
__HIP_OVERLOAD1(int, fpclassify)
#endif // !defined(__HIPCC_RTC__)
__HIP_OVERLOAD2(double, hypot)
__HIP_OVERLOAD1(int, ilogb)
__HIP_OVERLOAD1(bool, isfinite)
__HIP_OVERLOAD2(bool, isgreater)
__HIP_OVERLOAD2(bool, isgreaterequal)
__HIP_OVERLOAD1(bool, isinf)
__HIP_OVERLOAD2(bool, isless)
__HIP_OVERLOAD2(bool, islessequal)
__HIP_OVERLOAD2(bool, islessgreater)
__HIP_OVERLOAD1(bool, isnan)
__HIP_OVERLOAD1(bool, isnormal)
__HIP_OVERLOAD2(bool, isunordered)
__HIP_OVERLOAD1(double, lgamma)
__HIP_OVERLOAD1(double, log)
__HIP_OVERLOAD1(double, log10)
__HIP_OVERLOAD1(double, log1p)
__HIP_OVERLOAD1(double, log2)
__HIP_OVERLOAD1(double, logb)
__HIP_OVERLOAD1(long long, llrint)
__HIP_OVERLOAD1(long long, llround)
__HIP_OVERLOAD1(long, lrint)
__HIP_OVERLOAD1(long, lround)
__HIP_OVERLOAD1(double, nearbyint)
__HIP_OVERLOAD2(double, nextafter)
__HIP_OVERLOAD2(double, pow)
__HIP_OVERLOAD2(double, remainder)
__HIP_OVERLOAD1(double, rint)
__HIP_OVERLOAD1(double, round)
__HIP_OVERLOAD1(bool, signbit)
__HIP_OVERLOAD1(double, sin)
__HIP_OVERLOAD1(double, sinh)
__HIP_OVERLOAD1(double, sqrt)
__HIP_OVERLOAD1(double, tan)
__HIP_OVERLOAD1(double, tanh)
__HIP_OVERLOAD1(double, tgamma)
__HIP_OVERLOAD1(double, trunc)
// Overload these but don't add them to std; they are not part of cmath.
__HIP_OVERLOAD2(double, max)
__HIP_OVERLOAD2(double, min)
// Additional Overloads that don't quite match HIP_OVERLOAD.
#if __cplusplus >= 201103L
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ __CONSTEXPR__ typename __hip_enable_if<
__hip::is_arithmetic<__T1>::value && __hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
typename __hip::__promote<__T1, __T2, __T3>::type>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
typedef typename __hip::__promote<__T1, __T2, __T3>::type __result_type;
return ::fma((__result_type)__x, (__result_type)__y, (__result_type)__z);
}
#else
template <typename __T1, typename __T2, typename __T3>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value &&
__hip::is_arithmetic<__T3>::value,
double>::type
fma(__T1 __x, __T2 __y, __T3 __z) {
return ::fma((double)__x, (double)__y, (double)__z);
}
#endif
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
frexp(__T __x, int *__exp) {
return ::frexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
ldexp(__T __x, int __exp) {
return ::ldexp((double)__x, __exp);
}
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
modf(__T __x, double *__exp) {
return ::modf((double)__x, __exp);
}
#if __cplusplus >= 201103L
template <typename __T1, typename __T2>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
typename __hip::__promote<__T1, __T2>::type>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
typedef typename __hip::__promote<__T1, __T2>::type __result_type;
return ::remquo((__result_type)__x, (__result_type)__y, __quo);
}
#else
template <typename __T1, typename __T2>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_arithmetic<__T1>::value &&
__hip::is_arithmetic<__T2>::value,
double>::type
remquo(__T1 __x, __T2 __y, int *__quo) {
return ::remquo((double)__x, (double)__y, __quo);
}
#endif
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbln(__T __x, long int __exp) {
return ::scalbln((double)__x, __exp);
}
template <typename __T>
__DEVICE__ __CONSTEXPR__
typename __hip_enable_if<__hip::is_integral<__T>::value, double>::type
scalbn(__T __x, int __exp) {
return ::scalbn((double)__x, __exp);
}
#pragma pop_macro("__HIP_OVERLOAD1")
#pragma pop_macro("__HIP_OVERLOAD2")
// END HIP_OVERLOAD
// END DEF_FUN and HIP_OVERLOAD
#endif // ifndef __OPENMP_AMDGCN__
#endif // defined(__cplusplus)
#ifndef __OPENMP_AMDGCN__
// Define these overloads inside the namespace our standard library uses.
#if !defined(__HIPCC_RTC__)
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif // _LIBCPP_BEGIN_NAMESPACE_STD
// Pull the new overloads we defined above into namespace std.
// using ::abs; - This may be considered for C++.
using ::acos;
using ::acosh;
using ::asin;
using ::asinh;
using ::atan;
using ::atan2;
using ::atanh;
using ::cbrt;
using ::ceil;
using ::copysign;
using ::cos;
using ::cosh;
using ::erf;
using ::erfc;
using ::exp;
using ::exp2;
using ::expm1;
using ::fabs;
using ::fdim;
using ::floor;
using ::fma;
using ::fmax;
using ::fmin;
using ::fmod;
using ::fpclassify;
using ::frexp;
using ::hypot;
using ::ilogb;
using ::isfinite;
using ::isgreater;
using ::isgreaterequal;
using ::isless;
using ::islessequal;
using ::islessgreater;
using ::isnormal;
using ::isunordered;
using ::ldexp;
using ::lgamma;
using ::llrint;
using ::llround;
using ::log;
using ::log10;
using ::log1p;
using ::log2;
using ::logb;
using ::lrint;
using ::lround;
using ::modf;
// using ::nan; - This may be considered for C++.
// using ::nanf; - This may be considered for C++.
// using ::nanl; - This is not yet defined.
using ::nearbyint;
using ::nextafter;
// using ::nexttoward; - Omit this since we do not have a definition.
using ::pow;
using ::remainder;
using ::remquo;
using ::rint;
using ::round;
using ::scalbln;
using ::scalbn;
using ::signbit;
using ::sin;
using ::sinh;
using ::sqrt;
using ::tan;
using ::tanh;
using ::tgamma;
using ::trunc;
// Well this is fun: We need to pull these symbols in for libc++, but we can't
// pull them in with libstdc++, because its ::isinf and ::isnan are different
// from its std::isinf and std::isnan.
#ifndef __GLIBCXX__
using ::isinf;
using ::isnan;
#endif
// Finally, pull the "foobarf" functions that HIP defines into std.
using ::acosf;
using ::acoshf;
using ::asinf;
using ::asinhf;
using ::atan2f;
using ::atanf;
using ::atanhf;
using ::cbrtf;
using ::ceilf;
using ::copysignf;
using ::cosf;
using ::coshf;
using ::erfcf;
using ::erff;
using ::exp2f;
using ::expf;
using ::expm1f;
using ::fabsf;
using ::fdimf;
using ::floorf;
using ::fmaf;
using ::fmaxf;
using ::fminf;
using ::fmodf;
using ::frexpf;
using ::hypotf;
using ::ilogbf;
using ::ldexpf;
using ::lgammaf;
using ::llrintf;
using ::llroundf;
using ::log10f;
using ::log1pf;
using ::log2f;
using ::logbf;
using ::logf;
using ::lrintf;
using ::lroundf;
using ::modff;
using ::nearbyintf;
using ::nextafterf;
// using ::nexttowardf; - Omit this since we do not have a definition.
using ::powf;
using ::remainderf;
using ::remquof;
using ::rintf;
using ::roundf;
using ::scalblnf;
using ::scalbnf;
using ::sinf;
using ::sinhf;
using ::sqrtf;
using ::tanf;
using ::tanhf;
using ::tgammaf;
using ::truncf;
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif // _GLIBCXX_BEGIN_NAMESPACE_VERSION
} // namespace std
#endif // _LIBCPP_END_NAMESPACE_STD
#endif // !defined(__HIPCC_RTC__)
// Define device-side math functions from <ymath.h> on MSVC.
#if !defined(__HIPCC_RTC__)
#if defined(_MSC_VER)
// Before VS2019, `<ymath.h>` is also included in `<limits>` and other headers.
// But, from VS2019 on, it's only included in `<complex>`. We need to include
// `<ymath.h>` here to ensure the C functions declared there won't be marked as
// `__host__` and `__device__` through the `<complex>` wrapper.
#include <ymath.h>
#if defined(__cplusplus)
extern "C" {
#endif // defined(__cplusplus)
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Cosh(double x,
double y) {
return cosh(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FCosh(float x,
float y) {
return coshf(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _Dtest(double *p) {
return fpclassify(*p);
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) short _FDtest(float *p) {
return fpclassify(*p);
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) double _Sinh(double x,
double y) {
return sinh(x) * y;
}
__DEVICE__ __CONSTEXPR__ __attribute__((overloadable)) float _FSinh(float x,
float y) {
return sinhf(x) * y;
}
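// Rough contract of the helpers above (a sketch, assuming MSVC's <ymath.h>
// semantics): _Sinh(x, y) returns y * sinh(x) and _Cosh(x, y) returns
// y * cosh(x), so MSVC's <complex> can request a plain hyperbolic via
//   double __im = _Sinh(2.0, 1.0); // == sinh(2.0)
// while _Dtest/_FDtest serve as its fpclassify hooks for special cases.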
#if defined(__cplusplus)
}
#endif // defined(__cplusplus)
#endif // defined(_MSC_VER)
#endif // !defined(__HIPCC_RTC__)
#endif // ifndef __OPENMP_AMDGCN__
#pragma pop_macro("__DEVICE__")
#pragma pop_macro("__CONSTEXPR__")
#endif // __CLANG_HIP_CMATH_H__

View File

@ -1,353 +0,0 @@
/*===---- __clang_hip_libdevice_declares.h - HIP device library decls -------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_LIBDEVICE_DECLARES_H__
#define __CLANG_HIP_LIBDEVICE_DECLARES_H__
#if !defined(__HIPCC_RTC__) && __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#ifdef __cplusplus
extern "C" {
#endif
// BEGIN FLOAT
__device__ __attribute__((const)) float __ocml_acos_f32(float);
__device__ __attribute__((pure)) float __ocml_acosh_f32(float);
__device__ __attribute__((const)) float __ocml_asin_f32(float);
__device__ __attribute__((pure)) float __ocml_asinh_f32(float);
__device__ __attribute__((const)) float __ocml_atan2_f32(float, float);
__device__ __attribute__((const)) float __ocml_atan_f32(float);
__device__ __attribute__((pure)) float __ocml_atanh_f32(float);
__device__ __attribute__((pure)) float __ocml_cbrt_f32(float);
__device__ __attribute__((const)) float __ocml_ceil_f32(float);
__device__ __attribute__((const)) float __ocml_copysign_f32(float, float);
__device__ float __ocml_cos_f32(float);
__device__ float __ocml_native_cos_f32(float);
__device__ __attribute__((pure)) float __ocml_cosh_f32(float);
__device__ float __ocml_cospi_f32(float);
__device__ float __ocml_i0_f32(float);
__device__ float __ocml_i1_f32(float);
__device__ __attribute__((pure)) float __ocml_erfc_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcinv_f32(float);
__device__ __attribute__((pure)) float __ocml_erfcx_f32(float);
__device__ __attribute__((pure)) float __ocml_erf_f32(float);
__device__ __attribute__((pure)) float __ocml_erfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp10_f32(float);
__device__ __attribute__((pure)) float __ocml_exp2_f32(float);
__device__ __attribute__((pure)) float __ocml_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_native_exp_f32(float);
__device__ __attribute__((pure)) float __ocml_expm1_f32(float);
__device__ __attribute__((const)) float __ocml_fabs_f32(float);
__device__ __attribute__((const)) float __ocml_fdim_f32(float, float);
__device__ __attribute__((const)) float __ocml_floor_f32(float);
__device__ __attribute__((const)) float __ocml_fma_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fmax_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmin_f32(float, float);
__device__ __attribute__((const)) float __ocml_fmod_f32(float, float);
__device__ float __ocml_frexp_f32(float,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) float __ocml_hypot_f32(float, float);
__device__ __attribute__((const)) int __ocml_ilogb_f32(float);
__device__ __attribute__((const)) int __ocml_isfinite_f32(float);
__device__ __attribute__((const)) int __ocml_isinf_f32(float);
__device__ __attribute__((const)) int __ocml_isnan_f32(float);
__device__ float __ocml_j0_f32(float);
__device__ float __ocml_j1_f32(float);
__device__ __attribute__((const)) float __ocml_ldexp_f32(float, int);
__device__ float __ocml_lgamma_f32(float);
__device__ __attribute__((pure)) float __ocml_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log10_f32(float);
__device__ __attribute__((pure)) float __ocml_log1p_f32(float);
__device__ __attribute__((pure)) float __ocml_log2_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log2_f32(float);
__device__ __attribute__((const)) float __ocml_logb_f32(float);
__device__ __attribute__((pure)) float __ocml_log_f32(float);
__device__ __attribute__((pure)) float __ocml_native_log_f32(float);
__device__ float __ocml_modf_f32(float,
__attribute__((address_space(5))) float *);
__device__ __attribute__((const)) float __ocml_nearbyint_f32(float);
__device__ __attribute__((const)) float __ocml_nextafter_f32(float, float);
__device__ __attribute__((const)) float __ocml_len3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_len4_f32(float, float, float,
float);
__device__ __attribute__((pure)) float __ocml_ncdf_f32(float);
__device__ __attribute__((pure)) float __ocml_ncdfinv_f32(float);
__device__ __attribute__((pure)) float __ocml_pow_f32(float, float);
__device__ __attribute__((pure)) float __ocml_pown_f32(float, int);
__device__ __attribute__((pure)) float __ocml_rcbrt_f32(float);
__device__ __attribute__((const)) float __ocml_remainder_f32(float, float);
__device__ float __ocml_remquo_f32(float, float,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) float __ocml_rhypot_f32(float, float);
__device__ __attribute__((const)) float __ocml_rint_f32(float);
__device__ __attribute__((const)) float __ocml_rlen3_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_rlen4_f32(float, float, float,
float);
__device__ __attribute__((const)) float __ocml_round_f32(float);
__device__ __attribute__((pure)) float __ocml_rsqrt_f32(float);
__device__ __attribute__((const)) float __ocml_scalb_f32(float, float);
__device__ __attribute__((const)) float __ocml_scalbn_f32(float, int);
__device__ __attribute__((const)) int __ocml_signbit_f32(float);
__device__ float __ocml_sincos_f32(float,
__attribute__((address_space(5))) float *);
__device__ float __ocml_sincospi_f32(float,
__attribute__((address_space(5))) float *);
__device__ float __ocml_sin_f32(float);
__device__ float __ocml_native_sin_f32(float);
__device__ __attribute__((pure)) float __ocml_sinh_f32(float);
__device__ float __ocml_sinpi_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_f32(float);
__device__ __attribute__((const)) float __ocml_native_sqrt_f32(float);
__device__ float __ocml_tan_f32(float);
__device__ __attribute__((pure)) float __ocml_tanh_f32(float);
__device__ float __ocml_tgamma_f32(float);
__device__ __attribute__((const)) float __ocml_trunc_f32(float);
__device__ float __ocml_y0_f32(float);
__device__ float __ocml_y1_f32(float);
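// Call-pattern sketch (editorial illustration; the helper name is
// hypothetical): wrappers such as __clang_hip_math.h consume the
// pointer-taking declarations above with a private-memory temporary, since
// on AMDGCN private memory is address space 5:
__device__ static inline void __sincosf_sketch(float __x, float *__s,
                                               float *__c) {
  float __tmp;
  // __ocml_sincos_f32 returns sin(__x) and stores cos(__x) via the pointer.
  *__s = __ocml_sincos_f32(
      __x, (__attribute__((address_space(5))) float *)&__tmp);
  *__c = __tmp;
}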
// BEGIN INTRINSICS
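// Naming note (editor's reading of the OCML convention): the _rte/_rtn/_rtp/
// _rtz suffixes below select an IEEE-754 rounding mode: round to nearest even,
// toward negative infinity, toward positive infinity, and toward zero.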
__device__ __attribute__((const)) float __ocml_add_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_add_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_sub_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_mul_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rte_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtn_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtp_f32(float, float);
__device__ __attribute__((const)) float __ocml_div_rtz_f32(float, float);
__device__ __attribute__((const)) float __ocml_sqrt_rte_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtn_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtp_f32(float);
__device__ __attribute__((const)) float __ocml_sqrt_rtz_f32(float);
__device__ __attribute__((const)) float __ocml_fma_rte_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtn_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtp_f32(float, float, float);
__device__ __attribute__((const)) float __ocml_fma_rtz_f32(float, float, float);
// END INTRINSICS
// END FLOAT
// BEGIN DOUBLE
__device__ __attribute__((const)) double __ocml_acos_f64(double);
__device__ __attribute__((pure)) double __ocml_acosh_f64(double);
__device__ __attribute__((const)) double __ocml_asin_f64(double);
__device__ __attribute__((pure)) double __ocml_asinh_f64(double);
__device__ __attribute__((const)) double __ocml_atan2_f64(double, double);
__device__ __attribute__((const)) double __ocml_atan_f64(double);
__device__ __attribute__((pure)) double __ocml_atanh_f64(double);
__device__ __attribute__((pure)) double __ocml_cbrt_f64(double);
__device__ __attribute__((const)) double __ocml_ceil_f64(double);
__device__ __attribute__((const)) double __ocml_copysign_f64(double, double);
__device__ double __ocml_cos_f64(double);
__device__ __attribute__((pure)) double __ocml_cosh_f64(double);
__device__ double __ocml_cospi_f64(double);
__device__ double __ocml_i0_f64(double);
__device__ double __ocml_i1_f64(double);
__device__ __attribute__((pure)) double __ocml_erfc_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcinv_f64(double);
__device__ __attribute__((pure)) double __ocml_erfcx_f64(double);
__device__ __attribute__((pure)) double __ocml_erf_f64(double);
__device__ __attribute__((pure)) double __ocml_erfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_exp10_f64(double);
__device__ __attribute__((pure)) double __ocml_exp2_f64(double);
__device__ __attribute__((pure)) double __ocml_exp_f64(double);
__device__ __attribute__((pure)) double __ocml_expm1_f64(double);
__device__ __attribute__((const)) double __ocml_fabs_f64(double);
__device__ __attribute__((const)) double __ocml_fdim_f64(double, double);
__device__ __attribute__((const)) double __ocml_floor_f64(double);
__device__ __attribute__((const)) double __ocml_fma_f64(double, double, double);
__device__ __attribute__((const)) double __ocml_fmax_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmin_f64(double, double);
__device__ __attribute__((const)) double __ocml_fmod_f64(double, double);
__device__ double __ocml_frexp_f64(double,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) double __ocml_hypot_f64(double, double);
__device__ __attribute__((const)) int __ocml_ilogb_f64(double);
__device__ __attribute__((const)) int __ocml_isfinite_f64(double);
__device__ __attribute__((const)) int __ocml_isinf_f64(double);
__device__ __attribute__((const)) int __ocml_isnan_f64(double);
__device__ double __ocml_j0_f64(double);
__device__ double __ocml_j1_f64(double);
__device__ __attribute__((const)) double __ocml_ldexp_f64(double, int);
__device__ double __ocml_lgamma_f64(double);
__device__ __attribute__((pure)) double __ocml_log10_f64(double);
__device__ __attribute__((pure)) double __ocml_log1p_f64(double);
__device__ __attribute__((pure)) double __ocml_log2_f64(double);
__device__ __attribute__((const)) double __ocml_logb_f64(double);
__device__ __attribute__((pure)) double __ocml_log_f64(double);
__device__ double __ocml_modf_f64(double,
__attribute__((address_space(5))) double *);
__device__ __attribute__((const)) double __ocml_nearbyint_f64(double);
__device__ __attribute__((const)) double __ocml_nextafter_f64(double, double);
__device__ __attribute__((const)) double __ocml_len3_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_len4_f64(double, double, double,
double);
__device__ __attribute__((pure)) double __ocml_ncdf_f64(double);
__device__ __attribute__((pure)) double __ocml_ncdfinv_f64(double);
__device__ __attribute__((pure)) double __ocml_pow_f64(double, double);
__device__ __attribute__((pure)) double __ocml_pown_f64(double, int);
__device__ __attribute__((pure)) double __ocml_rcbrt_f64(double);
__device__ __attribute__((const)) double __ocml_remainder_f64(double, double);
__device__ double __ocml_remquo_f64(double, double,
__attribute__((address_space(5))) int *);
__device__ __attribute__((const)) double __ocml_rhypot_f64(double, double);
__device__ __attribute__((const)) double __ocml_rint_f64(double);
__device__ __attribute__((const)) double __ocml_rlen3_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_rlen4_f64(double, double,
double, double);
__device__ __attribute__((const)) double __ocml_round_f64(double);
__device__ __attribute__((pure)) double __ocml_rsqrt_f64(double);
__device__ __attribute__((const)) double __ocml_scalb_f64(double, double);
__device__ __attribute__((const)) double __ocml_scalbn_f64(double, int);
__device__ __attribute__((const)) int __ocml_signbit_f64(double);
__device__ double __ocml_sincos_f64(double,
__attribute__((address_space(5))) double *);
__device__ double
__ocml_sincospi_f64(double, __attribute__((address_space(5))) double *);
__device__ double __ocml_sin_f64(double);
__device__ __attribute__((pure)) double __ocml_sinh_f64(double);
__device__ double __ocml_sinpi_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_f64(double);
__device__ double __ocml_tan_f64(double);
__device__ __attribute__((pure)) double __ocml_tanh_f64(double);
__device__ double __ocml_tgamma_f64(double);
__device__ __attribute__((const)) double __ocml_trunc_f64(double);
__device__ double __ocml_y0_f64(double);
__device__ double __ocml_y1_f64(double);
// BEGIN INTRINSICS
__device__ __attribute__((const)) double __ocml_add_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_add_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_sub_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_mul_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rte_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtn_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtp_f64(double, double);
__device__ __attribute__((const)) double __ocml_div_rtz_f64(double, double);
__device__ __attribute__((const)) double __ocml_sqrt_rte_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtn_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtp_f64(double);
__device__ __attribute__((const)) double __ocml_sqrt_rtz_f64(double);
__device__ __attribute__((const)) double __ocml_fma_rte_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_fma_rtn_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_fma_rtp_f64(double, double,
double);
__device__ __attribute__((const)) double __ocml_fma_rtz_f64(double, double,
double);
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
__device__ _Float16 __ocml_sin_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_trunc_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
typedef _Float16 __2f16 __attribute__((ext_vector_type(2)));
typedef short __2i16 __attribute__((ext_vector_type(2)));
// We need to match C99's bool and get an i1 in the IR.
#ifdef __cplusplus
typedef bool __ockl_bool;
#else
typedef _Bool __ockl_bool;
#endif
__device__ __attribute__((const)) float __ockl_fdot2(__2f16 a, __2f16 b,
float c, __ockl_bool s);
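// Contract sketch (an assumption, not from the original header):
// __ockl_fdot2(__a, __b, __c, __s) computes __a.x*__b.x + __a.y*__b.y + __c,
// saturating the result when __s is true; the i1 bool lets it lower cleanly
// to the hardware packed-half dot-product instruction.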
__device__ __attribute__((const)) __2f16 __ocml_ceil_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_fabs_2f16(__2f16);
__device__ __2f16 __ocml_cos_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_exp2_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_floor_2f16(__2f16);
__device__ __attribute__((const))
__2f16 __ocml_fma_2f16(__2f16, __2f16, __2f16);
__device__ __attribute__((const)) __2i16 __ocml_isinf_2f16(__2f16);
__device__ __attribute__((const)) __2i16 __ocml_isnan_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
__device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 560
#define __DEPRECATED_SINCE_HIP_560(X) __attribute__((deprecated(X)))
#else
#define __DEPRECATED_SINCE_HIP_560(X)
#endif
// Deprecated; to be removed once the ROCm releases that use it are no longer
// relevant.
__DEPRECATED_SINCE_HIP_560("use ((_Float16)1.0) / ")
__device__ inline _Float16 __llvm_amdgcn_rcp_f16(_Float16 x) {
return ((_Float16)1.0f) / x;
}
__DEPRECATED_SINCE_HIP_560("use ((__2f16)1.0) / ")
__device__ inline __2f16 __llvm_amdgcn_rcp_2f16(__2f16 __x) {
  return ((__2f16)1.0f) / __x;
}
#undef __DEPRECATED_SINCE_HIP_560
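// Migration sketch for the deprecated helpers above: spell the reciprocal
// directly, e.g.
//   _Float16 __r = (_Float16)1.0f / __x; // instead of __llvm_amdgcn_rcp_f16(__x)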
__device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
__device__ __2f16 __ocml_sin_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16);
__device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // __CLANG_HIP_LIBDEVICE_DECLARES_H__

File diff suppressed because it is too large

View File

@ -1,159 +0,0 @@
/*===---- __clang_hip_runtime_wrapper.h - HIP runtime support ---------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
/*
* WARNING: This header is intended to be directly -include'd by
* the compiler and is not supposed to be included by users.
*
*/
#ifndef __CLANG_HIP_RUNTIME_WRAPPER_H__
#define __CLANG_HIP_RUNTIME_WRAPPER_H__
#if __HIP__
#define __host__ __attribute__((host))
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))
#define __managed__ __attribute__((managed))
#if !defined(__cplusplus) || __cplusplus < 201103L
#define nullptr NULL
#endif
#ifdef __cplusplus
extern "C" {
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_pure_virtual(void) {
__builtin_trap();
}
__attribute__((__visibility__("default")))
__attribute__((weak))
__attribute__((noreturn))
__device__ void __cxa_deleted_virtual(void) {
__builtin_trap();
}
}
#endif //__cplusplus
#if !defined(__HIPCC_RTC__)
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#endif // __HIPCC_RTC__
typedef __SIZE_TYPE__ __hip_size_t;
#ifdef __cplusplus
extern "C" {
#endif //__cplusplus
#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
__device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
__device__ void __ockl_dm_dealloc(unsigned long long __addr);
#if __has_feature(address_sanitizer)
__device__ unsigned long long __asan_malloc_impl(unsigned long long __size,
unsigned long long __pc);
__device__ void __asan_free_impl(unsigned long long __addr,
unsigned long long __pc);
__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
return (void *)__asan_malloc_impl(__size, __pc);
}
__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
__asan_free_impl((unsigned long long)__ptr, __pc);
}
#else // __has_feature(address_sanitizer)
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return (void *) __ockl_dm_alloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__ockl_dm_dealloc((unsigned long long)__ptr);
}
#endif // __has_feature(address_sanitizer)
#else // HIP version check
#if __HIP_ENABLE_DEVICE_MALLOC__
__device__ void *__hip_malloc(__hip_size_t __size);
__device__ void *__hip_free(void *__ptr);
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
return __hip_malloc(__size);
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__hip_free(__ptr);
}
#else // __HIP_ENABLE_DEVICE_MALLOC__
__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
__builtin_trap();
return (void *)0;
}
__attribute__((weak)) inline __device__ void free(void *__ptr) {
__builtin_trap();
}
#endif // __HIP_ENABLE_DEVICE_MALLOC__
#endif // HIP version check
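// Usage sketch (illustrative only): with the definitions above, device code
// may heap-allocate, e.g.
//   __global__ void __k(int *__out) {
//     int *__p = (int *)malloc(sizeof(int));
//     if (__p) { *__out = *__p = 42; free(__p); }
//   }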
#ifdef __cplusplus
} // extern "C"
#endif //__cplusplus
#if !defined(__HIPCC_RTC__)
#include <cmath>
#include <cstdlib>
#include <stdlib.h>
#if __has_include("hip/hip_version.h")
#include "hip/hip_version.h"
#endif // __has_include("hip/hip_version.h")
#else
typedef __SIZE_TYPE__ size_t;
// Define macros needed to declare the HIP device APIs without the standard
// C/C++ headers. This is for readability, so that these APIs can be written
// the same way as in the non-hipRTC case. These macros need to be popped so
// that they do not pollute users' namespaces.
#pragma push_macro("NULL")
#pragma push_macro("uint32_t")
#pragma push_macro("uint64_t")
#pragma push_macro("CHAR_BIT")
#pragma push_macro("INT_MAX")
#define NULL (void *)0
#define uint32_t __UINT32_TYPE__
#define uint64_t __UINT64_TYPE__
#define CHAR_BIT __CHAR_BIT__
#define INT_MAX __INT_MAX__
#endif // __HIPCC_RTC__
#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
#include <__clang_hip_stdlib.h>
#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
#else
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_hip_cmath.h>
#include <__clang_cuda_complex_builtins.h>
#include <algorithm>
#include <complex>
#include <new>
#endif // __HIPCC_RTC__
#define __CLANG_HIP_RUNTIME_WRAPPER_INCLUDED__ 1
#if defined(__HIPCC_RTC__)
#pragma pop_macro("NULL")
#pragma pop_macro("uint32_t")
#pragma pop_macro("uint64_t")
#pragma pop_macro("CHAR_BIT")
#pragma pop_macro("INT_MAX")
#endif // __HIPCC_RTC__
#endif // __HIP__
#endif // __CLANG_HIP_RUNTIME_WRAPPER_H__

View File

@ -1,43 +0,0 @@
/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
*
* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
* See https://llvm.org/LICENSE.txt for license information.
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_HIP_STDLIB_H__
#define __CLANG_HIP_STDLIB_H__
#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
#error "This file is for HIP and OpenMP AMDGCN device compilation only."
#endif
#if !defined(__cplusplus)
#include <limits.h>
#ifdef __OPENMP_AMDGCN__
#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
#else
#define __DEVICE__ static __device__ inline __attribute__((always_inline))
#endif
__DEVICE__
int abs(int __x) {
int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long labs(long __x) {
long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
__DEVICE__
long long llabs(long long __x) {
long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
return (__x ^ __sgn) - __sgn;
}
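// How the branchless abs above works: the arithmetic right shift smears the
// sign bit, so __sgn is 0 for non-negative __x and -1 (all ones) otherwise.
// Then (__x ^ __sgn) - __sgn is __x when __sgn == 0, and ~__x + 1 == -__x
// when __sgn == -1. Worked example: abs(-5) -> __sgn = -1,
// (-5 ^ -1) - (-1) = 4 + 1 = 5. (As with libc abs, INT_MIN stays INT_MIN.)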
#endif // !defined(__cplusplus)
#endif // __CLANG_HIP_STDLIB_H__

View File

@ -1,116 +0,0 @@
/*===---- algorithm - CUDA wrapper for <algorithm> -------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM
#define __CLANG_CUDA_WRAPPERS_ALGORITHM
// This header defines __device__ overloads of std::min/max.
//
// Ideally we'd declare these functions only if we're <= C++11. In C++14,
// these functions are constexpr, and so are implicitly __host__ __device__.
//
// However, the compiler being in C++14 mode does not imply that the standard
// library supports C++14. There is no macro we can test to check that the
// stdlib has constexpr std::min/max. Thus we have to unconditionally define
// our device overloads.
//
// A host+device function cannot be overloaded, and a constexpr function
// implicitly becomes host+device if there's no explicit host or device
// overload preceding it. So the simple thing to do would be to declare our
// device min/max overloads, and then #include_next <algorithm>. This way our
// device overloads would come first, and so if we have a C++14 stdlib, its
// min/max won't become host+device and conflict with our device overloads.
//
// But that also doesn't work. libstdc++ is evil and declares std::min/max in
// an internal header that is included *before* <algorithm>. Thus by the time
// we're inside of this file, std::min/max may already have been declared, and
// thus we can't prevent them from becoming host+device if they're constexpr.
//
// Therefore we perpetrate the following hack: We mark our __device__ overloads
// with __attribute__((enable_if(true, ""))). This causes the signature of the
// function to change without changing anything else about it. (Except that
// overload resolution will prefer it over the __host__ __device__ version
// rather than considering them equally good).
#include_next <algorithm>
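// Illustration of the enable_if trick (a sketch, not normative): in device
// code, a call such as
//   std::min(1, 2);
// sees both a stdlib std::min (possibly constexpr, hence implicitly __host__
// __device__ in C++14) and the __attribute__((enable_if(true, ""))) __device__
// overloads below; overload resolution prefers the enable_if candidates, so
// the call is unambiguous and stays device-safe.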
// We need to define these overloads in exactly the namespace our standard
// library uses (including the right inline namespace), otherwise they won't be
// picked up by other functions in the standard library (e.g. functions in
// <complex>). Thus the ugliness below.
#ifdef _LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_BEGIN_NAMESPACE_STD
#else
namespace std {
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_BEGIN_NAMESPACE_VERSION
#endif
#endif
#pragma push_macro("_CPP14_CONSTEXPR")
#if __cplusplus >= 201402L
#define _CPP14_CONSTEXPR constexpr
#else
#define _CPP14_CONSTEXPR
#endif
template <class __T, class __Cmp>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__a, __b) ? __b : __a;
}
template <class __T>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
max(const __T &__a, const __T &__b) {
return __a < __b ? __b : __a;
}
template <class __T, class __Cmp>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b, __Cmp __cmp) {
return __cmp(__b, __a) ? __b : __a;
}
template <class __T>
__attribute__((enable_if(true, "")))
inline _CPP14_CONSTEXPR __host__ __device__ const __T &
min(const __T &__a, const __T &__b) {
return __b < __a ? __b : __a;
}
#pragma pop_macro("_CPP14_CONSTEXPR")
#ifdef _LIBCPP_END_NAMESPACE_STD
_LIBCPP_END_NAMESPACE_STD
#else
#ifdef _GLIBCXX_BEGIN_NAMESPACE_VERSION
_GLIBCXX_END_NAMESPACE_VERSION
#endif
} // namespace std
#endif
#endif // __CLANG_CUDA_WRAPPERS_ALGORITHM

View File

@ -1,9 +0,0 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid compilation error,
// temporarily unset __noinline__ when we include affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/basic_string.h"
#pragma pop_macro("__noinline__")
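// Why this matters (a sketch of the clash): CUDA's crt/host_defines.h does
// roughly
//   #define __noinline__ __attribute__((noinline))
// so libstdc++'s __attribute__((__noinline__)) would macro-expand into
// __attribute__((__attribute__((noinline)))) and fail to parse.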

View File

@ -1,9 +0,0 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid compilation error,
// temporarily unset __noinline__ when we include affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/basic_string.tcc"
#pragma pop_macro("__noinline__")

View File

@ -1,9 +0,0 @@
// CUDA headers define __noinline__ which interferes with libstdc++'s use of
// `__attribute((__noinline__))`. In order to avoid compilation error,
// temporarily unset __noinline__ when we include affected libstdc++ header.
#pragma push_macro("__noinline__")
#undef __noinline__
#include_next "bits/shared_ptr_base.h"
#pragma pop_macro("__noinline__")

View File

@ -1,90 +0,0 @@
/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_CMATH
#define __CLANG_CUDA_WRAPPERS_CMATH
#include_next <cmath>
#if defined(_LIBCPP_STD_VER)
// libc++ will need long double variants of these functions, but CUDA does not
// provide them. We'll provide their declarations, which should allow the
// headers to parse, but would not allow accidental use of them on a GPU.
__attribute__((device)) long double logb(long double);
__attribute__((device)) long double scalbn(long double, int);
namespace std {
// For __constexpr_fmin/fmax we only need device-side overloads before C++14,
// where they are not constexpr.
#if _LIBCPP_STD_VER < 14
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
return __builtin_fmaxf(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
return __builtin_fmax(__x, __y);
}
__attribute__((device))
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
return __builtin_fmaxl(__x, __y);
}
template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
using __result_type = typename __promote<_Tp, _Up>::type;
return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
}
#endif // _LIBCPP_STD_VER < 14
// For logb/scalbn templates we must always provide device overloads because
// libc++ implementation uses __builtin_XXX which gets translated into a libcall
// which we can't handle on GPU. We need to forward those to CUDA-provided
// implementations.
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
return ::logb(__x);
}
template <class _Tp>
__attribute__((device))
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
return ::scalbn(__x, __exp);
}
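// Net effect (sketch): a device-side instantiation such as
//   std::__constexpr_scalbn(1.0f, 3) // forwards to CUDA's ::scalbn
// stays an ordinary function call instead of a __builtin_scalbn libcall that
// the GPU backend could not resolve.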
} // namespace std
#endif // _LIBCPP_STD_VER
#endif // include guard

View File

@ -1,90 +0,0 @@
/*===---- complex - CUDA wrapper for <complex> ------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_COMPLEX
#define __CLANG_CUDA_WRAPPERS_COMPLEX
// Wrapper around <complex> that forces its functions to be __host__
// __device__.
// First, include host-only headers we think are likely to be included by
// <complex>, so that the pragma below only applies to <complex> itself.
#if __cplusplus >= 201103L
#include <type_traits>
#endif
#include <stdexcept>
#include <cmath>
#include <sstream>
// Next, include our <algorithm> wrapper, to ensure that device overloads of
// std::min/max are available.
#include <algorithm>
#pragma clang force_cuda_host_device begin
// When compiling for device, ask libstdc++ to use its own implementations of
// complex functions, rather than calling builtins (which resolve to library
// functions that don't exist when compiling CUDA device code).
//
// This is a little dicey, because it causes libstdc++ to define a different
// set of overloads on host and device.
//
//   // Present only when compiling for host.
//   __host__ __device__ complex<float> sin(const complex<float>& x) {
//     return __builtin_csinf(x);
//   }
//
//   // Present when compiling for host and for device.
//   template <typename T>
//   __host__ __device__ complex<T> sin(const complex<T>& x) {
//     return complex<T>(sin(x.real()) * cosh(x.imag()),
//                       cos(x.real()) * sinh(x.imag()));
//   }
//
// This is safe because when compiling for device, all function calls in
// __host__ code to sin() will still resolve to *something*, even if they don't
// resolve to the same function as they resolve to when compiling for host. We
// don't care that they don't resolve to the right function because we won't
// codegen this host code when compiling for device.
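// Net effect (a sketch): on device, with the C99-complex paths disabled below,
//   std::complex<float> __z(0.f, 1.f);
//   std::complex<float> __w = std::sin(__z);
// instantiates libstdc++'s generic template rather than calling
// __builtin_csinf, which would lower to an unavailable libcall.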
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma push_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#define _GLIBCXX_USE_C99_COMPLEX 0
#define _GLIBCXX_USE_C99_COMPLEX_TR1 0
// Work around a compatibility issue with libstdc++ 11.1.0
// https://bugs.llvm.org/show_bug.cgi?id=50383
#pragma push_macro("__failed_assertion")
#if _GLIBCXX_RELEASE == 11
#define __failed_assertion __cuda_failed_assertion
#endif
#include_next <complex>
#pragma pop_macro("__failed_assertion")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX_TR1")
#pragma pop_macro("_GLIBCXX_USE_C99_COMPLEX")
#pragma clang force_cuda_host_device end
#endif // include guard

View File

@ -1,106 +0,0 @@
/*===---- new - CUDA wrapper for <new> -------------------------------------===
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __CLANG_CUDA_WRAPPERS_NEW
#define __CLANG_CUDA_WRAPPERS_NEW
#include_next <new>
#if !defined(__device__)
// The header has been included too early from the standard C++ library
// and CUDA-specific macros are not available yet.
// Undo the include guard and try again later.
#undef __CLANG_CUDA_WRAPPERS_NEW
#else
#pragma push_macro("CUDA_NOEXCEPT")
#if __cplusplus >= 201103L
#define CUDA_NOEXCEPT noexcept
#else
#define CUDA_NOEXCEPT
#endif
// Device overrides for non-placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__ size) {
if (size == 0) {
size = 1;
}
return ::malloc(size);
}
__device__ inline void *operator new(__SIZE_TYPE__ size,
const std::nothrow_t &) CUDA_NOEXCEPT {
return ::operator new(size);
}
__device__ inline void *operator new[](__SIZE_TYPE__ size) {
return ::operator new(size);
}
__device__ inline void *operator new[](__SIZE_TYPE__ size,
                                       const std::nothrow_t &) CUDA_NOEXCEPT {
  return ::operator new(size);
}
__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
if (ptr) {
::free(ptr);
}
}
__device__ inline void operator delete(void *ptr,
const std::nothrow_t &) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void *ptr,
const std::nothrow_t &) CUDA_NOEXCEPT {
::operator delete(ptr);
}
// Sized delete, C++14 only.
#if __cplusplus >= 201402L
__device__ inline void operator delete(void *ptr,
__SIZE_TYPE__ size) CUDA_NOEXCEPT {
::operator delete(ptr);
}
__device__ inline void operator delete[](void *ptr,
__SIZE_TYPE__ size) CUDA_NOEXCEPT {
::operator delete(ptr);
}
#endif
// Device overrides for placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
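// Usage sketch (illustrative only): with the overrides above, device code can
// write, e.g.,
//   __device__ int *__make() { return new int(7); } // backed by ::malloc
//   __device__ void __drop(int *__p) { delete __p; } // backed by ::free
// and placement new, new (__buf) T, simply returns __buf.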
#pragma pop_macro("CUDA_NOEXCEPT")
#endif // __device__
#endif // include guard

View File

@ -1,829 +0,0 @@
//===----- opencl-c-base.h - OpenCL C language base definitions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _OPENCL_BASE_H_
#define _OPENCL_BASE_H_
// Define extension macros
#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// For SPIR and SPIR-V all extensions are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define cl_khr_subgroup_extended_types 1
#define cl_khr_subgroup_non_uniform_vote 1
#define cl_khr_subgroup_ballot 1
#define cl_khr_subgroup_non_uniform_arithmetic 1
#define cl_khr_subgroup_shuffle 1
#define cl_khr_subgroup_shuffle_relative 1
#define cl_khr_subgroup_clustered_reduce 1
#define cl_khr_subgroup_rotate 1
#define cl_khr_extended_bit_ops 1
#define cl_khr_integer_dot_product 1
#define __opencl_c_integer_dot_product_input_4x8bit 1
#define __opencl_c_integer_dot_product_input_4x8bit_packed 1
#define cl_ext_float_atomics 1
#ifdef cl_khr_fp16
#define __opencl_c_ext_fp16_global_atomic_load_store 1
#define __opencl_c_ext_fp16_local_atomic_load_store 1
#define __opencl_c_ext_fp16_global_atomic_add 1
#define __opencl_c_ext_fp16_local_atomic_add 1
#define __opencl_c_ext_fp16_global_atomic_min_max 1
#define __opencl_c_ext_fp16_local_atomic_min_max 1
#endif
#ifdef cl_khr_fp64
#define __opencl_c_ext_fp64_global_atomic_add 1
#define __opencl_c_ext_fp64_local_atomic_add 1
#define __opencl_c_ext_fp64_global_atomic_min_max 1
#define __opencl_c_ext_fp64_local_atomic_min_max 1
#endif
#define __opencl_c_ext_fp32_global_atomic_add 1
#define __opencl_c_ext_fp32_local_atomic_add 1
#define __opencl_c_ext_fp32_global_atomic_min_max 1
#define __opencl_c_ext_fp32_local_atomic_min_max 1
#define __opencl_c_ext_image_raw10_raw12 1
#define cl_khr_kernel_clock 1
#define __opencl_c_kernel_clock_scope_device 1
#define __opencl_c_kernel_clock_scope_work_group 1
#define __opencl_c_kernel_clock_scope_sub_group 1
#endif // defined(__SPIR__) || defined(__SPIRV__)
#endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
// Define feature macros for OpenCL C 2.0
#if (__OPENCL_CPP_VERSION__ == 100 || __OPENCL_C_VERSION__ == 200)
#define __opencl_c_pipes 1
#define __opencl_c_generic_address_space 1
#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_acq_rel 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_device_enqueue 1
#define __opencl_c_read_write_images 1
#define __opencl_c_program_scope_global_variables 1
#define __opencl_c_images 1
#endif
// Define header-only feature macros for OpenCL C 3.0.
#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
// For the SPIR and SPIR-V targets all features are supported.
#if defined(__SPIR__) || defined(__SPIRV__)
#define __opencl_c_work_group_collective_functions 1
#define __opencl_c_atomic_order_seq_cst 1
#define __opencl_c_atomic_scope_device 1
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__) || defined(__SPIRV__)
// Undefine any feature macros that have been explicitly disabled using
// an __undef_<feature> macro.
#ifdef __undef___opencl_c_work_group_collective_functions
#undef __opencl_c_work_group_collective_functions
#endif
#ifdef __undef___opencl_c_atomic_order_seq_cst
#undef __opencl_c_atomic_order_seq_cst
#endif
#ifdef __undef___opencl_c_atomic_scope_device
#undef __opencl_c_atomic_scope_device
#endif
#ifdef __undef___opencl_c_atomic_scope_all_devices
#undef __opencl_c_atomic_scope_all_devices
#endif
#ifdef __undef___opencl_c_read_write_images
#undef __opencl_c_read_write_images
#endif
#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
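// Illustration (an assumption about intended use): a build can mask a
// header-only feature by predefining the matching __undef_ macro, e.g.
//   clang -cl-std=CL3.0 -D__undef___opencl_c_read_write_images ...
// which makes the block above #undef __opencl_c_read_write_images again.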
#if !defined(__opencl_c_generic_address_space)
// Internal feature macro to provide named (global, local, private) address
// space overloads for builtin functions that take a pointer argument.
#define __opencl_c_named_address_space_builtins 1
#endif // !defined(__opencl_c_generic_address_space)
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
// Internal feature macro to provide subgroup builtins.
#define __opencl_subgroup_builtins 1
#endif
// built-in scalar data types:
/**
* An unsigned 8-bit integer.
*/
typedef unsigned char uchar;
/**
* An unsigned 16-bit integer.
*/
typedef unsigned short ushort;
/**
* An unsigned 32-bit integer.
*/
typedef unsigned int uint;
/**
* An unsigned 64-bit integer.
*/
typedef unsigned long ulong;
/**
* The unsigned integer type of the result of the sizeof operator. This
* is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS
* defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if
* CL_DEVICE_ADDRESS_BITS is 64-bits.
*/
typedef __SIZE_TYPE__ size_t;
/**
* A signed integer type that is the result of subtracting two pointers.
* This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS
* defined in table 4.3 is 32-bits and is a 64-bit signed integer if
* CL_DEVICE_ADDRESS_BITS is 64-bits.
*/
typedef __PTRDIFF_TYPE__ ptrdiff_t;
/**
* A signed integer type with the property that any valid pointer to
* void can be converted to this type, then converted back to pointer
* to void, and the result will compare equal to the original pointer.
*/
typedef __INTPTR_TYPE__ intptr_t;
/**
* An unsigned integer type with the property that any valid pointer to
* void can be converted to this type, then converted back to pointer
* to void, and the result will compare equal to the original pointer.
*/
typedef __UINTPTR_TYPE__ uintptr_t;
// built-in vector data types:
typedef char char2 __attribute__((ext_vector_type(2)));
typedef char char3 __attribute__((ext_vector_type(3)));
typedef char char4 __attribute__((ext_vector_type(4)));
typedef char char8 __attribute__((ext_vector_type(8)));
typedef char char16 __attribute__((ext_vector_type(16)));
typedef uchar uchar2 __attribute__((ext_vector_type(2)));
typedef uchar uchar3 __attribute__((ext_vector_type(3)));
typedef uchar uchar4 __attribute__((ext_vector_type(4)));
typedef uchar uchar8 __attribute__((ext_vector_type(8)));
typedef uchar uchar16 __attribute__((ext_vector_type(16)));
typedef short short2 __attribute__((ext_vector_type(2)));
typedef short short3 __attribute__((ext_vector_type(3)));
typedef short short4 __attribute__((ext_vector_type(4)));
typedef short short8 __attribute__((ext_vector_type(8)));
typedef short short16 __attribute__((ext_vector_type(16)));
typedef ushort ushort2 __attribute__((ext_vector_type(2)));
typedef ushort ushort3 __attribute__((ext_vector_type(3)));
typedef ushort ushort4 __attribute__((ext_vector_type(4)));
typedef ushort ushort8 __attribute__((ext_vector_type(8)));
typedef ushort ushort16 __attribute__((ext_vector_type(16)));
typedef int int2 __attribute__((ext_vector_type(2)));
typedef int int3 __attribute__((ext_vector_type(3)));
typedef int int4 __attribute__((ext_vector_type(4)));
typedef int int8 __attribute__((ext_vector_type(8)));
typedef int int16 __attribute__((ext_vector_type(16)));
typedef uint uint2 __attribute__((ext_vector_type(2)));
typedef uint uint3 __attribute__((ext_vector_type(3)));
typedef uint uint4 __attribute__((ext_vector_type(4)));
typedef uint uint8 __attribute__((ext_vector_type(8)));
typedef uint uint16 __attribute__((ext_vector_type(16)));
typedef long long2 __attribute__((ext_vector_type(2)));
typedef long long3 __attribute__((ext_vector_type(3)));
typedef long long4 __attribute__((ext_vector_type(4)));
typedef long long8 __attribute__((ext_vector_type(8)));
typedef long long16 __attribute__((ext_vector_type(16)));
typedef ulong ulong2 __attribute__((ext_vector_type(2)));
typedef ulong ulong3 __attribute__((ext_vector_type(3)));
typedef ulong ulong4 __attribute__((ext_vector_type(4)));
typedef ulong ulong8 __attribute__((ext_vector_type(8)));
typedef ulong ulong16 __attribute__((ext_vector_type(16)));
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float3 __attribute__((ext_vector_type(3)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef float float8 __attribute__((ext_vector_type(8)));
typedef float float16 __attribute__((ext_vector_type(16)));
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
typedef half half2 __attribute__((ext_vector_type(2)));
typedef half half3 __attribute__((ext_vector_type(3)));
typedef half half4 __attribute__((ext_vector_type(4)));
typedef half half8 __attribute__((ext_vector_type(8)));
typedef half half16 __attribute__((ext_vector_type(16)));
#endif
#ifdef cl_khr_fp64
#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
typedef double double2 __attribute__((ext_vector_type(2)));
typedef double double3 __attribute__((ext_vector_type(3)));
typedef double double4 __attribute__((ext_vector_type(4)));
typedef double double8 __attribute__((ext_vector_type(8)));
typedef double double16 __attribute__((ext_vector_type(16)));
#endif
// An internal alias for half, for use by OpenCLBuiltins.td.
#define __half half
#if defined(__OPENCL_CPP_VERSION__)
#define NULL nullptr
#elif defined(__OPENCL_C_VERSION__)
#define NULL ((void*)0)
#endif
/**
* Value of maximum non-infinite single-precision floating-point
* number.
*/
#define MAXFLOAT 0x1.fffffep127f
/**
* A positive float constant expression. HUGE_VALF evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VALF (__builtin_huge_valf())
/**
* A positive double constant expression. HUGE_VAL evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VAL (__builtin_huge_val())
/**
* A constant expression of type float representing positive or
* unsigned infinity.
*/
#define INFINITY (__builtin_inff())
/**
* A constant expression of type float representing a quiet NaN.
*/
#define NAN as_float(INT_MAX)
#define FP_ILOGB0 INT_MIN
#define FP_ILOGBNAN INT_MAX
#define FLT_DIG 6
#define FLT_MANT_DIG 24
#define FLT_MAX_10_EXP +38
#define FLT_MAX_EXP +128
#define FLT_MIN_10_EXP -37
#define FLT_MIN_EXP -125
#define FLT_RADIX 2
#define FLT_MAX 0x1.fffffep127f
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
#define M_E_F 2.71828182845904523536028747135266250f
#define M_LOG2E_F 1.44269504088896340735992468100189214f
#define M_LOG10E_F 0.434294481903251827651128918916605082f
#define M_LN2_F 0.693147180559945309417232121458176568f
#define M_LN10_F 2.30258509299404568401799145468436421f
#define M_PI_F 3.14159265358979323846264338327950288f
#define M_PI_2_F 1.57079632679489661923132169163975144f
#define M_PI_4_F 0.785398163397448309615660845819875721f
#define M_1_PI_F 0.318309886183790671537767526745028724f
#define M_2_PI_F 0.636619772367581343075535053490057448f
#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f
#define M_SQRT2_F 1.41421356237309504880168872420969808f
#define M_SQRT1_2_F 0.707106781186547524400844362104849039f
#define DBL_DIG 15
#define DBL_MANT_DIG 53
#define DBL_MAX_10_EXP +308
#define DBL_MAX_EXP +1024
#define DBL_MIN_10_EXP -307
#define DBL_MIN_EXP -1021
#define DBL_RADIX 2
#define DBL_MAX 0x1.fffffffffffffp1023
#define DBL_MIN 0x1.0p-1022
#define DBL_EPSILON 0x1.0p-52
#define M_E 0x1.5bf0a8b145769p+1
#define M_LOG2E 0x1.71547652b82fep+0
#define M_LOG10E 0x1.bcb7b1526e50ep-2
#define M_LN2 0x1.62e42fefa39efp-1
#define M_LN10 0x1.26bb1bbb55516p+1
#define M_PI 0x1.921fb54442d18p+1
#define M_PI_2 0x1.921fb54442d18p+0
#define M_PI_4 0x1.921fb54442d18p-1
#define M_1_PI 0x1.45f306dc9c883p-2
#define M_2_PI 0x1.45f306dc9c883p-1
#define M_2_SQRTPI 0x1.20dd750429b6dp+0
#define M_SQRT2 0x1.6a09e667f3bcdp+0
#define M_SQRT1_2 0x1.6a09e667f3bcdp-1
#ifdef cl_khr_fp16
#define HALF_DIG 3
#define HALF_MANT_DIG 11
#define HALF_MAX_10_EXP +4
#define HALF_MAX_EXP +16
#define HALF_MIN_10_EXP -4
#define HALF_MIN_EXP -13
#define HALF_RADIX 2
#define HALF_MAX ((0x1.ffcp15h))
#define HALF_MIN ((0x1.0p-14h))
#define HALF_EPSILON ((0x1.0p-10h))
#define M_E_H 2.71828182845904523536028747135266250h
#define M_LOG2E_H 1.44269504088896340735992468100189214h
#define M_LOG10E_H 0.434294481903251827651128918916605082h
#define M_LN2_H 0.693147180559945309417232121458176568h
#define M_LN10_H 2.30258509299404568401799145468436421h
#define M_PI_H 3.14159265358979323846264338327950288h
#define M_PI_2_H 1.57079632679489661923132169163975144h
#define M_PI_4_H 0.785398163397448309615660845819875721h
#define M_1_PI_H 0.318309886183790671537767526745028724h
#define M_2_PI_H 0.636619772367581343075535053490057448h
#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h
#define M_SQRT2_H 1.41421356237309504880168872420969808h
#define M_SQRT1_2_H 0.707106781186547524400844362104849039h
#endif //cl_khr_fp16
#define CHAR_BIT 8
#define SCHAR_MAX 127
#define SCHAR_MIN (-128)
#define UCHAR_MAX 255
#define CHAR_MAX SCHAR_MAX
#define CHAR_MIN SCHAR_MIN
#define USHRT_MAX 65535
#define SHRT_MAX 32767
#define SHRT_MIN (-32768)
#define UINT_MAX 0xffffffff
#define INT_MAX 2147483647
#define INT_MIN (-2147483647-1)
#define ULONG_MAX 0xffffffffffffffffUL
#define LONG_MAX 0x7fffffffffffffffL
#define LONG_MIN (-0x7fffffffffffffffL-1)
// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
typedef uint cl_mem_fence_flags;
/**
* Queue a memory fence to ensure correct
* ordering of memory operations to local memory
*/
#define CLK_LOCAL_MEM_FENCE 0x01
/**
* Queue a memory fence to ensure correct
* ordering of memory operations to global memory
*/
#define CLK_GLOBAL_MEM_FENCE 0x02
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
#if defined(__opencl_c_atomic_scope_all_devices)
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
memory_scope_all_devices = memory_scope_all_svm_devices,
#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
#endif // defined(__opencl_c_atomic_scope_all_devices)
/**
* Subgroups have different requirements on forward progress, so just test
* all the relevant macros.
 * CL 3.0 sub-groups: "they are not guaranteed to make independent forward progress"
 * KHR subgroups: "Subgroups within a workgroup are independent, make forward progress with respect to each other"
*/
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;
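// Illustrative sketch: since OpenCL 2.0, work_group_barrier() also takes an
// explicit scope, e.g. to make global-memory writes visible device-wide:
//
// Example:
//   work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device);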
/**
* Queue a memory fence to ensure correct ordering of memory
* operations between work-items of a work-group to
* image memory.
*/
#define CLK_IMAGE_MEM_FENCE 0x04
#ifndef ATOMIC_VAR_INIT
#define ATOMIC_VAR_INIT(x) (x)
#endif //ATOMIC_VAR_INIT
#define ATOMIC_FLAG_INIT 0
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum memory_order
{
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
#if defined(__opencl_c_atomic_order_seq_cst)
memory_order_seq_cst = __ATOMIC_SEQ_CST
#endif
} memory_order;
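// Illustrative sketch: order and scope combine in the explicit atomic
// built-ins, e.g. a relaxed, device-scope counter increment:
//
// Example:
//   __kernel void count(__global atomic_uint *ctr) {
//     atomic_fetch_add_explicit(ctr, 1u,
//                               memory_order_relaxed, memory_scope_device);
//   }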
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
// These values need to match the runtime equivalent
//
// Addressing Mode.
//
#define CLK_ADDRESS_NONE 0
#define CLK_ADDRESS_CLAMP_TO_EDGE 2
#define CLK_ADDRESS_CLAMP 4
#define CLK_ADDRESS_REPEAT 6
#define CLK_ADDRESS_MIRRORED_REPEAT 8
//
// Coordinate Normalization.
//
#define CLK_NORMALIZED_COORDS_FALSE 0
#define CLK_NORMALIZED_COORDS_TRUE 1
//
// Filtering Mode.
//
#define CLK_FILTER_NEAREST 0x10
#define CLK_FILTER_LINEAR 0x20
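// Illustrative sketch: these bits are OR'ed together to declare a sampler in
// program scope:
//
// Example:
//   __constant sampler_t smp = CLK_NORMALIZED_COORDS_FALSE |
//                              CLK_ADDRESS_CLAMP_TO_EDGE |
//                              CLK_FILTER_NEAREST;
//   // then: float4 px = read_imagef(img, smp, (int2)(x, y));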
#ifdef cl_khr_gl_msaa_sharing
#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
#endif //cl_khr_gl_msaa_sharing
//
// Channel Datatype.
//
#define CLK_SNORM_INT8 0x10D0
#define CLK_SNORM_INT16 0x10D1
#define CLK_UNORM_INT8 0x10D2
#define CLK_UNORM_INT16 0x10D3
#define CLK_UNORM_SHORT_565 0x10D4
#define CLK_UNORM_SHORT_555 0x10D5
#define CLK_UNORM_INT_101010 0x10D6
#define CLK_SIGNED_INT8 0x10D7
#define CLK_SIGNED_INT16 0x10D8
#define CLK_SIGNED_INT32 0x10D9
#define CLK_UNSIGNED_INT8 0x10DA
#define CLK_UNSIGNED_INT16 0x10DB
#define CLK_UNSIGNED_INT32 0x10DC
#define CLK_HALF_FLOAT 0x10DD
#define CLK_FLOAT 0x10DE
#define CLK_UNORM_INT24 0x10DF
#if __OPENCL_C_VERSION__ >= CL_VERSION_3_0
#define CLK_UNORM_INT_101010_2 0x10E0
#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
#ifdef __opencl_c_ext_image_raw10_raw12
#define CLK_UNSIGNED_INT_RAW10_EXT 0x10E3
#define CLK_UNSIGNED_INT_RAW12_EXT 0x10E4
#endif // __opencl_c_ext_image_raw10_raw12
//
// Channel Order. Numbering must be aligned with cl_channel_order in cl.h.
//
#define CLK_R 0x10B0
#define CLK_A 0x10B1
#define CLK_RG 0x10B2
#define CLK_RA 0x10B3
#define CLK_RGB 0x10B4
#define CLK_RGBA 0x10B5
#define CLK_BGRA 0x10B6
#define CLK_ARGB 0x10B7
#define CLK_INTENSITY 0x10B8
#define CLK_LUMINANCE 0x10B9
#define CLK_Rx 0x10BA
#define CLK_RGx 0x10BB
#define CLK_RGBx 0x10BC
#define CLK_DEPTH 0x10BD
#define CLK_DEPTH_STENCIL 0x10BE
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_sRGB 0x10BF
#define CLK_sRGBx 0x10C0
#define CLK_sRGBA 0x10C1
#define CLK_sBGRA 0x10C2
#define CLK_ABGR 0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
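// Illustrative sketch: the CLK_* channel values above are what the image
// query built-ins return at run time:
//
// Example:
//   int dt = get_image_channel_data_type(img); // e.g. CLK_UNORM_INT8
//   int co = get_image_channel_order(img);     // e.g. CLK_RGBA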
// OpenCL v2.0 s6.13.16 - Pipe Functions
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
// OpenCL v2.0 s6.13.17 - Enqueue Kernels
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
#define CLK_SUCCESS 0
#define CLK_ENQUEUE_FAILURE -101
#define CLK_INVALID_QUEUE -102
#define CLK_INVALID_NDRANGE -160
#define CLK_INVALID_EVENT_WAIT_LIST -57
#define CLK_DEVICE_QUEUE_FULL -161
#define CLK_INVALID_ARG_SIZE -51
#define CLK_EVENT_ALLOCATION_FAILURE -100
#define CLK_OUT_OF_RESOURCES -5
#define CLK_NULL_QUEUE 0
#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t))
// execution model related definitions
#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0
#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1
#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2
typedef int kernel_enqueue_flags_t;
typedef int clk_profiling_info;
// Profiling info name (see capture_event_profiling_info)
#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
#define MAX_WORK_DIM 3
#ifdef __opencl_c_device_enqueue
typedef struct {
unsigned int workDimension;
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
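// A minimal sketch (illustrative only): ndrange_t describes a child grid for
// device-side enqueue, and the CLK_* codes above are the possible results.
// Assumes Clang blocks syntax for the child kernel body.
//
// Example:
//   __kernel void parent(__global int *buf, uint n) {
//     int err = enqueue_kernel(get_default_queue(),
//                              CLK_ENQUEUE_FLAGS_NO_WAIT, ndrange_1D(n),
//                              ^{ buf[get_global_id(0)] *= 2; });
//     if (err != CLK_SUCCESS) { /* e.g. CLK_DEVICE_QUEUE_FULL */ }
//   }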
#endif // __opencl_c_device_enqueue
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
/**
* OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
* Reinterprets a data type as another data type of the same size
*/
#define as_char(x) __builtin_astype((x), char)
#define as_char2(x) __builtin_astype((x), char2)
#define as_char3(x) __builtin_astype((x), char3)
#define as_char4(x) __builtin_astype((x), char4)
#define as_char8(x) __builtin_astype((x), char8)
#define as_char16(x) __builtin_astype((x), char16)
#define as_uchar(x) __builtin_astype((x), uchar)
#define as_uchar2(x) __builtin_astype((x), uchar2)
#define as_uchar3(x) __builtin_astype((x), uchar3)
#define as_uchar4(x) __builtin_astype((x), uchar4)
#define as_uchar8(x) __builtin_astype((x), uchar8)
#define as_uchar16(x) __builtin_astype((x), uchar16)
#define as_short(x) __builtin_astype((x), short)
#define as_short2(x) __builtin_astype((x), short2)
#define as_short3(x) __builtin_astype((x), short3)
#define as_short4(x) __builtin_astype((x), short4)
#define as_short8(x) __builtin_astype((x), short8)
#define as_short16(x) __builtin_astype((x), short16)
#define as_ushort(x) __builtin_astype((x), ushort)
#define as_ushort2(x) __builtin_astype((x), ushort2)
#define as_ushort3(x) __builtin_astype((x), ushort3)
#define as_ushort4(x) __builtin_astype((x), ushort4)
#define as_ushort8(x) __builtin_astype((x), ushort8)
#define as_ushort16(x) __builtin_astype((x), ushort16)
#define as_int(x) __builtin_astype((x), int)
#define as_int2(x) __builtin_astype((x), int2)
#define as_int3(x) __builtin_astype((x), int3)
#define as_int4(x) __builtin_astype((x), int4)
#define as_int8(x) __builtin_astype((x), int8)
#define as_int16(x) __builtin_astype((x), int16)
#define as_uint(x) __builtin_astype((x), uint)
#define as_uint2(x) __builtin_astype((x), uint2)
#define as_uint3(x) __builtin_astype((x), uint3)
#define as_uint4(x) __builtin_astype((x), uint4)
#define as_uint8(x) __builtin_astype((x), uint8)
#define as_uint16(x) __builtin_astype((x), uint16)
#define as_long(x) __builtin_astype((x), long)
#define as_long2(x) __builtin_astype((x), long2)
#define as_long3(x) __builtin_astype((x), long3)
#define as_long4(x) __builtin_astype((x), long4)
#define as_long8(x) __builtin_astype((x), long8)
#define as_long16(x) __builtin_astype((x), long16)
#define as_ulong(x) __builtin_astype((x), ulong)
#define as_ulong2(x) __builtin_astype((x), ulong2)
#define as_ulong3(x) __builtin_astype((x), ulong3)
#define as_ulong4(x) __builtin_astype((x), ulong4)
#define as_ulong8(x) __builtin_astype((x), ulong8)
#define as_ulong16(x) __builtin_astype((x), ulong16)
#define as_float(x) __builtin_astype((x), float)
#define as_float2(x) __builtin_astype((x), float2)
#define as_float3(x) __builtin_astype((x), float3)
#define as_float4(x) __builtin_astype((x), float4)
#define as_float8(x) __builtin_astype((x), float8)
#define as_float16(x) __builtin_astype((x), float16)
#ifdef cl_khr_fp64
#define as_double(x) __builtin_astype((x), double)
#define as_double2(x) __builtin_astype((x), double2)
#define as_double3(x) __builtin_astype((x), double3)
#define as_double4(x) __builtin_astype((x), double4)
#define as_double8(x) __builtin_astype((x), double8)
#define as_double16(x) __builtin_astype((x), double16)
#endif // cl_khr_fp64
#ifdef cl_khr_fp16
#define as_half(x) __builtin_astype((x), half)
#define as_half2(x) __builtin_astype((x), half2)
#define as_half3(x) __builtin_astype((x), half3)
#define as_half4(x) __builtin_astype((x), half4)
#define as_half8(x) __builtin_astype((x), half8)
#define as_half16(x) __builtin_astype((x), half16)
#endif // cl_khr_fp16
#define as_size_t(x) __builtin_astype((x), size_t)
#define as_ptrdiff_t(x) __builtin_astype((x), ptrdiff_t)
#define as_intptr_t(x) __builtin_astype((x), intptr_t)
#define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
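// Illustrative sketch: as_type reinterprets the bits of a value, unlike a C
// cast, which converts the value:
//
// Example:
//   float f = -1.0f;
//   uint bits = as_uint(f); // 0xBF800000: the raw sign/exponent/mantissa bits
//   int val = (int)f;       // -1: value conversion, a different result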
// C++ for OpenCL - __remove_address_space
#if defined(__OPENCL_CPP_VERSION__)
template <typename _Tp> struct __remove_address_space { using type = _Tp; };
#if defined(__opencl_c_generic_address_space)
template <typename _Tp> struct __remove_address_space<__generic _Tp> {
using type = _Tp;
};
#endif
template <typename _Tp> struct __remove_address_space<__global _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__private _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__local _Tp> {
using type = _Tp;
};
template <typename _Tp> struct __remove_address_space<__constant _Tp> {
using type = _Tp;
};
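// Illustrative sketch (C++ for OpenCL): strips the address space so the
// element type can be named directly:
//
// Example:
//   __remove_address_space<__global int>::type x = 0; // x has type 'int'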
#endif
// OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
#define __kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
#define kernel_exec(X, typen) __kernel \
__attribute__((work_group_size_hint(X, 1, 1))) \
__attribute__((vec_type_hint(typen)))
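// Illustrative sketch: attaches both hints to a kernel definition:
//
// Example:
//   kernel_exec(64, float4) void scale(__global float4 *v, float s) {
//     v[get_global_id(0)] *= s;
//   }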
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
// OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
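// Illustrative sketch: the format string must be in the constant address
// space (a string literal qualifies); vector arguments use the vn length
// modifier, e.g. v4hlf for float4:
//
// Example:
//   float4 v = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
//   printf("gid=%u v=%2.2v4hlf\n", (uint)get_global_id(0), v);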
#endif
#ifdef cl_intel_device_side_avc_motion_estimation
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0
#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1
#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2
#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3
#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0
#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1
#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30)
#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
#define CLK_AVC_ME_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
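// A hedged sketch: these CLK_AVC_* values parameterize the subgroup
// motion-estimation built-ins of cl_intel_device_side_avc_motion_estimation
// (the exact built-in signatures are defined by that extension; src_coord
// below is an assumed ushort2 pixel coordinate):
//
// Example:
//   intel_sub_group_avc_ime_payload_t p = intel_sub_group_avc_ime_initialize(
//       src_coord, CLK_AVC_ME_PARTITION_MASK_ALL_INTEL,
//       CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL);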
#endif // cl_intel_device_side_avc_motion_estimation
// Disable any extensions we may have enabled previously.
#pragma OPENCL EXTENSION all : disable
#endif //_OPENCL_BASE_H_

lib/include/opencl-c.h (vendored): 18371 deletions. File diff suppressed because it is too large.