mirror of https://github.com/ziglang/zig.git (synced 2025-12-06 06:13:07 +00:00)

commit 397e055ddd (parent 60954598e9)

update C language headers to clang release/14.x

upstream commit 91632c8ac97fa3daffe4ff8f1391735b5d6805e6
lib/include/__clang_cuda_complex_builtins.h (vendored) | 6

@@ -16,7 +16,7 @@
 // to work with CUDA and OpenMP target offloading [in C and C++ mode].)

 #pragma push_macro("__DEVICE__")
-#ifdef __OPENMP_NVPTX__
+#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
 #pragma omp declare target
 #define __DEVICE__ __attribute__((noinline, nothrow, cold, weak))
 #else

@@ -26,7 +26,7 @@
 // To make the algorithms available for C and C++ in CUDA and OpenMP we select
 // different but equivalent function versions. TODO: For OpenMP we currently
 // select the native builtins as the overload support for templates is lacking.
-#if !defined(__OPENMP_NVPTX__)
+#if !defined(__OPENMP_NVPTX__) && !defined(__OPENMP_AMDGCN__)
 #define _ISNANd std::isnan
 #define _ISNANf std::isnan
 #define _ISINFd std::isinf

@@ -276,7 +276,7 @@ __DEVICE__ float _Complex __divsc3(float __a, float __b, float __c, float __d) {
 #undef _fmaxd
 #undef _fmaxf

-#ifdef __OPENMP_NVPTX__
+#if defined(__OPENMP_NVPTX__) || defined(__OPENMP_AMDGCN__)
 #pragma omp end declare target
 #endif

lib/include/__clang_cuda_intrinsics.h (vendored) | 32

@@ -483,4 +483,36 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,

 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

+#if CUDA_VERSION >= 11000
+extern "C" {
+__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
+  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
+}
+__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
+  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
+}
+__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
+  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
+}
+__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
+  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
+}
+__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
+  return (void *)(void __attribute__((address_space(1))) *)__ptr;
+}
+__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
+  return (void *)(void __attribute__((address_space(3))) *)__ptr;
+}
+__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
+  return (void *)(void __attribute__((address_space(4))) *)__ptr;
+}
+__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
+  return (void *)(void __attribute__((address_space(5))) *)__ptr;
+}
+__device__ inline uint32_t __nvvm_get_smem_pointer(void *__ptr) {
+  return __nv_cvta_generic_to_shared_impl(__ptr);
+}
+} // extern "C"
+#endif // CUDA_VERSION >= 11000
+
 #endif // defined(__CLANG_CUDA_INTRINSICS_H__)
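The hunk above gives clang its own definitions of the CUDA 11 address-space conversion helpers. A minimal device-side sketch of how they might be exercised follows; the kernel and variable names are illustrative and are not part of the header.

// Sketch only: assumes CUDA >= 11.0 and clang CUDA compilation, so the
// helpers defined in __clang_cuda_intrinsics.h above are in scope.
__global__ void smem_window_demo(unsigned *out) {
  __shared__ int buf[32];
  // 32-bit shared-memory window address, via the helper added above.
  unsigned window = __nvvm_get_smem_pointer(&buf[threadIdx.x]);
  // Round-trip a pointer through the generic <-> shared conversions.
  size_t shared_addr = __nv_cvta_generic_to_shared_impl(&buf[threadIdx.x]);
  int *generic_again = (int *)__nv_cvta_shared_to_generic_impl(shared_addr);
  *generic_again = (int)window;
  out[threadIdx.x] = window;
}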
lib/include/__clang_cuda_libdevice_declares.h (vendored) | 6

@@ -16,6 +16,7 @@ extern "C" {

 #if defined(__OPENMP_NVPTX__)
 #define __DEVICE__
+#pragma omp begin assumes ext_spmd_amenable no_openmp
 #elif defined(__CUDA__)
 #define __DEVICE__ __device__
 #endif

@@ -456,6 +457,11 @@ __DEVICE__ double __nv_y1(double __a);
 __DEVICE__ float __nv_y1f(float __a);
 __DEVICE__ float __nv_ynf(int __a, float __b);
 __DEVICE__ double __nv_yn(int __a, double __b);
+
+#if defined(__OPENMP_NVPTX__)
+#pragma omp end assumes ext_spmd_amenable no_openmp
+#endif
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
lib/include/__clang_cuda_math.h (vendored) | 2

@@ -345,4 +345,4 @@ __DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); }
 #pragma pop_macro("__DEVICE_VOID__")
 #pragma pop_macro("__FAST_OR_SLOW")

-#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+#endif // __CLANG_CUDA_MATH_H__
lib/include/__clang_cuda_runtime_wrapper.h (vendored) | 69

@@ -41,6 +41,7 @@
 #include <cmath>
 #include <cstdlib>
 #include <stdlib.h>
+#include <string.h>
 #undef __CUDACC__

 // Preserve common macros that will be changed below by us or by CUDA
@@ -64,9 +65,9 @@
 #endif

 // Make largest subset of device functions available during host
-// compilation -- SM_35 for the time being.
+// compilation.
 #ifndef __CUDA_ARCH__
-#define __CUDA_ARCH__ 350
+#define __CUDA_ARCH__ 9999
 #endif

 #include "__clang_cuda_builtin_vars.h"
@@ -205,11 +206,6 @@ inline __host__ double __signbitd(double x) {
 #endif

 #if CUDA_VERSION >= 9000
-// CUDA-9.2 needs host-side memcpy for some host functions in
-// device_functions.hpp
-#if CUDA_VERSION >= 9020
-#include <string.h>
-#endif
 #include "crt/math_functions.hpp"
 #else
 #include "math_functions.hpp"
@@ -275,7 +271,38 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
 #undef __CUDABE__
 #endif
 #include "sm_20_atomic_functions.hpp"
+// Predicate functions used in `__builtin_assume` need to have no side effect.
+// However, sm_20_intrinsics.hpp doesn't define them with neither pure nor
+// const attribute. Rename definitions from sm_20_intrinsics.hpp and re-define
+// them as pure ones.
+#pragma push_macro("__isGlobal")
+#pragma push_macro("__isShared")
+#pragma push_macro("__isConstant")
+#pragma push_macro("__isLocal")
+#define __isGlobal __ignored_cuda___isGlobal
+#define __isShared __ignored_cuda___isShared
+#define __isConstant __ignored_cuda___isConstant
+#define __isLocal __ignored_cuda___isLocal
 #include "sm_20_intrinsics.hpp"
+#pragma pop_macro("__isGlobal")
+#pragma pop_macro("__isShared")
+#pragma pop_macro("__isConstant")
+#pragma pop_macro("__isLocal")
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__ static __device__ __forceinline__ __attribute__((const))
+__DEVICE__ unsigned int __isGlobal(const void *p) {
+  return __nvvm_isspacep_global(p);
+}
+__DEVICE__ unsigned int __isShared(const void *p) {
+  return __nvvm_isspacep_shared(p);
+}
+__DEVICE__ unsigned int __isConstant(const void *p) {
+  return __nvvm_isspacep_const(p);
+}
+__DEVICE__ unsigned int __isLocal(const void *p) {
+  return __nvvm_isspacep_local(p);
+}
+#pragma pop_macro("__DEVICE__")
 #include "sm_32_atomic_functions.hpp"

 // Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
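The hunk above re-declares the sm_20 address-space predicates as __attribute__((const)) so they can be used inside __builtin_assume. A small, illustrative device function (not part of the header) showing the intended usage:

// Illustrative only: __builtin_assume() drops expressions that may have side
// effects, so the predicate must be const/pure for the hint to stick.
__device__ void scale_in_shared(float *p, float s) {
  __builtin_assume(__isShared(p)); // optimizer may now use shared-memory ops
  *p *= s;
}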
@@ -330,6 +357,34 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }

 #pragma pop_macro("__host__")

+// __clang_cuda_texture_intrinsics.h must be included first in order to provide
+// implementation for __nv_tex_surf_handler that CUDA's headers depend on.
+// The implementation requires c++11 and only works with CUDA-9 or newer.
+#if __cplusplus >= 201103L && CUDA_VERSION >= 9000
+// clang-format off
+#include <__clang_cuda_texture_intrinsics.h>
+// clang-format on
+#else
+#if CUDA_VERSION >= 9000
+// Provide a hint that texture support needs C++11.
+template <typename T> struct __nv_tex_needs_cxx11 {
+  const static bool value = false;
+};
+template <class T>
+__host__ __device__ void __nv_tex_surf_handler(const char *name, T *ptr,
+                                               cudaTextureObject_t obj,
+                                               float x) {
+  _Static_assert(__nv_tex_needs_cxx11<T>::value,
+                 "Texture support requires C++11");
+}
+#else
+// Textures in CUDA-8 and older are not supported by clang.There's no
+// convenient way to intercept texture use in these versions, so we can't
+// produce a meaningful error. The source code that attempts to use textures
+// will continue to fail as it does now.
+#endif // CUDA_VERSION
+#endif // __cplusplus >= 201103L && CUDA_VERSION >= 9000
+#include "texture_fetch_functions.h"
 #include "texture_indirect_functions.h"

 // Restore state of __CUDA_ARCH__ and __THROW we had on entry.
lib/include/__clang_cuda_texture_intrinsics.h (vendored, new file) | 740

@@ -0,0 +1,740 @@
/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implmentations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// __nv_tex_surf_handler() provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...) \
  ::__cuda_tex::__tex_fetch< \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr, \
                                                              __VA_ARGS__)

#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")

// Put all functions into anonymous namespace so they have internal linkage.
// The device-only function here must be internal in order to avoid ODR
// violations in case they are used from the files compiled with
// -fgpu-rdc. E.g. a library and an app using it may be built with a different
// version of this header file.
namespace {

// Put the implmentation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {

// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We can not use string literals for that as that would
// require C++20.
//
// The hash function was generated with 'gperf' and then manually converted into
// its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with inherent self-test. If the hash
// function has a collision for any of the texture operations, the compilation
// will fail due to an attempt to redefine a tag with the same value. If the
// header compiles, then the hash function is good enough for the job.

constexpr int __tex_len(const char *s) {
  return (s[0] == 0)  ? 0
         : (s[1] == 0)  ? 1
         : (s[2] == 0)  ? 2
         : (s[3] == 0)  ? 3
         : (s[4] == 0)  ? 4
         : (s[5] == 0)  ? 5
         : (s[6] == 0)  ? 6
         : (s[7] == 0)  ? 7
         : (s[8] == 0)  ? 8
         : (s[9] == 0)  ? 9
         : (s[10] == 0) ? 10
         : (s[11] == 0) ? 11
         : (s[12] == 0) ? 12
         : (s[13] == 0) ? 13
         : (s[14] == 0) ? 14
         : (s[15] == 0) ? 15
         : (s[16] == 0) ? 16
         : (s[17] == 0) ? 17
         : (s[18] == 0) ? 18
         : (s[19] == 0) ? 19
         : (s[20] == 0) ? 20
         : (s[21] == 0) ? 21
         : (s[22] == 0) ? 22
         : (s[23] == 0) ? 23
         : (s[24] == 0) ? 24
         : (s[25] == 0) ? 25
         : (s[26] == 0) ? 26
         : (s[27] == 0) ? 27
         : (s[28] == 0) ? 28
         : (s[29] == 0) ? 29
         : (s[30] == 0) ? 30
         : (s[31] == 0) ? 31
                        : 32;
}

constexpr int __tex_hash_map(int c) {
  return (c == 49)  ? 10
         : (c == 50)  ? 0
         : (c == 51)  ? 100
         : (c == 52)  ? 30
         : (c == 67)  ? 10
         : (c == 68)  ? 0
         : (c == 69)  ? 25
         : (c == 72)  ? 70
         : (c == 77)  ? 0
         : (c == 96)  ? 44
         : (c == 99)  ? 10
         : (c == 100) ? 5
         : (c == 101) ? 60
         : (c == 102) ? 40
         : (c == 103) ? 70
         : (c == 104) ? 25
         : (c == 112) ? 0
         : (c == 114) ? 45
         : (c == 117) ? 5
         : (c == 118) ? 85
         : (c == 120) ? 20
                      : 225;
}

constexpr int __tex_op_hash(const char *str) {
  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}

// Tag type to identify particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant) \
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>

// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = char4
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
  using __base_t = float;
  using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
  using __base_t = char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
  using __base_t = signed char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
  using __base_t = unsigned char;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
  using __base_t = short;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
  using __base_t = unsigned short;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
  using __base_t = int;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
  using __base_t = unsigned int;
  using __fetch_t = uint4;
};

// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
  using __base_t = decltype(__T::x);
  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};

// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)

// Construct inline assembly output args.
// Results are stored in a temp var __r.
// isResident bool is pointed to by __ir
// Asm args for return values. It's a 4-element vector
#define __ASM_OUT(__t) \
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))

// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args) \
  template <> \
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) { \
    __rt __r; \
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args)); \
    return __r; \
  }

// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args) \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args) \
  __IMPL_F1(float4, float4, __args, \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"), \
            __asm_args)
// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to above, but returns a boolean 'isPresent' value in addition
// to texture data,
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUTP("r"), __asm_args) \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUTP("r"), __asm_args) \
  __IMPL_F1(float4, float4, __args, \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"), \
            __asm_args)

// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args) \
  __IMPL_F1(float4, uint4, __args, \
            __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"), \
            __asm_args)

// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  template <> struct __tex_fetch_v4<__op> { \
    template <class T> \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  }

// Same, but for sparse ops. Only available on sm_60+
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, \
                    __asm_args) \
  template <> struct __tex_fetch_v4<__op> { \
    template <class T> \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args)); \
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  }
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif

// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args, \
                    __asm_args) \
  template <> struct __tex_fetch_v4<__op> { \
    template <class T> \
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args)); \
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  }

// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
                  __asm_args) \
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args); \
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)

// Convenience macros which converts string literal __op into a __Tag,
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args, \
                 __asm_args) \
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
            __asm_args)

// CUDA headers have some 'legacy' texture oprerations that duplicate
// functionality. So, we just inherit it, instead of refining a copy.
#define __IMPL_ALIASI(__op, __opn) \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))

// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer), "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");

// 2D
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));

// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr) \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant), \
            __IDV("__tex2Dgather_rmnf_v2", variant), \
            (float __x, float __y, int __comp), instr, "f32", \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y))); \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant), \
                __IDV("__tex2Dgather_v2", variant)); \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant), \
              (float __x, float __y, unsigned char *__ir, int __comp), \
              "{.reg .pred %%p0;\n\t" instr, "f32", \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t" \
              "selp.u16 %4, 1, 0, %%p0; }", \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");

// Umbrella dispatcher -- calls into specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");

template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
  }
};
#endif

// 3D
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));

// Cubemap
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");

// Helper class for extracting slice of data from V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
  template <int __NElements = sizeof(__DestT) /
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
  __device__ static __DestT __run(__SrcT __v);
  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
  template <> __device__ static __DestT __run<2>(__SrcT __v) {
    return {__v.x, __v.y};
  }
  template <> __device__ static __DestT __run<3>(__SrcT __v) {
    return {__v.x, __v.y, __v.z};
  }
  template <> __device__ static __DestT __run<4>(__SrcT __v) {
    return {__v.x, __v.y, __v.z, __v.w};
  }
};

// These are the top-level function overloads the __nv_tex_surf_handler expands
// to. Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::run` variant should be used to
// fetch texture data and which `__convert::run` is needed to convert it into
// appropriate return type.

// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}

// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
// and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __obj;
  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
  return __obj;
}

// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type is based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")
#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__
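Taken together, the new header turns the operation string into a compile-time tag: a call such as __nv_tex_surf_handler("__tex2D_v2", &ret, obj, x, y) expands to __cuda_tex::__tex_fetch<__cuda_tex::__Tag<__cuda_tex::__tex_op_hash("__tex2D_v2")>>(&ret, obj, x, y), which selects the __tex_fetch_v4 specialization instantiated for that operation and converts the raw v4 fetch into the caller's type. A short, hypothetical caller (not copied from CUDA's headers) that goes through this path:

// Illustrative only: assumes the header above is in effect (clang CUDA
// compilation, C++11, CUDA >= 9). The op name "__tex2D_v2" and its
// (float, float) argument list come from the __IMPL_S instantiation above.
__device__ float4 fetch_texel(cudaTextureObject_t obj, float x, float y) {
  float4 texel;
  __nv_tex_surf_handler("__tex2D_v2", &texel, obj, x, y);
  return texel;
}

// The hash is constexpr, so distinctness of tags can even be checked at
// compile time (sketch, not part of the header):
static_assert(__cuda_tex::__tex_op_hash("__tex2D_v2") !=
                  __cuda_tex::__tex_op_hash("__tex1Dfetch_v2"),
              "distinct texture ops must hash to distinct tags");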
lib/include/__clang_hip_runtime_wrapper.h (vendored) | 21

@@ -50,6 +50,9 @@ extern "C" {
 #include <cmath>
 #include <cstdlib>
 #include <stdlib.h>
+#if __has_include("hip/hip_version.h")
+#include "hip/hip_version.h"
+#endif // __has_include("hip/hip_version.h")
 #else
 typedef __SIZE_TYPE__ size_t;
 // Define macros which are needed to declare HIP device API's without standard

@@ -74,25 +77,35 @@ typedef __SIZE_TYPE__ __hip_size_t;
 extern "C" {
 #endif //__cplusplus

+#if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
+extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
+extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
+__attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
+  return (void *) __ockl_dm_alloc(__size);
+}
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __ockl_dm_dealloc((unsigned long long)__ptr);
+}
+#else // HIP version check
 #if __HIP_ENABLE_DEVICE_MALLOC__
 __device__ void *__hip_malloc(__hip_size_t __size);
 __device__ void *__hip_free(void *__ptr);
 __attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
   return __hip_malloc(__size);
 }
-__attribute__((weak)) inline __device__ void *free(void *__ptr) {
-  return __hip_free(__ptr);
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
+  __hip_free(__ptr);
 }
 #else
 __attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
   __builtin_trap();
   return (void *)0;
 }
-__attribute__((weak)) inline __device__ void *free(void *__ptr) {
+__attribute__((weak)) inline __device__ void free(void *__ptr) {
   __builtin_trap();
-  return (void *)0;
 }
 #endif
+#endif // HIP version check

 #ifdef __cplusplus
 } // extern "C"
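With HIP 4.5 or newer, the weak malloc()/free() definitions above route device-side allocations through __ockl_dm_alloc()/__ockl_dm_dealloc(). An illustrative HIP kernel (not part of the header) that exercises them:

// Sketch only: kernel and parameter names are illustrative.
#include <hip/hip_runtime.h>

__global__ void per_thread_scratch(int n) {
  int *scratch = static_cast<int *>(malloc(n * sizeof(int))); // device malloc
  if (scratch) {
    for (int i = 0; i < n; ++i)
      scratch[i] = i;
    free(scratch); // device free
  }
}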
lib/include/__wmmintrin_aes.h (vendored) | 2

@@ -133,7 +133,7 @@ _mm_aesimc_si128(__m128i __V)
 /// An 8-bit round constant used to generate the AES encryption key.
 /// \returns A 128-bit round key for AES encryption.
 #define _mm_aeskeygenassist_si128(C, R) \
-  (__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))
+  ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))

 #undef __DEFAULT_FN_ATTRS

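The added outer parentheses make the macro expansion behave as a single expression wherever it is used. A small illustrative fragment (not from the header, and not a complete AES key schedule); compiling it requires AES support, e.g. -maes:

#include <wmmintrin.h>

// Hypothetical helper: feeds the aeskeygenassist result into a follow-up xor,
// using the macro directly as a function argument.
static inline __m128i mix_with_assist(__m128i prev) {
  return _mm_xor_si128(prev, _mm_aeskeygenassist_si128(prev, 0x01));
}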
294
lib/include/altivec.h
vendored
294
lib/include/altivec.h
vendored
@ -19,6 +19,10 @@
|
|||||||
#define __CR6_EQ_REV 1
|
#define __CR6_EQ_REV 1
|
||||||
#define __CR6_LT 2
|
#define __CR6_LT 2
|
||||||
#define __CR6_LT_REV 3
|
#define __CR6_LT_REV 3
|
||||||
|
#define __CR6_GT 4
|
||||||
|
#define __CR6_GT_REV 5
|
||||||
|
#define __CR6_SO 6
|
||||||
|
#define __CR6_SO_REV 7
|
||||||
|
|
||||||
/* Constants for vec_test_data_class */
|
/* Constants for vec_test_data_class */
|
||||||
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
|
#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
|
||||||
@ -1810,6 +1814,11 @@ vec_cmpeq(vector unsigned __int128 __a, vector unsigned __int128 __b) {
|
|||||||
return (vector bool __int128)__builtin_altivec_vcmpequq(
|
return (vector bool __int128)__builtin_altivec_vcmpequq(
|
||||||
(vector bool __int128)__a, (vector bool __int128)__b);
|
(vector bool __int128)__a, (vector bool __int128)__b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __inline__ vector bool __int128 __ATTRS_o_ai
|
||||||
|
vec_cmpeq(vector bool __int128 __a, vector bool __int128 __b) {
|
||||||
|
return (vector bool __int128)__builtin_altivec_vcmpequq(__a, __b);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __POWER9_VECTOR__
|
#ifdef __POWER9_VECTOR__
|
||||||
@ -1887,6 +1896,11 @@ vec_cmpne(vector signed __int128 __a, vector signed __int128 __b) {
|
|||||||
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(
|
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(
|
||||||
(vector bool __int128)__a, (vector bool __int128)__b));
|
(vector bool __int128)__a, (vector bool __int128)__b));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __inline__ vector bool __int128 __ATTRS_o_ai
|
||||||
|
vec_cmpne(vector bool __int128 __a, vector bool __int128 __b) {
|
||||||
|
return (vector bool __int128) ~(__builtin_altivec_vcmpequq(__a, __b));
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* vec_cmpnez */
|
/* vec_cmpnez */
|
||||||
@ -2472,7 +2486,7 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
|
|||||||
#ifdef __POWER8_VECTOR__
|
#ifdef __POWER8_VECTOR__
|
||||||
/* vec_popcnt */
|
/* vec_popcnt */
|
||||||
|
|
||||||
static __inline__ vector signed char __ATTRS_o_ai
|
static __inline__ vector unsigned char __ATTRS_o_ai
|
||||||
vec_popcnt(vector signed char __a) {
|
vec_popcnt(vector signed char __a) {
|
||||||
return __builtin_altivec_vpopcntb(__a);
|
return __builtin_altivec_vpopcntb(__a);
|
||||||
}
|
}
|
||||||
@ -2480,7 +2494,7 @@ static __inline__ vector unsigned char __ATTRS_o_ai
|
|||||||
vec_popcnt(vector unsigned char __a) {
|
vec_popcnt(vector unsigned char __a) {
|
||||||
return __builtin_altivec_vpopcntb(__a);
|
return __builtin_altivec_vpopcntb(__a);
|
||||||
}
|
}
|
||||||
static __inline__ vector signed short __ATTRS_o_ai
|
static __inline__ vector unsigned short __ATTRS_o_ai
|
||||||
vec_popcnt(vector signed short __a) {
|
vec_popcnt(vector signed short __a) {
|
||||||
return __builtin_altivec_vpopcnth(__a);
|
return __builtin_altivec_vpopcnth(__a);
|
||||||
}
|
}
|
||||||
@ -2488,7 +2502,7 @@ static __inline__ vector unsigned short __ATTRS_o_ai
|
|||||||
vec_popcnt(vector unsigned short __a) {
|
vec_popcnt(vector unsigned short __a) {
|
||||||
return __builtin_altivec_vpopcnth(__a);
|
return __builtin_altivec_vpopcnth(__a);
|
||||||
}
|
}
|
||||||
static __inline__ vector signed int __ATTRS_o_ai
|
static __inline__ vector unsigned int __ATTRS_o_ai
|
||||||
vec_popcnt(vector signed int __a) {
|
vec_popcnt(vector signed int __a) {
|
||||||
return __builtin_altivec_vpopcntw(__a);
|
return __builtin_altivec_vpopcntw(__a);
|
||||||
}
|
}
|
||||||
@ -2496,7 +2510,7 @@ static __inline__ vector unsigned int __ATTRS_o_ai
|
|||||||
vec_popcnt(vector unsigned int __a) {
|
vec_popcnt(vector unsigned int __a) {
|
||||||
return __builtin_altivec_vpopcntw(__a);
|
return __builtin_altivec_vpopcntw(__a);
|
||||||
}
|
}
|
||||||
static __inline__ vector signed long long __ATTRS_o_ai
|
static __inline__ vector unsigned long long __ATTRS_o_ai
|
||||||
vec_popcnt(vector signed long long __a) {
|
vec_popcnt(vector signed long long __a) {
|
||||||
return __builtin_altivec_vpopcntd(__a);
|
return __builtin_altivec_vpopcntd(__a);
|
||||||
}
|
}
|
||||||
@ -3049,13 +3063,10 @@ static __inline__ vector unsigned char __ATTRS_o_ai
|
|||||||
vec_xl_len_r(const unsigned char *__a, size_t __b) {
|
vec_xl_len_r(const unsigned char *__a, size_t __b) {
|
||||||
vector unsigned char __res =
|
vector unsigned char __res =
|
||||||
(vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
|
(vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56));
|
||||||
#ifdef __LITTLE_ENDIAN__
|
|
||||||
vector unsigned char __mask =
|
vector unsigned char __mask =
|
||||||
(vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
|
(vector unsigned char)__builtin_altivec_lvsr(16 - __b, (int *)NULL);
|
||||||
__res = (vector unsigned char)__builtin_altivec_vperm_4si(
|
return (vector unsigned char)__builtin_altivec_vperm_4si(
|
||||||
(vector int)__res, (vector int)__res, __mask);
|
(vector int)__res, (vector int)__res, __mask);
|
||||||
#endif
|
|
||||||
return __res;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// vec_xst_len
|
// vec_xst_len
|
||||||
@@ -3130,15 +3141,11 @@ static __inline__ void __ATTRS_o_ai vec_xst_len(vector double __a, double *__b,
 static __inline__ void __ATTRS_o_ai vec_xst_len_r(vector unsigned char __a,
                                                   unsigned char *__b,
                                                   size_t __c) {
-#ifdef __LITTLE_ENDIAN__
   vector unsigned char __mask =
       (vector unsigned char)__builtin_altivec_lvsl(16 - __c, (int *)NULL);
   vector unsigned char __res =
       __builtin_altivec_vperm_4si((vector int)__a, (vector int)__a, __mask);
   return __builtin_vsx_stxvll((vector int)__res, __b, (__c << 56));
-#else
-  return __builtin_vsx_stxvll((vector int)__a, __b, (__c << 56));
-#endif
 }
 #endif
 #endif
@@ -7106,6 +7113,11 @@ vec_orc(vector float __a, vector bool int __b) {
   return (vector float)((vector unsigned int)__a | ~__b);
 }

+static __inline__ vector float __ATTRS_o_ai vec_orc(vector float __a,
+                                                    vector float __b) {
+  return (vector float)((vector unsigned int)__a | ~(vector unsigned int)__b);
+}
+
 static __inline__ vector signed long long __ATTRS_o_ai
 vec_orc(vector signed long long __a, vector signed long long __b) {
   return __a | ~__b;
@@ -7150,6 +7162,12 @@ static __inline__ vector double __ATTRS_o_ai
 vec_orc(vector bool long long __a, vector double __b) {
   return (vector double)(__a | ~(vector unsigned long long)__b);
 }
+
+static __inline__ vector double __ATTRS_o_ai vec_orc(vector double __a,
+                                                     vector double __b) {
+  return (vector double)((vector bool long long)__a |
+                         ~(vector unsigned long long)__b);
+}
 #endif

 /* vec_vor */
@@ -8399,9 +8417,20 @@ static __inline__ vector float __ATTRS_o_ai vec_round(vector float __a) {
 }

 #ifdef __VSX__
+#ifdef __XL_COMPAT_ALTIVEC__
+static __inline__ vector double __ATTRS_o_ai vec_rint(vector double __a);
+static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
+  double __fpscr = __builtin_readflm();
+  __builtin_setrnd(0);
+  vector double __rounded = vec_rint(__a);
+  __builtin_setflm(__fpscr);
+  return __rounded;
+}
+#else
 static __inline__ vector double __ATTRS_o_ai vec_round(vector double __a) {
   return __builtin_vsx_xvrdpi(__a);
 }
+#endif

 /* vec_rint */

@@ -8839,7 +8868,7 @@ static __inline__ vector long long __ATTRS_o_ai
 vec_sl(vector long long __a, vector unsigned long long __b) {
   return (vector long long)vec_sl((vector unsigned long long)__a, __b);
 }
-#else
+#elif defined(__VSX__)
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_vspltb(vector unsigned char __a, unsigned char __b);
 static __inline__ vector unsigned long long __ATTRS_o_ai
@@ -8885,7 +8914,7 @@ static __inline__ vector long long __ATTRS_o_ai
 vec_sl(vector long long __a, vector unsigned long long __b) {
   return (vector long long)vec_sl((vector unsigned long long)__a, __b);
 }
-#endif
+#endif /* __VSX__ */

 /* vec_vslb */

@@ -10350,7 +10379,7 @@ static __inline__ vector long long __ATTRS_o_ai
 vec_sr(vector long long __a, vector unsigned long long __b) {
   return (vector long long)vec_sr((vector unsigned long long)__a, __b);
 }
-#else
+#elif defined(__VSX__)
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sr(vector unsigned long long __a, vector unsigned long long __b) {
   __b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
@@ -10394,7 +10423,7 @@ static __inline__ vector long long __ATTRS_o_ai
 vec_sr(vector long long __a, vector unsigned long long __b) {
   return (vector long long)vec_sr((vector unsigned long long)__a, __b);
 }
-#endif
+#endif /* __VSX__ */

 /* vec_vsrb */

@@ -10480,7 +10509,7 @@ static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)((vector signed long long)__a >> __b);
 }
-#else
+#elif defined(__VSX__)
 static __inline__ vector signed long long __ATTRS_o_ai
 vec_sra(vector signed long long __a, vector unsigned long long __b) {
   __b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
@@ -10492,7 +10521,7 @@ vec_sra(vector unsigned long long __a, vector unsigned long long __b) {
   __b %= (vector unsigned long long)(sizeof(unsigned long long) * __CHAR_BIT__);
   return (vector unsigned long long)((vector signed long long)__a >> __b);
 }
-#endif
+#endif /* __VSX__ */

 /* vec_vsrab */

@@ -13441,74 +13470,74 @@ vec_vxor(vector bool long long __a, vector bool long long __b) {
 /* vec_extract */

 static __inline__ signed char __ATTRS_o_ai vec_extract(vector signed char __a,
-                                                       unsigned int __b) {
+                                                       signed int __b) {
   return __a[__b & 0xf];
 }

 static __inline__ unsigned char __ATTRS_o_ai
-vec_extract(vector unsigned char __a, unsigned int __b) {
+vec_extract(vector unsigned char __a, signed int __b) {
   return __a[__b & 0xf];
 }

 static __inline__ unsigned char __ATTRS_o_ai vec_extract(vector bool char __a,
-                                                         unsigned int __b) {
+                                                         signed int __b) {
   return __a[__b & 0xf];
 }

 static __inline__ signed short __ATTRS_o_ai vec_extract(vector signed short __a,
-                                                        unsigned int __b) {
+                                                        signed int __b) {
   return __a[__b & 0x7];
 }

 static __inline__ unsigned short __ATTRS_o_ai
-vec_extract(vector unsigned short __a, unsigned int __b) {
+vec_extract(vector unsigned short __a, signed int __b) {
   return __a[__b & 0x7];
 }

 static __inline__ unsigned short __ATTRS_o_ai vec_extract(vector bool short __a,
-                                                          unsigned int __b) {
+                                                          signed int __b) {
   return __a[__b & 0x7];
 }

 static __inline__ signed int __ATTRS_o_ai vec_extract(vector signed int __a,
-                                                      unsigned int __b) {
+                                                      signed int __b) {
   return __a[__b & 0x3];
 }

 static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector unsigned int __a,
-                                                        unsigned int __b) {
+                                                        signed int __b) {
   return __a[__b & 0x3];
 }

 static __inline__ unsigned int __ATTRS_o_ai vec_extract(vector bool int __a,
-                                                        unsigned int __b) {
+                                                        signed int __b) {
   return __a[__b & 0x3];
 }

 #ifdef __VSX__
 static __inline__ signed long long __ATTRS_o_ai
-vec_extract(vector signed long long __a, unsigned int __b) {
+vec_extract(vector signed long long __a, signed int __b) {
   return __a[__b & 0x1];
 }

 static __inline__ unsigned long long __ATTRS_o_ai
-vec_extract(vector unsigned long long __a, unsigned int __b) {
+vec_extract(vector unsigned long long __a, signed int __b) {
   return __a[__b & 0x1];
 }

 static __inline__ unsigned long long __ATTRS_o_ai
-vec_extract(vector bool long long __a, unsigned int __b) {
+vec_extract(vector bool long long __a, signed int __b) {
   return __a[__b & 0x1];
 }

 static __inline__ double __ATTRS_o_ai vec_extract(vector double __a,
-                                                  unsigned int __b) {
+                                                  signed int __b) {
   return __a[__b & 0x1];
 }
 #endif

 static __inline__ float __ATTRS_o_ai vec_extract(vector float __a,
-                                                 unsigned int __b) {
+                                                 signed int __b) {
   return __a[__b & 0x3];
 }

@@ -13568,82 +13597,82 @@ vec_extract_fp32_from_shortl(vector unsigned short __a) {

 static __inline__ vector signed char __ATTRS_o_ai
 vec_insert(signed char __a, vector signed char __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0xF] = __a;
   return __b;
 }

 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_insert(unsigned char __a, vector unsigned char __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0xF] = __a;
   return __b;
 }

 static __inline__ vector bool char __ATTRS_o_ai vec_insert(unsigned char __a,
                                                            vector bool char __b,
                                                            int __c) {
-  __b[__c] = __a;
+  __b[__c & 0xF] = __a;
   return __b;
 }

 static __inline__ vector signed short __ATTRS_o_ai
 vec_insert(signed short __a, vector signed short __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x7] = __a;
   return __b;
 }

 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_insert(unsigned short __a, vector unsigned short __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x7] = __a;
   return __b;
 }

 static __inline__ vector bool short __ATTRS_o_ai
 vec_insert(unsigned short __a, vector bool short __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x7] = __a;
   return __b;
 }

 static __inline__ vector signed int __ATTRS_o_ai
 vec_insert(signed int __a, vector signed int __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x3] = __a;
   return __b;
 }

 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_insert(unsigned int __a, vector unsigned int __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x3] = __a;
   return __b;
 }

 static __inline__ vector bool int __ATTRS_o_ai vec_insert(unsigned int __a,
                                                           vector bool int __b,
                                                           int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x3] = __a;
   return __b;
 }

 #ifdef __VSX__
 static __inline__ vector signed long long __ATTRS_o_ai
 vec_insert(signed long long __a, vector signed long long __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x1] = __a;
   return __b;
 }

 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_insert(unsigned long long __a, vector unsigned long long __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x1] = __a;
   return __b;
 }

 static __inline__ vector bool long long __ATTRS_o_ai
 vec_insert(unsigned long long __a, vector bool long long __b, int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x1] = __a;
   return __b;
 }
 static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
                                                         vector double __b,
                                                         int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x1] = __a;
   return __b;
 }
 #endif
@@ -13651,7 +13680,7 @@ static __inline__ vector double __ATTRS_o_ai vec_insert(double __a,
 static __inline__ vector float __ATTRS_o_ai vec_insert(float __a,
                                                        vector float __b,
                                                        int __c) {
-  __b[__c] = __a;
+  __b[__c & 0x3] = __a;
   return __b;
 }

@@ -14812,42 +14841,43 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool int __a,
 #ifdef __VSX__
 static __inline__ int __ATTRS_o_ai vec_all_eq(vector signed long long __a,
                                               vector signed long long __b) {
+#ifdef __POWER8_VECTOR__
   return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, __b);
+#else
+  // No vcmpequd on Power7 so we xor the two vectors and compare against zero as
+  // 32-bit elements.
+  return vec_all_eq((vector signed int)vec_xor(__a, __b), (vector signed int)0);
+#endif
 }

 static __inline__ int __ATTRS_o_ai vec_all_eq(vector long long __a,
                                               vector bool long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT, __a, (vector long long)__b);
+  return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
                                               vector unsigned long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
-                                      (vector long long)__b);
+  return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned long long __a,
                                               vector bool long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
-                                      (vector long long)__b);
+  return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
                                               vector long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
-                                      (vector long long)__b);
+  return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
                                               vector unsigned long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
-                                      (vector long long)__b);
+  return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool long long __a,
                                               vector bool long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT, (vector long long)__a,
-                                      (vector long long)__b);
+  return vec_all_eq((vector signed long long)__a, (vector signed long long)__b);
 }
 #endif

@@ -14877,6 +14907,11 @@ static __inline__ int __ATTRS_o_ai vec_all_eq(vector unsigned __int128 __a,
                                               vector unsigned __int128 __b) {
   return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
 }
+
+static __inline__ int __ATTRS_o_ai vec_all_eq(vector bool __int128 __a,
+                                              vector bool __int128 __b) {
+  return __builtin_altivec_vcmpequq_p(__CR6_LT, __a, __b);
+}
 #endif

 /* vec_all_ge */
@@ -15822,6 +15857,11 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector unsigned __int128 __a,
                                               vector unsigned __int128 __b) {
   return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
 }
+
+static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool __int128 __a,
+                                              vector bool __int128 __b) {
+  return __builtin_altivec_vcmpequq_p(__CR6_EQ, __a, __b);
+}
 #endif

 /* vec_all_nge */
@@ -16111,6 +16151,11 @@ static __inline__ int __ATTRS_o_ai vec_any_eq(vector unsigned __int128 __a,
                                               vector unsigned __int128 __b) {
   return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
 }
+
+static __inline__ int __ATTRS_o_ai vec_any_eq(vector bool __int128 __a,
+                                              vector bool __int128 __b) {
+  return __builtin_altivec_vcmpequq_p(__CR6_EQ_REV, __a, __b);
+}
 #endif

 /* vec_any_ge */
@@ -17020,43 +17065,43 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool int __a,
 #ifdef __VSX__
 static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
                                               vector signed long long __b) {
+#ifdef __POWER8_VECTOR__
   return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a, __b);
+#else
+  // Take advantage of the optimized sequence for vec_all_eq when vcmpequd is
+  // not available.
+  return !vec_all_eq(__a, __b);
+#endif
 }

 static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
                                               vector unsigned long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, (vector long long)__a,
-                                      (vector long long)__b);
+  return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_any_ne(vector signed long long __a,
                                               vector bool long long __b) {
-  return __builtin_altivec_vcmpequd_p(__CR6_LT_REV, __a,
-                                      (vector signed long long)__b);
+  return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned long long __a,
                                               vector bool long long __b) {
-  return __builtin_altivec_vcmpequd_p(
-      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+  return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
                                               vector signed long long __b) {
-  return __builtin_altivec_vcmpequd_p(
-      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+  return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
                                               vector unsigned long long __b) {
-  return __builtin_altivec_vcmpequd_p(
-      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+  return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
 }

 static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool long long __a,
                                               vector bool long long __b) {
-  return __builtin_altivec_vcmpequd_p(
-      __CR6_LT_REV, (vector signed long long)__a, (vector signed long long)__b);
+  return vec_any_ne((vector signed long long)__a, (vector signed long long)__b);
 }
 #endif

@@ -17086,6 +17131,11 @@ static __inline__ int __ATTRS_o_ai vec_any_ne(vector unsigned __int128 __a,
                                               vector unsigned __int128 __b) {
   return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
 }
+
+static __inline__ int __ATTRS_o_ai vec_any_ne(vector bool __int128 __a,
+                                              vector bool __int128 __b) {
+  return __builtin_altivec_vcmpequq_p(__CR6_LT_REV, __a, __b);
+}
 #endif

 /* vec_any_nge */
@@ -17203,6 +17253,7 @@ provided.
 #define vec_ncipher_be __builtin_altivec_crypto_vncipher
 #define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast

+#ifdef __VSX__
 static __inline__ vector unsigned long long __attribute__((__always_inline__))
 __builtin_crypto_vsbox(vector unsigned long long __a) {
   return __builtin_altivec_crypto_vsbox(__a);
@@ -17231,6 +17282,7 @@ __builtin_crypto_vncipherlast(vector unsigned long long __a,
                               vector unsigned long long __b) {
   return __builtin_altivec_crypto_vncipherlast(__a, __b);
 }
+#endif /* __VSX__ */

 #define __builtin_crypto_vshasigmad __builtin_altivec_crypto_vshasigmad
 #define __builtin_crypto_vshasigmaw __builtin_altivec_crypto_vshasigmaw
@@ -17346,12 +17398,22 @@ vec_vbpermq(vector unsigned char __a, vector unsigned char __b) {
 }

 #if defined(__powerpc64__) && defined(__SIZEOF_INT128__)
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
+static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_bperm(vector unsigned __int128 __a, vector unsigned char __b) {
   return __builtin_altivec_vbpermq((vector unsigned char)__a,
                                    (vector unsigned char)__b);
 }
 #endif
+static __inline__ vector unsigned char __ATTRS_o_ai
+vec_bperm(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermq(__a, __b);
+}
+#endif // __POWER8_VECTOR__
+#ifdef __POWER9_VECTOR__
+static __inline__ vector unsigned long long __ATTRS_o_ai
+vec_bperm(vector unsigned long long __a, vector unsigned char __b) {
+  return __builtin_altivec_vbpermd(__a, __b);
+}
 #endif


@@ -18198,13 +18260,13 @@ vec_expandm(vector unsigned __int128 __a) {

 #define vec_cntm(__a, __mp) \
   _Generic((__a), vector unsigned char \
-           : __builtin_altivec_vcntmbb((__a), (unsigned int)(__mp)), \
+           : __builtin_altivec_vcntmbb((__a), (unsigned char)(__mp)), \
           vector unsigned short \
-           : __builtin_altivec_vcntmbh((__a), (unsigned int)(__mp)), \
+           : __builtin_altivec_vcntmbh((__a), (unsigned char)(__mp)), \
           vector unsigned int \
-           : __builtin_altivec_vcntmbw((__a), (unsigned int)(__mp)), \
+           : __builtin_altivec_vcntmbw((__a), (unsigned char)(__mp)), \
           vector unsigned long long \
-           : __builtin_altivec_vcntmbd((__a), (unsigned int)(__mp)))
+           : __builtin_altivec_vcntmbd((__a), (unsigned char)(__mp)))

 /* vec_gen[b|h|w|d|q]m */

@@ -18319,10 +18381,10 @@ vec_cfuge(vector unsigned long long __a, vector unsigned long long __b) {
            : __builtin_vsx_xxgenpcvdm((__a), (int)(__imm)))
 #endif /* __VSX__ */

-/* vec_clrl */
+/* vec_clr_first */

 static __inline__ vector signed char __ATTRS_o_ai
-vec_clrl(vector signed char __a, unsigned int __n) {
+vec_clr_first(vector signed char __a, unsigned int __n) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vclrrb(__a, __n);
 #else
@@ -18331,7 +18393,7 @@ vec_clrl(vector signed char __a, unsigned int __n) {
 }

 static __inline__ vector unsigned char __ATTRS_o_ai
-vec_clrl(vector unsigned char __a, unsigned int __n) {
+vec_clr_first(vector unsigned char __a, unsigned int __n) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vclrrb((vector signed char)__a, __n);
 #else
@@ -18339,10 +18401,10 @@ vec_clrl(vector unsigned char __a, unsigned int __n) {
 #endif
 }

-/* vec_clrr */
+/* vec_clr_last */

 static __inline__ vector signed char __ATTRS_o_ai
-vec_clrr(vector signed char __a, unsigned int __n) {
+vec_clr_last(vector signed char __a, unsigned int __n) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vclrlb(__a, __n);
 #else
@@ -18351,7 +18413,7 @@ vec_clrr(vector signed char __a, unsigned int __n) {
 }

 static __inline__ vector unsigned char __ATTRS_o_ai
-vec_clrr(vector unsigned char __a, unsigned int __n) {
+vec_clr_last(vector unsigned char __a, unsigned int __n) {
 #ifdef __LITTLE_ENDIAN__
   return __builtin_altivec_vclrlb((vector signed char)__a, __n);
 #else
@@ -18733,36 +18795,39 @@ static __inline__ vector double __ATTRS_o_ai vec_splatid(const float __a) {

 static __inline__ vector signed int __ATTRS_o_ai vec_splati_ins(
     vector signed int __a, const unsigned int __b, const signed int __c) {
+  const unsigned int __d = __b & 0x01;
 #ifdef __LITTLE_ENDIAN__
-  __a[1 - __b] = __c;
-  __a[3 - __b] = __c;
+  __a[1 - __d] = __c;
+  __a[3 - __d] = __c;
 #else
-  __a[__b] = __c;
-  __a[2 + __b] = __c;
+  __a[__d] = __c;
+  __a[2 + __d] = __c;
 #endif
   return __a;
 }

 static __inline__ vector unsigned int __ATTRS_o_ai vec_splati_ins(
     vector unsigned int __a, const unsigned int __b, const unsigned int __c) {
+  const unsigned int __d = __b & 0x01;
 #ifdef __LITTLE_ENDIAN__
-  __a[1 - __b] = __c;
-  __a[3 - __b] = __c;
+  __a[1 - __d] = __c;
+  __a[3 - __d] = __c;
 #else
-  __a[__b] = __c;
-  __a[2 + __b] = __c;
+  __a[__d] = __c;
+  __a[2 + __d] = __c;
 #endif
   return __a;
 }

 static __inline__ vector float __ATTRS_o_ai
 vec_splati_ins(vector float __a, const unsigned int __b, const float __c) {
+  const unsigned int __d = __b & 0x01;
 #ifdef __LITTLE_ENDIAN__
-  __a[1 - __b] = __c;
-  __a[3 - __b] = __c;
+  __a[1 - __d] = __c;
+  __a[3 - __d] = __c;
 #else
-  __a[__b] = __c;
-  __a[2 + __b] = __c;
+  __a[__d] = __c;
+  __a[2 + __d] = __c;
 #endif
   return __a;
 }
@@ -18976,6 +19041,51 @@ vec_sra(vector signed __int128 __a, vector unsigned __int128 __b) {
 #endif /* __SIZEOF_INT128__ */
 #endif /* __POWER10_VECTOR__ */

+#ifdef __POWER8_VECTOR__
+#define __bcdadd(__a, __b, __ps) __builtin_ppc_bcdadd((__a), (__b), (__ps))
+#define __bcdsub(__a, __b, __ps) __builtin_ppc_bcdsub((__a), (__b), (__ps))
+
+static __inline__ long __bcdadd_ofl(vector unsigned char __a,
+                                    vector unsigned char __b) {
+  return __builtin_ppc_bcdadd_p(__CR6_SO, __a, __b);
+}
+
+static __inline__ long __bcdsub_ofl(vector unsigned char __a,
+                                    vector unsigned char __b) {
+  return __builtin_ppc_bcdsub_p(__CR6_SO, __a, __b);
+}
+
+static __inline__ long __bcd_invalid(vector unsigned char __a) {
+  return __builtin_ppc_bcdsub_p(__CR6_SO, __a, __a);
+}
+
+static __inline__ long __bcdcmpeq(vector unsigned char __a,
+                                  vector unsigned char __b) {
+  return __builtin_ppc_bcdsub_p(__CR6_EQ, __a, __b);
+}
+
+static __inline__ long __bcdcmplt(vector unsigned char __a,
+                                  vector unsigned char __b) {
+  return __builtin_ppc_bcdsub_p(__CR6_LT, __a, __b);
+}
+
+static __inline__ long __bcdcmpgt(vector unsigned char __a,
+                                  vector unsigned char __b) {
+  return __builtin_ppc_bcdsub_p(__CR6_GT, __a, __b);
+}
+
+static __inline__ long __bcdcmple(vector unsigned char __a,
+                                  vector unsigned char __b) {
+  return __builtin_ppc_bcdsub_p(__CR6_GT_REV, __a, __b);
+}
+
+static __inline__ long __bcdcmpge(vector unsigned char __a,
+                                  vector unsigned char __b) {
+  return __builtin_ppc_bcdsub_p(__CR6_LT_REV, __a, __b);
+}
+
+#endif // __POWER8_VECTOR__
+
 #undef __ATTRS_o_ai

 #endif /* __ALTIVEC_H */
lib/include/ammintrin.h (vendored): 4 lines changed
@@ -10,6 +10,10 @@
 #ifndef __AMMINTRIN_H
 #define __AMMINTRIN_H

+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
 #include <pmmintrin.h>

 /* Define the default attributes for the functions in this file. */
lib/include/amxintrin.h (vendored): 19 lines changed
@@ -314,7 +314,7 @@ typedef struct __tile1024i_str {
 /// \param stride
 ///    The stride between the rows' data to be loaded in memory.
 __DEFAULT_FN_ATTRS_TILE
-static void __tile_loadd(__tile1024i *dst, const void *base,
+static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
                          __SIZE_TYPE__ stride) {
   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
 }
@@ -335,7 +335,7 @@ static void __tile_loadd(__tile1024i *dst, const void *base,
 /// \param stride
 ///    The stride between the rows' data to be loaded in memory.
 __DEFAULT_FN_ATTRS_TILE
-static void __tile_stream_loadd(__tile1024i *dst, const void *base,
+static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
                                 __SIZE_TYPE__ stride) {
   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
 }
@@ -357,7 +357,7 @@ static void __tile_stream_loadd(__tile1024i *dst, const void *base,
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
+static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
                           __tile1024i src1) {
   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
                                     src0.tile, src1.tile);
@@ -380,7 +380,7 @@ static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
+static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
                           __tile1024i src1) {
   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
                                     src0.tile, src1.tile);
@@ -403,7 +403,7 @@ static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
+static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
                           __tile1024i src1) {
   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
                                     src0.tile, src1.tile);
@@ -426,7 +426,7 @@ static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_INT8
-static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
+static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
                           __tile1024i src1) {
   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
                                     src0.tile, src1.tile);
@@ -446,7 +446,8 @@ static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
 /// \param stride
 ///    The stride between the rows' data to be stored in memory.
 __DEFAULT_FN_ATTRS_TILE
-static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
+static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
+                                     __tile1024i src) {
   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
 }

@@ -459,7 +460,7 @@ static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) {
 /// \param dst
 ///    The destination tile to be zero. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_TILE
-static void __tile_zero(__tile1024i *dst) {
+static __inline__ void __tile_zero(__tile1024i *dst) {
   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
 }

@@ -479,7 +480,7 @@ static void __tile_zero(__tile1024i *dst) {
 /// \param src1
 ///    The 2nd source tile. Max size is 1024 Bytes.
 __DEFAULT_FN_ATTRS_BF16
-static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
+static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
                             __tile1024i src1) {
   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
                                       src0.tile, src1.tile);
lib/include/arm_acle.h (vendored): 6 lines changed
@@ -730,6 +730,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
 #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
 #endif

+/* Memory Operations Intrinsics */
+#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
+#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
+  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
+#endif
+
 /* Transactional Memory Extension (TME) Intrinsics */
 #if __ARM_FEATURE_TME

lib/include/arm_neon.h (vendored): 9420 lines changed (file diff suppressed because it is too large)
lib/include/avx2intrin.h (vendored): 204 lines changed
@@ -20,25 +20,25 @@

 /* SSE4 Multiple Packed Sums of Absolute Difference. */
 #define _mm256_mpsadbw_epu8(X, Y, M) \
-  (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
-                                     (__v32qi)(__m256i)(Y), (int)(M))
+  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                      (__v32qi)(__m256i)(Y), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi8(__m256i __a)
 {
-  return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+  return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi16(__m256i __a)
 {
-  return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+  return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_abs_epi32(__m256i __a)
 {
-  return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+  return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
@@ -114,8 +114,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b)
 }

 #define _mm256_alignr_epi8(a, b, n) \
-  (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
-                                     (__v32qi)(__m256i)(b), (n))
+  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+                                      (__v32qi)(__m256i)(b), (n)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_and_si256(__m256i __a, __m256i __b)
@@ -149,8 +149,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 }

 #define _mm256_blend_epi16(V1, V2, M) \
-  (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
-                                     (__v16hi)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+                                      (__v16hi)(__m256i)(V2), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
@@ -253,73 +253,73 @@ _mm256_madd_epi16(__m256i __a, __m256i __b)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi8(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+  return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi16(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+  return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epi32(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+  return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu8(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+  return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu16(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+  return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_max_epu32(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+  return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi8(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+  return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi16(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+  return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epi32(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+  return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu8(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+  return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu16(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+  return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
 }

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_min_epu32(__m256i __a, __m256i __b)
 {
-  return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
 }

 static __inline__ int __DEFAULT_FN_ATTRS256
@@ -467,13 +467,13 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 }

 #define _mm256_shuffle_epi32(a, imm) \
-  (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))

 #define _mm256_shufflehi_epi16(a, imm) \
-  (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))

 #define _mm256_shufflelo_epi16(a, imm) \
-  (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sign_epi8(__m256i __a, __m256i __b)
@@ -494,10 +494,10 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 }

 #define _mm256_slli_si256(a, imm) \
-  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))

 #define _mm256_bslli_epi128(a, imm) \
-  (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_slli_epi16(__m256i __a, int __count)
@@ -560,10 +560,10 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 }

 #define _mm256_srli_si256(a, imm) \
-  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))

 #define _mm256_bsrli_epi128(a, imm) \
-  (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))
+  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_srli_epi16(__m256i __a, int __count)
@@ -743,12 +743,12 @@ _mm256_broadcastsi128_si256(__m128i __X)
 #define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)

 #define _mm_blend_epi32(V1, V2, M) \
-  (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
-                                     (__v4si)(__m128i)(V2), (int)(M))
+  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+                                      (__v4si)(__m128i)(V2), (int)(M)))

 #define _mm256_blend_epi32(V1, V2, M) \
-  (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
-                                     (__v8si)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+                                      (__v8si)(__m256i)(V2), (int)(M)))

 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_broadcastb_epi8(__m128i __X)
@@ -806,7 +806,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 }

 #define _mm256_permute4x64_pd(V, M) \
-  (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))
+  ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))

 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
@@ -815,17 +815,17 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 }
 
 #define _mm256_permute4x64_epi64(V, M) \
-(__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))
+((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
 
 #define _mm256_permute2x128_si256(V1, V2, M) \
-(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))
+((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
 
 #define _mm256_extracti128_si256(V, M) \
-(__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))
+((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
 
 #define _mm256_inserti128_si256(V1, V2, M) \
-(__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
-(__v2di)(__m128i)(V2), (int)(M))
+((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+(__v2di)(__m128i)(V2), (int)(M)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_maskload_epi32(int const *__X, __m256i __M)
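Throughout this update each intrinsic macro body gains one extra pair of enclosing parentheses, so the expansion becomes a single parenthesized expression rather than a bare cast applied to a builtin call. A minimal sketch of why that can matter is below; the function name is illustrative, and it assumes an AVX2 target plus Clang/GCC vector subscripting. Postfix operators bind tighter than a cast, so without the outer parentheses a subscript written after the macro would attach to the builtin call instead of to the cast result.

#include <immintrin.h>

/* Illustrative only: with the added outer parentheses, the subscript below
 * indexes the __m256i produced by the macro as a whole; under the old
 * expansion it would have applied to the builtin call before the cast. */
static long long first_lane_after_permute(__m256i v) {
  return _mm256_permute4x64_epi64(v, 0x1B)[0];
}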
@ -936,211 +936,211 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
|
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
|
||||||
(__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
|
((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v2df)(__m128d)(mask), (s))
|
(__v2df)(__m128d)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
|
#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
|
||||||
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
|
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4df)(__m256d)(mask), (s))
|
(__v4df)(__m256d)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
|
#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
|
||||||
(__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
|
((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v2df)(__m128d)(mask), (s))
|
(__v2df)(__m128d)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
|
#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
|
||||||
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
|
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4df)(__m256d)(mask), (s))
|
(__v4df)(__m256d)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
|
#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
|
||||||
(__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
|
((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4sf)(__m128)(mask), (s))
|
(__v4sf)(__m128)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
|
#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
|
||||||
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
|
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v8si)(__m256i)(i), \
|
(__v8si)(__m256i)(i), \
|
||||||
(__v8sf)(__m256)(mask), (s))
|
(__v8sf)(__m256)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
|
#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
|
||||||
(__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
|
((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v4sf)(__m128)(mask), (s))
|
(__v4sf)(__m128)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
|
#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
|
||||||
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
|
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4sf)(__m128)(mask), (s))
|
(__v4sf)(__m128)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
|
#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
|
||||||
(__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
|
((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
|
||||||
(int const *)(m), \
|
(int const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4si)(__m128i)(mask), (s))
|
(__v4si)(__m128i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
|
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
|
||||||
(__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
|
((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
|
||||||
(int const *)(m), \
|
(int const *)(m), \
|
||||||
(__v8si)(__m256i)(i), \
|
(__v8si)(__m256i)(i), \
|
||||||
(__v8si)(__m256i)(mask), (s))
|
(__v8si)(__m256i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
|
#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
|
||||||
(__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
|
((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
|
||||||
(int const *)(m), \
|
(int const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v4si)(__m128i)(mask), (s))
|
(__v4si)(__m128i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
|
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
|
||||||
(__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
|
((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
|
||||||
(int const *)(m), \
|
(int const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4si)(__m128i)(mask), (s))
|
(__v4si)(__m128i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
|
#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
|
||||||
(__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
|
((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v2di)(__m128i)(mask), (s))
|
(__v2di)(__m128i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
|
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
|
||||||
(__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
|
((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4di)(__m256i)(mask), (s))
|
(__v4di)(__m256i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
|
#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
|
||||||
(__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
|
((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v2di)(__m128i)(mask), (s))
|
(__v2di)(__m128i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
|
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
|
||||||
(__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
|
((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4di)(__m256i)(mask), (s))
|
(__v4di)(__m256i)(mask), (s)))
|
||||||
|
|
||||||
#define _mm_i32gather_pd(m, i, s) \
|
#define _mm_i32gather_pd(m, i, s) \
|
||||||
(__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
|
((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
|
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
|
||||||
_mm_setzero_pd()), \
|
_mm_setzero_pd()), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm256_i32gather_pd(m, i, s) \
|
#define _mm256_i32gather_pd(m, i, s) \
|
||||||
(__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
|
((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
|
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
|
||||||
_mm256_setzero_pd(), \
|
_mm256_setzero_pd(), \
|
||||||
_CMP_EQ_OQ), \
|
_CMP_EQ_OQ), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm_i64gather_pd(m, i, s) \
|
#define _mm_i64gather_pd(m, i, s) \
|
||||||
(__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
|
((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
|
(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
|
||||||
_mm_setzero_pd()), \
|
_mm_setzero_pd()), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm256_i64gather_pd(m, i, s) \
|
#define _mm256_i64gather_pd(m, i, s) \
|
||||||
(__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
|
((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
|
||||||
(double const *)(m), \
|
(double const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
|
(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
|
||||||
_mm256_setzero_pd(), \
|
_mm256_setzero_pd(), \
|
||||||
_CMP_EQ_OQ), \
|
_CMP_EQ_OQ), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm_i32gather_ps(m, i, s) \
|
#define _mm_i32gather_ps(m, i, s) \
|
||||||
(__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
|
((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
|
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
|
||||||
_mm_setzero_ps()), \
|
_mm_setzero_ps()), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm256_i32gather_ps(m, i, s) \
|
#define _mm256_i32gather_ps(m, i, s) \
|
||||||
(__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
|
((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v8si)(__m256i)(i), \
|
(__v8si)(__m256i)(i), \
|
||||||
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
|
(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
|
||||||
_mm256_setzero_ps(), \
|
_mm256_setzero_ps(), \
|
||||||
_CMP_EQ_OQ), \
|
_CMP_EQ_OQ), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm_i64gather_ps(m, i, s) \
|
#define _mm_i64gather_ps(m, i, s) \
|
||||||
(__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
|
((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
|
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
|
||||||
_mm_setzero_ps()), \
|
_mm_setzero_ps()), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm256_i64gather_ps(m, i, s) \
|
#define _mm256_i64gather_ps(m, i, s) \
|
||||||
(__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
|
((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
|
||||||
(float const *)(m), \
|
(float const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
|
(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
|
||||||
_mm_setzero_ps()), \
|
_mm_setzero_ps()), \
|
||||||
(s))
|
(s)))
|
||||||
|
|
||||||
#define _mm_i32gather_epi32(m, i, s) \
|
#define _mm_i32gather_epi32(m, i, s) \
|
||||||
(__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
|
((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
|
||||||
(int const *)(m), (__v4si)(__m128i)(i), \
|
(int const *)(m), (__v4si)(__m128i)(i), \
|
||||||
(__v4si)_mm_set1_epi32(-1), (s))
|
(__v4si)_mm_set1_epi32(-1), (s)))
|
||||||
|
|
||||||
#define _mm256_i32gather_epi32(m, i, s) \
|
#define _mm256_i32gather_epi32(m, i, s) \
|
||||||
(__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
|
((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
|
||||||
(int const *)(m), (__v8si)(__m256i)(i), \
|
(int const *)(m), (__v8si)(__m256i)(i), \
|
||||||
(__v8si)_mm256_set1_epi32(-1), (s))
|
(__v8si)_mm256_set1_epi32(-1), (s)))
|
||||||
|
|
||||||
#define _mm_i64gather_epi32(m, i, s) \
|
#define _mm_i64gather_epi32(m, i, s) \
|
||||||
(__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
|
((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
|
||||||
(int const *)(m), (__v2di)(__m128i)(i), \
|
(int const *)(m), (__v2di)(__m128i)(i), \
|
||||||
(__v4si)_mm_set1_epi32(-1), (s))
|
(__v4si)_mm_set1_epi32(-1), (s)))
|
||||||
|
|
||||||
#define _mm256_i64gather_epi32(m, i, s) \
|
#define _mm256_i64gather_epi32(m, i, s) \
|
||||||
(__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
|
((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
|
||||||
(int const *)(m), (__v4di)(__m256i)(i), \
|
(int const *)(m), (__v4di)(__m256i)(i), \
|
||||||
(__v4si)_mm_set1_epi32(-1), (s))
|
(__v4si)_mm_set1_epi32(-1), (s)))
|
||||||
|
|
||||||
#define _mm_i32gather_epi64(m, i, s) \
|
#define _mm_i32gather_epi64(m, i, s) \
|
||||||
(__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
|
((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v2di)_mm_set1_epi64x(-1), (s))
|
(__v2di)_mm_set1_epi64x(-1), (s)))
|
||||||
|
|
||||||
#define _mm256_i32gather_epi64(m, i, s) \
|
#define _mm256_i32gather_epi64(m, i, s) \
|
||||||
(__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
|
((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v4si)(__m128i)(i), \
|
(__v4si)(__m128i)(i), \
|
||||||
(__v4di)_mm256_set1_epi64x(-1), (s))
|
(__v4di)_mm256_set1_epi64x(-1), (s)))
|
||||||
|
|
||||||
#define _mm_i64gather_epi64(m, i, s) \
|
#define _mm_i64gather_epi64(m, i, s) \
|
||||||
(__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
|
((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v2di)(__m128i)(i), \
|
(__v2di)(__m128i)(i), \
|
||||||
(__v2di)_mm_set1_epi64x(-1), (s))
|
(__v2di)_mm_set1_epi64x(-1), (s)))
|
||||||
|
|
||||||
#define _mm256_i64gather_epi64(m, i, s) \
|
#define _mm256_i64gather_epi64(m, i, s) \
|
||||||
(__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
|
((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
|
||||||
(long long const *)(m), \
|
(long long const *)(m), \
|
||||||
(__v4di)(__m256i)(i), \
|
(__v4di)(__m256i)(i), \
|
||||||
(__v4di)_mm256_set1_epi64x(-1), (s))
|
(__v4di)_mm256_set1_epi64x(-1), (s)))
|
||||||
|
|
||||||
#undef __DEFAULT_FN_ATTRS256
|
#undef __DEFAULT_FN_ATTRS256
|
||||||
#undef __DEFAULT_FN_ATTRS128
|
#undef __DEFAULT_FN_ATTRS128
|
||||||
6 lib/include/avx512bf16intrin.h vendored
@@ -232,7 +232,7 @@ _mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
 ///
 /// \param __A
 /// A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from convertion of __A
+/// \returns A 512-bit vector of [16 x float] come from conversion of __A
 static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
 return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
 (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
@@ -247,7 +247,7 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
 /// bit is not set.
 /// \param __A
 /// A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from convertion of __A
+/// \returns A 512-bit vector of [16 x float] come from conversion of __A
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
 return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
@@ -265,7 +265,7 @@ _mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
 /// A 16-bit mask.
 /// \param __A
 /// A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from convertion of __A
+/// \returns A 512-bit vector of [16 x float] come from conversion of __A
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
 return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
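The conversions above widen bfloat16 to float purely by bit manipulation: a bfloat16 value is the upper 16 bits of an IEEE-754 single, so after widening each lane to 32 bits the vector code only shifts left by 16. A scalar sketch of the same idea (helper name is illustrative):

#include <stdint.h>
#include <string.h>

/* Scalar equivalent of one lane of _mm512_cvtpbh_ps: place the bf16 bits in
 * the high half of a 32-bit pattern and reinterpret it as a float. */
static inline float bf16_bits_to_float(uint16_t bits) {
  uint32_t widened = (uint32_t)bits << 16;
  float result;
  memcpy(&result, &widened, sizeof(result));
  return result;
}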
108 lib/include/avx512bwintrin.h vendored
@@ -178,16 +178,16 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B)
 }
 
 #define _kshiftli_mask32(A, I) \
-(__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))
+((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)))
 
 #define _kshiftri_mask32(A, I) \
-(__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))
+((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)))
 
 #define _kshiftli_mask64(A, I) \
-(__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))
+((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)))
 
 #define _kshiftri_mask64(A, I) \
-(__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))
+((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)))
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _cvtmask32_u32(__mmask32 __A) {
@@ -232,44 +232,44 @@ _store_mask64(__mmask64 *__A, __mmask64 __B) {
 /* Integer compare */
 
 #define _mm512_cmp_epi8_mask(a, b, p) \
-(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)-1)
+(__mmask64)-1))
 
 #define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
-(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)(m))
+(__mmask64)(m)))
 
 #define _mm512_cmp_epu8_mask(a, b, p) \
-(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)-1)
+(__mmask64)-1))
 
 #define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
-(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
 (__v64qi)(__m512i)(b), (int)(p), \
-(__mmask64)(m))
+(__mmask64)(m)))
 
 #define _mm512_cmp_epi16_mask(a, b, p) \
-(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)-1)
+(__mmask32)-1))
 
 #define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
-(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)(m))
+(__mmask32)(m)))
 
 #define _mm512_cmp_epu16_mask(a, b, p) \
-(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)-1)
+(__mmask32)-1))
 
 #define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
-(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
 (__v32hi)(__m512i)(b), (int)(p), \
-(__mmask32)(m))
+(__mmask32)(m)))
 
 #define _mm512_cmpeq_epi8_mask(A, B) \
 _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
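These compare macros return a plain __mmask64 or __mmask32 with one bit per lane, so the result can be consumed by ordinary integer code. A small usage sketch (function name illustrative, an AVX-512BW target assumed):

#include <immintrin.h>

/* Count how many byte lanes of two 512-bit vectors are equal by
 * popcounting the 64-bit compare mask. */
static int count_equal_bytes(__m512i a, __m512i b) {
  __mmask64 eq = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
  return __builtin_popcountll((unsigned long long)eq);
}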
@@ -485,7 +485,7 @@ _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_abs_epi8 (__m512i __A)
 {
-return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A);
+return (__m512i)__builtin_elementwise_abs((__v64qs)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -507,7 +507,7 @@ _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_abs_epi16 (__m512i __A)
 {
-return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A);
+return (__m512i)__builtin_elementwise_abs((__v32hi)__A);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -751,7 +751,7 @@ _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epi8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B);
+return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -773,7 +773,7 @@ _mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epi16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B);
+return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -796,7 +796,7 @@ _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epu8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B);
+return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -818,7 +818,7 @@ _mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_max_epu16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B);
+return (__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -840,7 +840,7 @@ _mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epi8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B);
+return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -862,7 +862,7 @@ _mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epi16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B);
+return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -884,7 +884,7 @@ _mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epu8 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B);
+return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
@@ -906,7 +906,7 @@ _mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_min_epu16 (__m512i __A, __m512i __B)
 {
-return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B);
+return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
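The hunks above replace the per-width x86 builtins (pabs*, pmaxs*, pmaxu*, pmins*, pminu*) with Clang's generic __builtin_elementwise_abs, __builtin_elementwise_min and __builtin_elementwise_max, which operate lane-wise on any vector type; the signed/unsigned distinction is now carried by the vector element type (__v64qs versus __v64qu, and so on). A minimal sketch of the generic builtins on a plain vector type (Clang-specific; the typedef and function name are illustrative):

typedef signed char v16qs __attribute__((__vector_size__(16)));

/* Per-lane absolute value followed by a per-lane clamp, using the same
 * generic elementwise builtins the header now relies on. */
static v16qs abs_then_clamp(v16qs a, v16qs limit) {
  return __builtin_elementwise_min(__builtin_elementwise_abs(a), limit);
}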
@@ -1428,36 +1428,36 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
 
 
 #define _mm512_shufflehi_epi16(A, imm) \
-(__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))
+((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)))
 
 #define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
-(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflehi_epi16((A), \
 (imm)), \
-(__v32hi)(__m512i)(W))
+(__v32hi)(__m512i)(W)))
 
 #define _mm512_maskz_shufflehi_epi16(U, A, imm) \
-(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflehi_epi16((A), \
 (imm)), \
-(__v32hi)_mm512_setzero_si512())
+(__v32hi)_mm512_setzero_si512()))
 
 #define _mm512_shufflelo_epi16(A, imm) \
-(__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))
+((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)))
 
 
 #define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
-(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflelo_epi16((A), \
 (imm)), \
-(__v32hi)(__m512i)(W))
+(__v32hi)(__m512i)(W)))
 
 
 #define _mm512_maskz_shufflelo_epi16(U, A, imm) \
-(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_shufflelo_epi16((A), \
 (imm)), \
-(__v32hi)_mm512_setzero_si512())
+(__v32hi)_mm512_setzero_si512()))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_sllv_epi16(__m512i __A, __m512i __B)
@@ -1527,7 +1527,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
 }
 
 #define _mm512_bslli_epi128(a, imm) \
-(__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
+((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_srlv_epi16(__m512i __A, __m512i __B)
@@ -1664,7 +1664,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
 }
 
 #define _mm512_bsrli_epi128(a, imm) \
-(__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))
+((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
@@ -1984,32 +1984,32 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
 }
 
 #define _mm512_alignr_epi8(A, B, N) \
-(__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
-(__v64qi)(__m512i)(B), (int)(N))
+((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
+(__v64qi)(__m512i)(B), (int)(N)))
 
 #define _mm512_mask_alignr_epi8(W, U, A, B, N) \
-(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
 (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
-(__v64qi)(__m512i)(W))
+(__v64qi)(__m512i)(W)))
 
 #define _mm512_maskz_alignr_epi8(U, A, B, N) \
-(__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
 (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
-(__v64qi)(__m512i)_mm512_setzero_si512())
+(__v64qi)(__m512i)_mm512_setzero_si512()))
 
 #define _mm512_dbsad_epu8(A, B, imm) \
-(__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
-(__v64qi)(__m512i)(B), (int)(imm))
+((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
+(__v64qi)(__m512i)(B), (int)(imm)))
 
 #define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
-(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
-(__v32hi)(__m512i)(W))
+(__v32hi)(__m512i)(W)))
 
 #define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
-(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
 (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
-(__v32hi)_mm512_setzero_si512())
+(__v32hi)_mm512_setzero_si512()))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_sad_epu8 (__m512i __A, __m512i __B)
444 lib/include/avx512dqintrin.h vendored
@@ -121,10 +121,10 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B)
 }
 
 #define _kshiftli_mask8(A, I) \
-(__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))
+((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)))
 
 #define _kshiftri_mask8(A, I) \
-(__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))
+((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)))
 
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _cvtmask8_u32(__mmask8 __A) {
@@ -342,19 +342,19 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
 }
 
 #define _mm512_cvt_roundpd_epi64(A, R) \
-(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)-1, (int)(R))
+(__mmask8)-1, (int)(R)))
 
 #define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
-(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)(__m512i)(W), \
-(__mmask8)(U), (int)(R))
+(__mmask8)(U), (int)(R)))
 
 #define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
-(__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
 (__v8di)_mm512_setzero_si512(), \
-(__mmask8)(U), (int)(R))
+(__mmask8)(U), (int)(R)))
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_cvtpd_epu64 (__m512d __A) {
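The _round variants above take an explicit static rounding and exception-suppression operand instead of using the current MXCSR state. A usage sketch (function name illustrative, an AVX-512DQ target assumed):

#include <immintrin.h>

/* Convert eight doubles to signed 64-bit integers, rounding to nearest even
 * and suppressing floating-point exceptions. */
static __m512i pd_to_epi64_nearest(__m512d x) {
  return _mm512_cvt_roundpd_epi64(
      x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}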
@ -381,19 +381,19 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvt_roundpd_epu64(A, R) \
|
#define _mm512_cvt_roundpd_epu64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
|
#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
|
#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvtps_epi64 (__m256 __A) {
|
_mm512_cvtps_epi64 (__m256 __A) {
|
||||||
@ -420,19 +420,19 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvt_roundps_epi64(A, R) \
|
#define _mm512_cvt_roundps_epi64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
|
#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
|
#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvtps_epu64 (__m256 __A) {
|
_mm512_cvtps_epu64 (__m256 __A) {
|
||||||
@ -459,19 +459,19 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvt_roundps_epu64(A, R) \
|
#define _mm512_cvt_roundps_epu64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
|
#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
|
#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
|
|
||||||
static __inline__ __m512d __DEFAULT_FN_ATTRS512
|
static __inline__ __m512d __DEFAULT_FN_ATTRS512
|
||||||
@ -494,19 +494,19 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvt_roundepi64_pd(A, R) \
|
#define _mm512_cvt_roundepi64_pd(A, R) \
|
||||||
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
|
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8df)_mm512_setzero_pd(), \
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
|
#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
|
||||||
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
|
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8df)(__m512d)(W), \
|
(__v8df)(__m512d)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
|
#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
|
||||||
(__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
|
((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8df)_mm512_setzero_pd(), \
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m256 __DEFAULT_FN_ATTRS512
|
static __inline__ __m256 __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvtepi64_ps (__m512i __A) {
|
_mm512_cvtepi64_ps (__m512i __A) {
|
||||||
@ -533,19 +533,19 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvt_roundepi64_ps(A, R) \
|
#define _mm512_cvt_roundepi64_ps(A, R) \
|
||||||
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
|
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8sf)_mm256_setzero_ps(), \
|
(__v8sf)_mm256_setzero_ps(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
|
#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
|
||||||
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
|
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8sf)(__m256)(W), (__mmask8)(U), \
|
(__v8sf)(__m256)(W), (__mmask8)(U), \
|
||||||
(int)(R))
|
(int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
|
#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
|
||||||
(__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
|
((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8sf)_mm256_setzero_ps(), \
|
(__v8sf)_mm256_setzero_ps(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
|
|
||||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||||
@ -573,19 +573,19 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvtt_roundpd_epi64(A, R) \
|
#define _mm512_cvtt_roundpd_epi64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
|
#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
|
#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvttpd_epu64 (__m512d __A) {
|
_mm512_cvttpd_epu64 (__m512d __A) {
|
||||||
@ -612,19 +612,19 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvtt_roundpd_epu64(A, R) \
|
#define _mm512_cvtt_roundpd_epu64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
|
#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
|
#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
|
((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvttps_epi64 (__m256 __A) {
|
_mm512_cvttps_epi64 (__m256 __A) {
|
||||||
@ -651,19 +651,19 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvtt_roundps_epi64(A, R) \
|
#define _mm512_cvtt_roundps_epi64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
|
#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
|
#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
static __inline__ __m512i __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvttps_epu64 (__m256 __A) {
|
_mm512_cvttps_epu64 (__m256 __A) {
|
||||||
@ -690,19 +690,19 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvtt_roundps_epu64(A, R) \
|
#define _mm512_cvtt_roundps_epu64(A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
|
#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)(__m512i)(W), \
|
(__v8di)(__m512i)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
|
#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
|
||||||
(__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
|
((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
|
||||||
(__v8di)_mm512_setzero_si512(), \
|
(__v8di)_mm512_setzero_si512(), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
static __inline__ __m512d __DEFAULT_FN_ATTRS512
|
static __inline__ __m512d __DEFAULT_FN_ATTRS512
|
||||||
_mm512_cvtepu64_pd (__m512i __A) {
|
_mm512_cvtepu64_pd (__m512i __A) {
|
||||||
@ -724,20 +724,20 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define _mm512_cvt_roundepu64_pd(A, R) \
|
#define _mm512_cvt_roundepu64_pd(A, R) \
|
||||||
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
|
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8df)_mm512_setzero_pd(), \
|
(__v8df)_mm512_setzero_pd(), \
|
||||||
(__mmask8)-1, (int)(R))
|
(__mmask8)-1, (int)(R)))
|
||||||
|
|
||||||
#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
|
#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
|
||||||
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
|
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8df)(__m512d)(W), \
|
(__v8df)(__m512d)(W), \
|
||||||
(__mmask8)(U), (int)(R))
|
(__mmask8)(U), (int)(R)))
|
||||||
|
|
||||||
|
|
||||||
#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
|
#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
|
||||||
(__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
|
((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
|
||||||
(__v8df)_mm512_setzero_pd(), \
|
    (__v8df)_mm512_setzero_pd(), \
-   (__mmask8)(U), (int)(R))
+   (__mmask8)(U), (int)(R)))

static __inline__ __m256 __DEFAULT_FN_ATTRS512
@ -765,290 +765,290 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
}

#define _mm512_cvt_roundepu64_ps(A, R) \
-  (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+  ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
     (__v8sf)_mm256_setzero_ps(), \
-    (__mmask8)-1, (int)(R))
+    (__mmask8)-1, (int)(R)))

The same mechanical change runs through the rest of this hunk: each macro expansion
gains one extra pair of enclosing parentheses, an opening "(" before the result cast
and a matching ")" after the final argument. The affected macros are
_mm512_mask_cvt_roundepu64_ps, _mm512_maskz_cvt_roundepu64_ps, _mm512_range_pd,
_mm512_mask_range_pd, _mm512_maskz_range_pd, _mm512_range_round_pd,
_mm512_mask_range_round_pd, _mm512_maskz_range_round_pd, _mm512_range_ps,
_mm512_mask_range_ps, _mm512_maskz_range_ps, _mm512_range_round_ps,
_mm512_mask_range_round_ps, _mm512_maskz_range_round_ps, _mm_range_round_ss,
_mm_mask_range_round_ss, _mm_maskz_range_round_ss, _mm_range_round_sd,
_mm_mask_range_round_sd, _mm_maskz_range_round_sd, _mm512_reduce_pd,
_mm512_mask_reduce_pd, _mm512_maskz_reduce_pd, _mm512_reduce_ps,
_mm512_mask_reduce_ps, _mm512_maskz_reduce_ps, _mm512_reduce_round_pd,
_mm512_mask_reduce_round_pd, _mm512_maskz_reduce_round_pd, _mm512_reduce_round_ps,
_mm512_mask_reduce_round_ps, _mm512_maskz_reduce_round_ps, _mm_reduce_ss,
_mm_mask_reduce_ss, _mm_maskz_reduce_ss, _mm_reduce_round_ss, _mm_mask_reduce_round_ss,
_mm_maskz_reduce_round_ss, _mm_reduce_sd, _mm_mask_reduce_sd, _mm_maskz_reduce_sd,
_mm_reduce_round_sd, _mm_mask_reduce_round_sd, and _mm_maskz_reduce_round_sd.
The convenience wrappers _mm_range_ss, _mm_mask_range_ss, _mm_maskz_range_ss,
_mm_range_sd, _mm_mask_range_sd, and _mm_maskz_range_sd appear only as unchanged
context lines.

static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_movepi32_mask (__m512i __A)
@ -1218,158 +1218,158 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
}

#define _mm512_extractf32x8_ps(A, imm) \
-  (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+  ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
     (__v8sf)_mm256_undefined_ps(), \
-    (__mmask8)-1)
+    (__mmask8)-1))

The identical extra parenthesization is applied in this hunk to
_mm512_mask_extractf32x8_ps, _mm512_maskz_extractf32x8_ps, _mm512_extractf64x2_pd,
_mm512_mask_extractf64x2_pd, _mm512_maskz_extractf64x2_pd, _mm512_extracti32x8_epi32,
_mm512_mask_extracti32x8_epi32, _mm512_maskz_extracti32x8_epi32,
_mm512_extracti64x2_epi64, _mm512_mask_extracti64x2_epi64,
_mm512_maskz_extracti64x2_epi64, _mm512_insertf32x8, _mm512_mask_insertf32x8,
_mm512_maskz_insertf32x8, _mm512_insertf64x2, _mm512_mask_insertf64x2,
_mm512_maskz_insertf64x2, _mm512_inserti32x8, _mm512_mask_inserti32x8,
_mm512_maskz_inserti32x8, _mm512_inserti64x2, _mm512_mask_inserti64x2,
_mm512_maskz_inserti64x2, _mm512_mask_fpclass_ps_mask, _mm512_fpclass_ps_mask,
_mm512_mask_fpclass_pd_mask, _mm512_fpclass_pd_mask, _mm_fpclass_sd_mask,
_mm_mask_fpclass_sd_mask, _mm_fpclass_ss_mask, and _mm_mask_fpclass_ss_mask.

#undef __DEFAULT_FN_ATTRS512
#undef __DEFAULT_FN_ATTRS
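Why the extra parentheses matter: these intrinsics are function-like macros, and the
update makes each one expand to a single parenthesized expression, so the expansion
can no longer bind unexpectedly to operators or casts at the call site. A minimal
sketch of the hazard, using made-up macro names (CVT_BAD and CVT_GOOD are
illustrative only, not from the header):

```c
#include <stdio.h>

/* Expansion ends in an expression with no enclosing parentheses. */
#define CVT_BAD(x)  (int)(x) + 1
/* Expansion is a single parenthesized expression, as in the updated headers. */
#define CVT_GOOD(x) ((int)(x) + 1)

int main(void) {
    /* The multiplication grabs only "(int)(2.0)" from CVT_BAD, so the results differ. */
    printf("%d\n", 3 * CVT_BAD(2.0));  /* 3*2 + 1 = 7 */
    printf("%d\n", 3 * CVT_GOOD(2.0)); /* 3*(2 + 1) = 9 */
    return 0;
}
```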
lib/include/avx512erintrin.h (vendored, 120 lines changed)
@ -15,19 +15,19 @@

/* exp2a23 */
#define _mm512_exp2a23_round_pd(A, R) \
-  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+  ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
     (__v8df)_mm512_setzero_pd(), \
-    (__mmask8)-1, (int)(R))
+    (__mmask8)-1, (int)(R)))

The remaining hunks of this file (@ -39,19, @ -64,19, @ -88,19, @ -112,22, @ -139,22,
@ -167,19, @ -191,19, @ -215,22 and @ -242,22) apply the same extra parenthesization
to _mm512_mask_exp2a23_round_pd, _mm512_maskz_exp2a23_round_pd,
_mm512_exp2a23_round_ps, _mm512_mask_exp2a23_round_ps, _mm512_maskz_exp2a23_round_ps,
_mm512_rsqrt28_round_pd, _mm512_mask_rsqrt28_round_pd, _mm512_maskz_rsqrt28_round_pd,
_mm512_rsqrt28_round_ps, _mm512_mask_rsqrt28_round_ps, _mm512_maskz_rsqrt28_round_ps,
_mm_rsqrt28_round_ss, _mm_mask_rsqrt28_round_ss, _mm_maskz_rsqrt28_round_ss,
_mm_rsqrt28_round_sd, _mm_mask_rsqrt28_round_sd, _mm_maskz_rsqrt28_round_sd,
_mm512_rcp28_round_pd, _mm512_mask_rcp28_round_pd, _mm512_maskz_rcp28_round_pd,
_mm512_rcp28_round_ps, _mm512_mask_rcp28_round_ps, _mm512_maskz_rcp28_round_ps,
_mm_rcp28_round_ss, _mm_mask_rcp28_round_ss, _mm_maskz_rcp28_round_ss,
_mm_rcp28_round_sd, _mm_mask_rcp28_round_sd, and _mm_maskz_rcp28_round_sd.
The non-rounding wrappers (_mm512_exp2a23_pd, _mm512_rsqrt28_pd, _mm512_rcp28_ps,
_mm_rsqrt28_ss, _mm_rcp28_sd, and so on) appear only as unchanged context lines.
lib/include/avx512fintrin.h (vendored, 1850 lines changed; diff suppressed because it is too large)
lib/include/avx512fp16intrin.h (vendored, new file, 3349 lines; diff suppressed because it is too large)
lib/include/avx512vbmi2intrin.h (vendored, 72 lines changed)
@ -129,88 +129,88 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
}

#define _mm512_shldi_epi64(A, B, I) \
-  (__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
+  ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
-    (__v8di)(__m512i)(B), (int)(I))
+    (__v8di)(__m512i)(B), (int)(I)))

The same extra parenthesization is applied to _mm512_mask_shldi_epi64,
_mm512_maskz_shldi_epi64, _mm512_shldi_epi32, _mm512_mask_shldi_epi32,
_mm512_maskz_shldi_epi32, _mm512_shldi_epi16, _mm512_mask_shldi_epi16,
_mm512_maskz_shldi_epi16, _mm512_shrdi_epi64, _mm512_mask_shrdi_epi64,
_mm512_maskz_shrdi_epi64, _mm512_shrdi_epi32, _mm512_mask_shrdi_epi32,
_mm512_maskz_shrdi_epi32, _mm512_shrdi_epi16, _mm512_mask_shrdi_epi16, and
_mm512_maskz_shrdi_epi16 (the mask and maskz variants wrap the corresponding
__builtin_ia32_select*_512 calls).

static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
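For readers unfamiliar with these intrinsics, the shldi/shrdi macros above wrap the
VPSHLD/VPSHRD funnel-shift instructions. A rough scalar sketch of one 64-bit lane,
based on my reading of the instruction-set reference rather than on anything in the
header (the helper names shldi64 and shrdi64 are hypothetical):

```c
#include <stdint.h>

/* One lane of _mm512_shldi_epi64(A, B, I): concatenate a (high) and b (low),
 * shift the 128-bit value left by I, keep the high 64 bits. */
static uint64_t shldi64(uint64_t a, uint64_t b, unsigned imm) {
    unsigned s = imm & 63;                    /* shift count is taken modulo 64 */
    return s ? (a << s) | (b >> (64 - s)) : a;
}

/* One lane of _mm512_shrdi_epi64(A, B, I): concatenate b (high) and a (low),
 * shift the 128-bit value right by I, keep the low 64 bits. */
static uint64_t shrdi64(uint64_t a, uint64_t b, unsigned imm) {
    unsigned s = imm & 63;
    return s ? (a >> s) | (b << (64 - s)) : a;
}
```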
54
lib/include/avx512vlbf16intrin.h
vendored
54
lib/include/avx512vlbf16intrin.h
vendored
@ -420,18 +420,46 @@ static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
|
|||||||
   return __R[0];
 }
 
+/// Convert Packed BF16 Data to Packed float Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __A
+///    A 128-bit vector of [4 x bfloat].
+/// \returns A 128-bit vector of [4 x float] come from conversion of __A
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+  return _mm_castsi128_ps(
+      (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
+}
+
 /// Convert Packed BF16 Data to Packed float Data.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from convertion of __A
+/// \returns A 256-bit vector of [8 x float] come from conversion of __A
 static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
       (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
 }
 
+/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __U
+///    A 4-bit mask. Elements are zeroed out when the corresponding mask
+///    bit is not set.
+/// \param __A
+///    A 128-bit vector of [4 x bfloat].
+/// \returns A 128-bit vector of [4 x float] come from conversion of __A
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
+  return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
+      (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+}
+
 /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
 ///
 /// \headerfile <x86intrin.h>
@@ -441,13 +469,33 @@ static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
 ///    bit is not set.
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from convertion of __A
+/// \returns A 256-bit vector of [8 x float] come from conversion of __A
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
       (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
 }
 
+/// Convert Packed BF16 Data to Packed float Data using merging mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __S
+///    A 128-bit vector of [4 x float]. Elements are copied from __S when
+///    the corresponding mask bit is not set.
+/// \param __U
+///    A 4-bit mask. Elements are zeroed out when the corresponding mask
+///    bit is not set.
+/// \param __A
+///    A 128-bit vector of [4 x bfloat].
+/// \returns A 128-bit vector of [4 x float] come from conversion of __A
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
+  return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
+      (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
+      16));
+}
+
 /// Convert Packed BF16 Data to Packed float Data using merging mask.
 ///
 /// \headerfile <x86intrin.h>
@@ -460,7 +508,7 @@ _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
 ///    bit is not set.
 /// \param __A
 ///    A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from convertion of __A
+/// \returns A 256-bit vector of [8 x float] come from conversion of __A
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
   return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
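The new _mm*_cvtpbh_ps helpers above widen packed BF16 values to float by moving each 16-bit value into the upper half of a 32-bit lane. A short usage sketch (hypothetical, assuming a compiler and target with AVX512BF16 and AVX512VL enabled):

#include <immintrin.h>

/* Convert 8 bf16 values to 8 floats; the maskz variant zeroes lanes whose
   mask bit is clear instead of converting them. */
__m256 widen_bf16(__m128bh v, __mmask8 k) {
  __m256 all    = _mm256_cvtpbh_ps(v);
  __m256 masked = _mm256_maskz_cvtpbh_ps(k, v);
  return _mm256_add_ps(all, masked);
}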
136 lib/include/avx512vlbwintrin.h vendored
@@ -21,84 +21,84 @@
 /* Integer compare */
 
 #define _mm_cmp_epi8_mask(a, b, p) \
-  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+  ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
      (__v16qi)(__m128i)(b), (int)(p), \
-     (__mmask16)-1)
+     (__mmask16)-1))
 
 #define _mm_mask_cmp_epi8_mask(m, a, b, p) \
-  (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+  ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
      (__v16qi)(__m128i)(b), (int)(p), \
-     (__mmask16)(m))
+     (__mmask16)(m)))
 
 #define _mm_cmp_epu8_mask(a, b, p) \
-  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+  ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
      (__v16qi)(__m128i)(b), (int)(p), \
-     (__mmask16)-1)
+     (__mmask16)-1))
 
 #define _mm_mask_cmp_epu8_mask(m, a, b, p) \
-  (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+  ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
      (__v16qi)(__m128i)(b), (int)(p), \
-     (__mmask16)(m))
+     (__mmask16)(m)))
 
 #define _mm256_cmp_epi8_mask(a, b, p) \
-  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+  ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
      (__v32qi)(__m256i)(b), (int)(p), \
-     (__mmask32)-1)
+     (__mmask32)-1))
 
 #define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
-  (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+  ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
      (__v32qi)(__m256i)(b), (int)(p), \
-     (__mmask32)(m))
+     (__mmask32)(m)))
 
 #define _mm256_cmp_epu8_mask(a, b, p) \
-  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+  ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
      (__v32qi)(__m256i)(b), (int)(p), \
-     (__mmask32)-1)
+     (__mmask32)-1))
 
 #define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
-  (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+  ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
      (__v32qi)(__m256i)(b), (int)(p), \
-     (__mmask32)(m))
+     (__mmask32)(m)))
 
 #define _mm_cmp_epi16_mask(a, b, p) \
-  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+  ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
      (__v8hi)(__m128i)(b), (int)(p), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm_mask_cmp_epi16_mask(m, a, b, p) \
-  (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+  ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
      (__v8hi)(__m128i)(b), (int)(p), \
-     (__mmask8)(m))
+     (__mmask8)(m)))
 
 #define _mm_cmp_epu16_mask(a, b, p) \
-  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+  ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
      (__v8hi)(__m128i)(b), (int)(p), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm_mask_cmp_epu16_mask(m, a, b, p) \
-  (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+  ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
      (__v8hi)(__m128i)(b), (int)(p), \
-     (__mmask8)(m))
+     (__mmask8)(m)))
 
 #define _mm256_cmp_epi16_mask(a, b, p) \
-  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+  ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
      (__v16hi)(__m256i)(b), (int)(p), \
-     (__mmask16)-1)
+     (__mmask16)-1))
 
 #define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
-  (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+  ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
      (__v16hi)(__m256i)(b), (int)(p), \
-     (__mmask16)(m))
+     (__mmask16)(m)))
 
 #define _mm256_cmp_epu16_mask(a, b, p) \
-  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+  ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
      (__v16hi)(__m256i)(b), (int)(p), \
-     (__mmask16)-1)
+     (__mmask16)-1))
 
 #define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
-  (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+  ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
      (__v16hi)(__m256i)(b), (int)(p), \
-     (__mmask16)(m))
+     (__mmask16)(m)))
 
 #define _mm_cmpeq_epi8_mask(A, B) \
     _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
@@ -1821,46 +1821,46 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
 
 
 #define _mm_mask_shufflehi_epi16(W, U, A, imm) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-     (__v8hi)(__m128i)(W))
+     (__v8hi)(__m128i)(W)))
 
 #define _mm_maskz_shufflehi_epi16(U, A, imm) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
-     (__v8hi)_mm_setzero_si128())
+     (__v8hi)_mm_setzero_si128()))
 
 #define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-     (__v16hi)(__m256i)(W))
+     (__v16hi)(__m256i)(W)))
 
 #define _mm256_maskz_shufflehi_epi16(U, A, imm) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
-     (__v16hi)_mm256_setzero_si256())
+     (__v16hi)_mm256_setzero_si256()))
 
 #define _mm_mask_shufflelo_epi16(W, U, A, imm) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-     (__v8hi)(__m128i)(W))
+     (__v8hi)(__m128i)(W)))
 
 #define _mm_maskz_shufflelo_epi16(U, A, imm) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
-     (__v8hi)_mm_setzero_si128())
+     (__v8hi)_mm_setzero_si128()))
 
 #define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shufflelo_epi16((A), \
      (imm)), \
-     (__v16hi)(__m256i)(W))
+     (__v16hi)(__m256i)(W)))
 
 #define _mm256_maskz_shufflelo_epi16(U, A, imm) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shufflelo_epi16((A), \
      (imm)), \
-     (__v16hi)_mm256_setzero_si256())
+     (__v16hi)_mm256_setzero_si256()))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_sllv_epi16(__m256i __A, __m256i __B)
@@ -2756,52 +2756,52 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
 }
 
 #define _mm_mask_alignr_epi8(W, U, A, B, N) \
-  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
      (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
-     (__v16qi)(__m128i)(W))
+     (__v16qi)(__m128i)(W)))
 
 #define _mm_maskz_alignr_epi8(U, A, B, N) \
-  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
      (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
-     (__v16qi)_mm_setzero_si128())
+     (__v16qi)_mm_setzero_si128()))
 
 #define _mm256_mask_alignr_epi8(W, U, A, B, N) \
-  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
      (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
-     (__v32qi)(__m256i)(W))
+     (__v32qi)(__m256i)(W)))
 
 #define _mm256_maskz_alignr_epi8(U, A, B, N) \
-  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
      (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
-     (__v32qi)_mm256_setzero_si256())
+     (__v32qi)_mm256_setzero_si256()))
 
 #define _mm_dbsad_epu8(A, B, imm) \
-  (__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
-     (__v16qi)(__m128i)(B), (int)(imm))
+  ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
+     (__v16qi)(__m128i)(B), (int)(imm)))
 
 #define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
-     (__v8hi)(__m128i)(W))
+     (__v8hi)(__m128i)(W)))
 
 #define _mm_maskz_dbsad_epu8(U, A, B, imm) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
-     (__v8hi)_mm_setzero_si128())
+     (__v8hi)_mm_setzero_si128()))
 
 #define _mm256_dbsad_epu8(A, B, imm) \
-  (__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
-     (__v32qi)(__m256i)(B), (int)(imm))
+  ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
+     (__v32qi)(__m256i)(B), (int)(imm)))
 
 #define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
-     (__v16hi)(__m256i)(W))
+     (__v16hi)(__m256i)(W)))
 
 #define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
-     (__v16hi)_mm256_setzero_si256())
+     (__v16hi)_mm256_setzero_si256()))
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
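The compare macros above expand to a mask-typed expression, so the result can be fed straight into bit-counting or masked operations. A sketch of typical use (hypothetical, assuming an AVX512BW + AVX512VL target):

#include <immintrin.h>

/* Count how many of the 32 byte lanes of a equal the corresponding lane of b. */
int count_equal_bytes(__m256i a, __m256i b) {
  __mmask32 k = _mm256_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
  return __builtin_popcount((unsigned int)k);
}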
176 lib/include/avx512vldqintrin.h vendored
@@ -773,134 +773,134 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
 }
 
 #define _mm_range_pd(A, B, C) \
-  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm_mask_range_pd(W, U, A, B, C) \
-  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)(__m128d)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm_maskz_range_pd(U, A, B, C) \
-  (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+  ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
      (__v2df)(__m128d)(B), (int)(C), \
      (__v2df)_mm_setzero_pd(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_range_pd(A, B, C) \
-  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm256_mask_range_pd(W, U, A, B, C) \
-  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)(__m256d)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_maskz_range_pd(U, A, B, C) \
-  (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+  ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
      (__v4df)(__m256d)(B), (int)(C), \
      (__v4df)_mm256_setzero_pd(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm_range_ps(A, B, C) \
-  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm_mask_range_ps(W, U, A, B, C) \
-  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(C), \
-     (__v4sf)(__m128)(W), (__mmask8)(U))
+     (__v4sf)(__m128)(W), (__mmask8)(U)))
 
 #define _mm_maskz_range_ps(U, A, B, C) \
-  (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+  ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
      (__v4sf)(__m128)(B), (int)(C), \
      (__v4sf)_mm_setzero_ps(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_range_ps(A, B, C) \
-  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm256_mask_range_ps(W, U, A, B, C) \
-  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(C), \
-     (__v8sf)(__m256)(W), (__mmask8)(U))
+     (__v8sf)(__m256)(W), (__mmask8)(U)))
 
 #define _mm256_maskz_range_ps(U, A, B, C) \
-  (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+  ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
      (__v8sf)(__m256)(B), (int)(C), \
      (__v8sf)_mm256_setzero_ps(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm_reduce_pd(A, B) \
-  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
      (__v2df)_mm_setzero_pd(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm_mask_reduce_pd(W, U, A, B) \
-  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
      (__v2df)(__m128d)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm_maskz_reduce_pd(U, A, B) \
-  (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+  ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
      (__v2df)_mm_setzero_pd(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_reduce_pd(A, B) \
-  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
      (__v4df)_mm256_setzero_pd(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm256_mask_reduce_pd(W, U, A, B) \
-  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
      (__v4df)(__m256d)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_maskz_reduce_pd(U, A, B) \
-  (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+  ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
      (__v4df)_mm256_setzero_pd(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm_reduce_ps(A, B) \
-  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
      (__v4sf)_mm_setzero_ps(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm_mask_reduce_ps(W, U, A, B) \
-  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
      (__v4sf)(__m128)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm_maskz_reduce_ps(U, A, B) \
-  (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+  ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
      (__v4sf)_mm_setzero_ps(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_reduce_ps(A, B) \
-  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
      (__v8sf)_mm256_setzero_ps(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm256_mask_reduce_ps(W, U, A, B) \
-  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
      (__v8sf)(__m256)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_maskz_reduce_ps(U, A, B) \
-  (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+  ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
      (__v8sf)_mm256_setzero_ps(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
 _mm_movepi32_mask (__m128i __A)
@@ -1066,100 +1066,100 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
 }
 
 #define _mm256_extractf64x2_pd(A, imm) \
-  (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
      (int)(imm), \
      (__v2df)_mm_undefined_pd(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
-  (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
      (int)(imm), \
      (__v2df)(__m128d)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_maskz_extractf64x2_pd(U, A, imm) \
-  (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+  ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
      (int)(imm), \
      (__v2df)_mm_setzero_pd(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_extracti64x2_epi64(A, imm) \
-  (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
      (int)(imm), \
      (__v2di)_mm_undefined_si128(), \
-     (__mmask8)-1)
+     (__mmask8)-1))
 
 #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
-  (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
      (int)(imm), \
      (__v2di)(__m128i)(W), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
-  (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+  ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
      (int)(imm), \
      (__v2di)_mm_setzero_si128(), \
-     (__mmask8)(U))
+     (__mmask8)(U)))
 
 #define _mm256_insertf64x2(A, B, imm) \
-  (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
-     (__v2df)(__m128d)(B), (int)(imm))
+  ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
+     (__v2df)(__m128d)(B), (int)(imm)))
 
 #define _mm256_mask_insertf64x2(W, U, A, B, imm) \
-  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
      (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
-     (__v4df)(__m256d)(W))
+     (__v4df)(__m256d)(W)))
 
 #define _mm256_maskz_insertf64x2(U, A, B, imm) \
-  (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+  ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
      (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
-     (__v4df)_mm256_setzero_pd())
+     (__v4df)_mm256_setzero_pd()))
 
 #define _mm256_inserti64x2(A, B, imm) \
-  (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
-     (__v2di)(__m128i)(B), (int)(imm))
+  ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
+     (__v2di)(__m128i)(B), (int)(imm)))
 
 #define _mm256_mask_inserti64x2(W, U, A, B, imm) \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
      (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
-     (__v4di)(__m256i)(W))
+     (__v4di)(__m256i)(W)))
 
 #define _mm256_maskz_inserti64x2(U, A, B, imm) \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
      (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
-     (__v4di)_mm256_setzero_si256())
+     (__v4di)_mm256_setzero_si256()))
 
 #define _mm_mask_fpclass_pd_mask(U, A, imm) \
-  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
-     (__mmask8)(U))
+  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+     (__mmask8)(U)))
 
 #define _mm_fpclass_pd_mask(A, imm) \
-  (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
-     (__mmask8)-1)
+  ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+     (__mmask8)-1))
 
 #define _mm256_mask_fpclass_pd_mask(U, A, imm) \
-  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
-     (__mmask8)(U))
+  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+     (__mmask8)(U)))
 
 #define _mm256_fpclass_pd_mask(A, imm) \
-  (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
-     (__mmask8)-1)
+  ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+     (__mmask8)-1))
 
 #define _mm_mask_fpclass_ps_mask(U, A, imm) \
-  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
-     (__mmask8)(U))
+  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+     (__mmask8)(U)))
 
 #define _mm_fpclass_ps_mask(A, imm) \
-  (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
-     (__mmask8)-1)
+  ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+     (__mmask8)-1))
 
 #define _mm256_mask_fpclass_ps_mask(U, A, imm) \
-  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
-     (__mmask8)(U))
+  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+     (__mmask8)(U)))
 
 #define _mm256_fpclass_ps_mask(A, imm) \
-  (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
-     (__mmask8)-1)
+  ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+     (__mmask8)-1))
 
 #undef __DEFAULT_FN_ATTRS128
 #undef __DEFAULT_FN_ATTRS256
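The extractf64x2/insertf64x2 family above selects one 128-bit half of a 256-bit vector with the immediate (0 for the low half, 1 for the high half). A sketch (hypothetical, assuming an AVX512DQ + AVX512VL target):

#include <immintrin.h>

/* Swap the two 128-bit halves of a 256-bit vector of doubles. */
__m256d swap_halves(__m256d v) {
  __m128d lo = _mm256_extractf64x2_pd(v, 0);
  __m128d hi = _mm256_extractf64x2_pd(v, 1);
  return _mm256_insertf64x2(_mm256_insertf64x2(v, hi, 0), lo, 1);
}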
2068 lib/include/avx512vlfp16intrin.h vendored Normal file
File diff suppressed because it is too large
712 lib/include/avx512vlintrin.h vendored
File diff suppressed because it is too large
144 lib/include/avx512vlvbmi2intrin.h vendored
@@ -239,172 +239,172 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
 }
 
 #define _mm256_shldi_epi64(A, B, I) \
-  (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
-     (__v4di)(__m256i)(B), (int)(I))
+  ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
+     (__v4di)(__m256i)(B), (int)(I)))
 
 #define _mm256_mask_shldi_epi64(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
      (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
-     (__v4di)(__m256i)(S))
+     (__v4di)(__m256i)(S)))
 
 #define _mm256_maskz_shldi_epi64(U, A, B, I) \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
      (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
-     (__v4di)_mm256_setzero_si256())
+     (__v4di)_mm256_setzero_si256()))
 
 #define _mm_shldi_epi64(A, B, I) \
-  (__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
-     (__v2di)(__m128i)(B), (int)(I))
+  ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
+     (__v2di)(__m128i)(B), (int)(I)))
 
 #define _mm_mask_shldi_epi64(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
      (__v2di)_mm_shldi_epi64((A), (B), (I)), \
-     (__v2di)(__m128i)(S))
+     (__v2di)(__m128i)(S)))
 
 #define _mm_maskz_shldi_epi64(U, A, B, I) \
-  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
      (__v2di)_mm_shldi_epi64((A), (B), (I)), \
-     (__v2di)_mm_setzero_si128())
+     (__v2di)_mm_setzero_si128()))
 
 #define _mm256_shldi_epi32(A, B, I) \
-  (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
-     (__v8si)(__m256i)(B), (int)(I))
+  ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
+     (__v8si)(__m256i)(B), (int)(I)))
 
 #define _mm256_mask_shldi_epi32(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
      (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
-     (__v8si)(__m256i)(S))
+     (__v8si)(__m256i)(S)))
 
 #define _mm256_maskz_shldi_epi32(U, A, B, I) \
-  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
      (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
-     (__v8si)_mm256_setzero_si256())
+     (__v8si)_mm256_setzero_si256()))
 
 #define _mm_shldi_epi32(A, B, I) \
-  (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
-     (__v4si)(__m128i)(B), (int)(I))
+  ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
+     (__v4si)(__m128i)(B), (int)(I)))
 
 #define _mm_mask_shldi_epi32(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
      (__v4si)_mm_shldi_epi32((A), (B), (I)), \
-     (__v4si)(__m128i)(S))
+     (__v4si)(__m128i)(S)))
 
 #define _mm_maskz_shldi_epi32(U, A, B, I) \
-  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
      (__v4si)_mm_shldi_epi32((A), (B), (I)), \
-     (__v4si)_mm_setzero_si128())
+     (__v4si)_mm_setzero_si128()))
 
 #define _mm256_shldi_epi16(A, B, I) \
-  (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
-     (__v16hi)(__m256i)(B), (int)(I))
+  ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
+     (__v16hi)(__m256i)(B), (int)(I)))
 
 #define _mm256_mask_shldi_epi16(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
-     (__v16hi)(__m256i)(S))
+     (__v16hi)(__m256i)(S)))
 
 #define _mm256_maskz_shldi_epi16(U, A, B, I) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
-     (__v16hi)_mm256_setzero_si256())
+     (__v16hi)_mm256_setzero_si256()))
 
 #define _mm_shldi_epi16(A, B, I) \
-  (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
-     (__v8hi)(__m128i)(B), (int)(I))
+  ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
+     (__v8hi)(__m128i)(B), (int)(I)))
 
 #define _mm_mask_shldi_epi16(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
-     (__v8hi)(__m128i)(S))
+     (__v8hi)(__m128i)(S)))
 
 #define _mm_maskz_shldi_epi16(U, A, B, I) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
-     (__v8hi)_mm_setzero_si128())
+     (__v8hi)_mm_setzero_si128()))
 
 #define _mm256_shrdi_epi64(A, B, I) \
-  (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
-     (__v4di)(__m256i)(B), (int)(I))
+  ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
+     (__v4di)(__m256i)(B), (int)(I)))
 
 #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
      (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
-     (__v4di)(__m256i)(S))
+     (__v4di)(__m256i)(S)))
 
 #define _mm256_maskz_shrdi_epi64(U, A, B, I) \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
      (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
-     (__v4di)_mm256_setzero_si256())
+     (__v4di)_mm256_setzero_si256()))
 
 #define _mm_shrdi_epi64(A, B, I) \
-  (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
-     (__v2di)(__m128i)(B), (int)(I))
+  ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
+     (__v2di)(__m128i)(B), (int)(I)))
 
 #define _mm_mask_shrdi_epi64(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
      (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
-     (__v2di)(__m128i)(S))
+     (__v2di)(__m128i)(S)))
 
 #define _mm_maskz_shrdi_epi64(U, A, B, I) \
-  (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
      (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
-     (__v2di)_mm_setzero_si128())
+     (__v2di)_mm_setzero_si128()))
 
 #define _mm256_shrdi_epi32(A, B, I) \
-  (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
-     (__v8si)(__m256i)(B), (int)(I))
+  ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
+     (__v8si)(__m256i)(B), (int)(I)))
 
 #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
      (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
-     (__v8si)(__m256i)(S))
+     (__v8si)(__m256i)(S)))
 
 #define _mm256_maskz_shrdi_epi32(U, A, B, I) \
-  (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+  ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
      (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
-     (__v8si)_mm256_setzero_si256())
+     (__v8si)_mm256_setzero_si256()))
 
 #define _mm_shrdi_epi32(A, B, I) \
-  (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
-     (__v4si)(__m128i)(B), (int)(I))
+  ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
+     (__v4si)(__m128i)(B), (int)(I)))
 
 #define _mm_mask_shrdi_epi32(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
      (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
-     (__v4si)(__m128i)(S))
+     (__v4si)(__m128i)(S)))
 
 #define _mm_maskz_shrdi_epi32(U, A, B, I) \
-  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
      (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
-     (__v4si)_mm_setzero_si128())
+     (__v4si)_mm_setzero_si128()))
 
 #define _mm256_shrdi_epi16(A, B, I) \
-  (__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
-     (__v16hi)(__m256i)(B), (int)(I))
+  ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
+     (__v16hi)(__m256i)(B), (int)(I)))
 
 #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
-     (__v16hi)(__m256i)(S))
+     (__v16hi)(__m256i)(S)))
 
 #define _mm256_maskz_shrdi_epi16(U, A, B, I) \
-  (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+  ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
      (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
-     (__v16hi)_mm256_setzero_si256())
+     (__v16hi)_mm256_setzero_si256()))
 
 #define _mm_shrdi_epi16(A, B, I) \
-  (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
-     (__v8hi)(__m128i)(B), (int)(I))
+  ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
+     (__v8hi)(__m128i)(B), (int)(I)))
 
 #define _mm_mask_shrdi_epi16(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
-     (__v8hi)(__m128i)(S))
+     (__v8hi)(__m128i)(S)))
 
 #define _mm_maskz_shrdi_epi16(U, A, B, I) \
-  (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+  ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
      (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
-     (__v8hi)_mm_setzero_si128())
+     (__v8hi)_mm_setzero_si128()))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
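The shldi/shrdi macros above perform per-lane funnel shifts: each lane of the two sources is concatenated and shifted by the immediate. Passing the same vector twice turns the operation into a rotate. A sketch (hypothetical, assuming an AVX512VBMI2 + AVX512VL target):

#include <immintrin.h>

/* Rotate every 32-bit lane of x left by 7 bits: shifting the concatenation
   x:x left by 7 and keeping the upper half reinserts the bits that wrap. */
__m256i rotl7_epi32(__m256i x) {
  return _mm256_shldi_epi32(x, x, 7);
}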
16 lib/include/avx512vlvnniintrin.h vendored
@@ -36,7 +36,7 @@
 /// DST[MAX:256] := 0
 /// \endoperation
 #define _mm256_dpbusd_epi32(S, A, B) \
-  (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -56,7 +56,7 @@
 /// DST[MAX:256] := 0
 /// \endoperation
 #define _mm256_dpbusds_epi32(S, A, B) \
-  (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -74,7 +74,7 @@
 /// DST[MAX:256] := 0
 /// \endoperation
 #define _mm256_dpwssd_epi32(S, A, B) \
-  (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -92,7 +92,7 @@
 /// DST[MAX:256] := 0
 /// \endoperation
 #define _mm256_dpwssds_epi32(S, A, B) \
-  (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
+  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -112,7 +112,7 @@
 /// DST[MAX:128] := 0
 /// \endoperation
 #define _mm_dpbusd_epi32(S, A, B) \
-  (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -132,7 +132,7 @@
 /// DST[MAX:128] := 0
 /// \endoperation
 #define _mm_dpbusds_epi32(S, A, B) \
-  (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -150,7 +150,7 @@
 /// DST[MAX:128] := 0
 /// \endoperation
 #define _mm_dpwssd_epi32(S, A, B) \
-  (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -168,7 +168,7 @@
 /// DST[MAX:128] := 0
 /// \endoperation
 #define _mm_dpwssds_epi32(S, A, B) \
-  (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
+  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
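The VNNI macros above accumulate dot products into 32-bit lanes exactly as the \operation blocks describe: dpbusd multiplies unsigned bytes of A with signed bytes of B and adds the four products into S. A sketch (hypothetical, assuming an AVX512VNNI + AVX512VL or AVX-VNNI target):

#include <immintrin.h>

/* acc[i] += a[4i+0]*b[4i+0] + ... + a[4i+3]*b[4i+3], with a treated as
   unsigned bytes and b as signed bytes. */
__m256i accumulate_u8s8(__m256i acc, __m256i a, __m256i b) {
  return _mm256_dpbusd_epi32(acc, a, b);
}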
451 lib/include/avxintrin.h vendored
@@ -400,7 +400,7 @@ _mm256_rcp_ps(__m256 __a)
 ///    11: Truncated.
 /// \returns A 256-bit vector of [4 x double] containing the rounded values.
 #define _mm256_round_pd(V, M) \
-  (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
+  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
 
 /// Rounds the values stored in a 256-bit vector of [8 x float] as
 /// specified by the byte operand. The source values are rounded to integer
@@ -432,7 +432,7 @@ _mm256_rcp_ps(__m256 __a)
 ///    11: Truncated.
 /// \returns A 256-bit vector of [8 x float] containing the rounded values.
 #define _mm256_round_ps(V, M) \
-  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
+  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
 
 /// Rounds up the values stored in a 256-bit vector of [4 x double]. The
 /// source values are rounded up to integer values and returned as 64-bit
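The rounding-control byte documented above is usually built from the _MM_FROUND_* constants rather than written as raw bits. A sketch (hypothetical, assuming an AVX-enabled build):

#include <immintrin.h>

/* Truncate toward zero ("11: Truncated" above) without raising a precision
   exception. */
__m256d truncate_pd(__m256d v) {
  return _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}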
@@ -989,7 +989,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C) \
-  (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
+  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [4 x double] as specified by
/// the immediate integer operand.
@@ -1029,7 +1029,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C) \
-  (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
+  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))

/// Copies the values in a 128-bit vector of [4 x float] as specified by
/// the immediate integer operand.
@@ -1085,7 +1085,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C) \
-  (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
+  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))

/// Copies the values in a 256-bit vector of [8 x float] as specified by
/// the immediate integer operand.
@@ -1177,7 +1177,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// returned vector.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C) \
-  (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
+  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [4 x double], as specified by the immediate integer operand.
@@ -1217,8 +1217,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
-                                           (__v4df)(__m256d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+                                             (__v4df)(__m256d)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit vectors of
/// [8 x float], as specified by the immediate integer operand.
@@ -1258,8 +1258,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
-                                          (__v8sf)(__m256)(V2), (int)(M))
+  ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+                                            (__v8sf)(__m256)(V2), (int)(M)))

/// Permutes 128-bit data values stored in two 256-bit integer vectors,
/// as specified by the immediate integer operand.
@@ -1298,8 +1298,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// destination.
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
-                                           (__v8si)(__m256i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+                                             (__v8si)(__m256i)(V2), (int)(M)))

/* Vector Blend */
/// Merges 64-bit double-precision data values stored in either of the
@@ -1327,8 +1327,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
-                                     (__v4df)(__m256d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+                                       (__v4df)(__m256d)(V2), (int)(M)))

/// Merges 32-bit single-precision data values stored in either of the
/// two 256-bit vectors of [8 x float], as specified by the immediate
@@ -1355,8 +1355,8 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c)
/// operand \a V2 is copied to the same position in the destination.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
-                                    (__v8sf)(__m256)(V2), (int)(M))
+  ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+                                      (__v8sf)(__m256)(V2), (int)(M)))

/// Merges 64-bit double-precision data values stored in either of the
/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
@@ -1453,8 +1453,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// two parallel dot product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
-                                 (__v8sf)(__m256)(V2), (M))
+  ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+                                   (__v8sf)(__m256)(V2), (M)))

/* Vector shuffle */
/// Selects 8 float values from the 256-bit operands of [8 x float], as
@@ -1507,8 +1507,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
#define _mm256_shuffle_ps(a, b, mask) \
-  (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
-                                   (__v8sf)(__m256)(b), (int)(mask))
+  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
+                                     (__v8sf)(__m256)(b), (int)(mask)))

/// Selects four double-precision values from the 256-bit operands of
/// [4 x double], as specified by the immediate value operand.
@@ -1553,8 +1553,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// destination.
/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
#define _mm256_shuffle_pd(a, b, mask) \
-  (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
-                                    (__v4df)(__m256d)(b), (int)(mask))
+  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
+                                      (__v4df)(__m256d)(b), (int)(mask)))

/* Compare */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
@@ -1647,8 +1647,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c) \
-  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c))
+  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+                                  (__v2df)(__m128d)(b), (c)))

/// Compares each of the corresponding values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer
@@ -1707,8 +1707,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c) \
-  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c))
+  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+                                 (__v4sf)(__m128)(b), (c)))

/// Compares each of the corresponding double-precision values of two
/// 256-bit vectors of [4 x double], using the operation specified by the
@@ -1767,8 +1767,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [4 x double] containing the comparison results.
#define _mm256_cmp_pd(a, b, c) \
-  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
-                                   (__v4df)(__m256d)(b), (c))
+  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+                                     (__v4df)(__m256d)(b), (c)))

/// Compares each of the corresponding values of two 256-bit vectors of
/// [8 x float], using the operation specified by the immediate integer
@@ -1827,8 +1827,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 256-bit vector of [8 x float] containing the comparison results.
#define _mm256_cmp_ps(a, b, c) \
-  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
-                                  (__v8sf)(__m256)(b), (c))
+  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+                                    (__v8sf)(__m256)(b), (c)))

/// Compares each of the corresponding scalar double-precision values of
/// two 128-bit vectors of [2 x double], using the operation specified by the
@@ -1886,8 +1886,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
-  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
-                                (__v2df)(__m128d)(b), (c))
+  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+                                  (__v2df)(__m128d)(b), (c)))

/// Compares each of the corresponding scalar values of two 128-bit
/// vectors of [4 x float], using the operation specified by the immediate
@@ -1945,8 +1945,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 0x1F: True (unordered, signaling)
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c) \
-  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
-                               (__v4sf)(__m128)(b), (c))
+  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+                                 (__v4sf)(__m128)(b), (c)))

/// Takes a [8 x i32] vector and returns the vector element value
/// indexed by the immediate constant operand.
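For context, the _CMP_* predicates referenced above are used like this (illustrative sketch, not part of the diff; assumes AVX):

#include <immintrin.h>

/* Returns a 4-bit mask with bit i set when a[i] < b[i] (ordered, quiet). */
static int lanes_less_than(__m128 a, __m128 b) {
  __m128 mask = _mm_cmp_ps(a, b, _CMP_LT_OQ);
  return _mm_movemask_ps(mask);
}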
@@ -1964,7 +1964,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 32 bits of extended
/// packed data.
#define _mm256_extract_epi32(X, N) \
-  (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
+  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))

/// Takes a [16 x i16] vector and returns the vector element value
/// indexed by the immediate constant operand.
@@ -1982,8 +1982,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
/// packed data.
#define _mm256_extract_epi16(X, N) \
-  (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
-                                                    (int)(N))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
+                                                      (int)(N)))

/// Takes a [32 x i8] vector and returns the vector element value
/// indexed by the immediate constant operand.
@@ -2001,8 +2001,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
/// packed data.
#define _mm256_extract_epi8(X, N) \
-  (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
-                                                   (int)(N))
+  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
+                                                     (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and returns the vector element value
@@ -2021,7 +2021,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A 64-bit integer containing the extracted 64 bits of extended
/// packed data.
#define _mm256_extract_epi64(X, N) \
-  (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
+  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
#endif

/// Takes a [8 x i32] vector and replaces the vector element value
@@ -2043,8 +2043,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi32(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
-                                       (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
+                                         (int)(I), (int)(N)))


/// Takes a [16 x i16] vector and replaces the vector element value
@@ -2066,8 +2066,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi16(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
+                                          (int)(I), (int)(N)))

/// Takes a [32 x i8] vector and replaces the vector element value
/// indexed by the immediate constant operand with a new value. Returns the
@@ -2088,8 +2088,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi8(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
-                                        (int)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
+                                          (int)(I), (int)(N)))

#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
@@ -2111,8 +2111,8 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// \returns A copy of vector \a __a, after replacing its element indexed by
/// \a __imm with \a __b.
#define _mm256_insert_epi64(X, I, N) \
-  (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
-                                       (long long)(I), (int)(N))
+  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
+                                         (long long)(I), (int)(N)))
#endif

/* Conversion */
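A brief sketch of how the element extract/insert macros above are typically used (illustrative, not part of the diff; the index must be a compile-time constant):

#include <immintrin.h>

/* Replace lane 3 of an 8 x i32 vector and read back lane 0. */
static int patch_and_read(__m256i v, int replacement) {
  __m256i patched = _mm256_insert_epi32(v, replacement, 3);
  return _mm256_extract_epi32(patched, 0);
}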
@@ -4592,8 +4592,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
#define _mm256_insertf128_ps(V1, V2, M) \
-  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
-                                           (__v4sf)(__m128)(V2), (int)(M))
+  ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+                                             (__v4sf)(__m128)(V2), (int)(M)))

/// Constructs a new 256-bit vector of [4 x double] by first duplicating
/// a 256-bit vector of [4 x double] given in the first parameter, and then
@@ -4630,8 +4630,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
#define _mm256_insertf128_pd(V1, V2, M) \
-  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
-                                            (__v2df)(__m128d)(V2), (int)(M))
+  ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+                                              (__v2df)(__m128d)(V2), (int)(M)))

/// Constructs a new 256-bit integer vector by first duplicating a
/// 256-bit integer vector given in the first parameter, and then replacing
@@ -4668,8 +4668,8 @@ _mm256_zextsi128_si256(__m128i __a)
/// result.
/// \returns A 256-bit integer vector containing the interleaved values.
#define _mm256_insertf128_si256(V1, V2, M) \
-  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
-                                            (__v4si)(__m128i)(V2), (int)(M))
+  ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
+                                              (__v4si)(__m128i)(V2), (int)(M)))

/*
Vector extract.
@@ -4698,7 +4698,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
-  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
+  ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))

/// Extracts either the upper or the lower 128 bits from a 256-bit vector
/// of [4 x double], as determined by the immediate integer parameter, and
@@ -4722,7 +4722,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
-  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
+  ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))

/// Extracts either the upper or the lower 128 bits from a 256-bit
/// integer vector, as determined by the immediate integer parameter, and
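Illustrative sketch of the 128-bit lane insert/extract macros above (not part of the diff; assumes AVX):

#include <immintrin.h>

/* Swap the low and high 128-bit halves of a 256-bit float vector. */
static __m256 swap_halves(__m256 v) {
  __m128 lo = _mm256_extractf128_ps(v, 0);
  __m128 hi = _mm256_extractf128_ps(v, 1);
  __m256 r  = _mm256_castps128_ps256(hi);
  return _mm256_insertf128_ps(r, lo, 1);
}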
@@ -4746,177 +4746,7 @@ _mm256_zextsi128_si256(__m128i __a)
/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
-  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
+  ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))

-/* SIMD load ops (unaligned) */
-/// Loads two 128-bit floating-point vectors of [4 x float] from
-/// unaligned memory locations and constructs a 256-bit floating-point vector
-/// of [8 x float] by concatenating the two 128-bit vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-/// <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location containing 4 consecutive
-/// single-precision floating-point values. These values are to be copied to
-/// bits[255:128] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location containing 4 consecutive
-/// single-precision floating-point values. These values are to be copied to
-/// bits[127:0] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-/// concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
-{
-  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
-  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
-}
-
-/// Loads two 128-bit floating-point vectors of [2 x double] from
-/// unaligned memory locations and constructs a 256-bit floating-point vector
-/// of [4 x double] by concatenating the two 128-bit vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-/// <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location containing two consecutive
-/// double-precision floating-point values. These values are to be copied to
-/// bits[255:128] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location containing two consecutive
-/// double-precision floating-point values. These values are to be copied to
-/// bits[127:0] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-/// concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
-{
-  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
-  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
-}
-
-/// Loads two 128-bit integer vectors from unaligned memory locations and
-/// constructs a 256-bit integer vector by concatenating the two 128-bit
-/// vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-/// <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location containing a 128-bit integer
-/// vector. This vector is to be copied to bits[255:128] of the result. The
-/// address of the memory location does not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location containing a 128-bit integer
-/// vector. This vector is to be copied to bits[127:0] of the result. The
-/// address of the memory location does not have to be aligned.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
-{
-  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
-  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
-}
-
-/* SIMD store ops (unaligned) */
-/// Stores the upper and lower 128 bits of a 256-bit floating-point
-/// vector of [8 x float] into two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-/// store instructions.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __a
-/// A 256-bit floating-point vector of [8 x float].
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
-{
-  __m128 __v128;
-
-  __v128 = _mm256_castps256_ps128(__a);
-  _mm_storeu_ps(__addr_lo, __v128);
-  __v128 = _mm256_extractf128_ps(__a, 1);
-  _mm_storeu_ps(__addr_hi, __v128);
-}
-
-/// Stores the upper and lower 128 bits of a 256-bit floating-point
-/// vector of [4 x double] into two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-/// store instructions.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double].
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
-{
-  __m128d __v128;
-
-  __v128 = _mm256_castpd256_pd128(__a);
-  _mm_storeu_pd(__addr_lo, __v128);
-  __v128 = _mm256_extractf128_pd(__a, 1);
-  _mm_storeu_pd(__addr_hi, __v128);
-}
-
-/// Stores the upper and lower 128 bits of a 256-bit integer vector into
-/// two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-/// store instructions.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __a
-/// A 256-bit integer vector.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
-{
-  __m128i __v128;
-
-  __v128 = _mm256_castsi256_si128(__a);
-  _mm_storeu_si128(__addr_lo, __v128);
-  __v128 = _mm256_extractf128_si256(__a, 1);
-  _mm_storeu_si128(__addr_hi, __v128);
-}
-
/// Constructs a 256-bit floating-point vector of [8 x float] by
/// concatenating two 128-bit floating-point vectors of [4 x float].
@@ -5047,6 +4877,173 @@ _mm256_setr_m128i (__m128i __lo, __m128i __hi)
return (__m256i)_mm256_set_m128i(__hi, __lo);
}

+/* SIMD load ops (unaligned) */
+/// Loads two 128-bit floating-point vectors of [4 x float] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+  return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
+}
+
+/// Loads two 128-bit floating-point vectors of [2 x double] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+  return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
+}
+
+/// Loads two 128-bit integer vectors from unaligned memory locations and
+/// constructs a 256-bit integer vector by concatenating the two 128-bit
+/// vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[255:128] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[127:0] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
+{
+  return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
+}
+
+/* SIMD store ops (unaligned) */
+/// Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+  __m128 __v128;
+
+  __v128 = _mm256_castps256_ps128(__a);
+  _mm_storeu_ps(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_ps(__a, 1);
+  _mm_storeu_ps(__addr_hi, __v128);
+}
+
+/// Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+  __m128d __v128;
+
+  __v128 = _mm256_castpd256_pd128(__a);
+  _mm_storeu_pd(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_pd(__a, 1);
+  _mm_storeu_pd(__addr_hi, __v128);
+}
+
+/// Stores the upper and lower 128 bits of a 256-bit integer vector into
+/// two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit integer vector.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
+{
+  __m128i __v128;
+
+  __v128 = _mm256_castsi256_si128(__a);
+  _mm_storeu_si128(__addr_lo, __v128);
+  __v128 = _mm256_extractf128_si256(__a, 1);
+  _mm_storeu_si128(__addr_hi, __v128);
+}
+
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS128
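The relocated helpers keep the documented behaviour; an illustrative round-trip sketch (not part of the diff; assumes AVX):

#include <immintrin.h>

/* Gather two unaligned 4-float blocks, then scatter them back out. */
static void copy_two_blocks(float *dst_hi, float *dst_lo,
                            const float *src_hi, const float *src_lo) {
  __m256 v = _mm256_loadu2_m128(src_hi, src_lo);
  _mm256_storeu2_m128(dst_hi, dst_lo, v);
}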
10 lib/include/cetintrin.h vendored
@@ -42,10 +42,20 @@ static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
  return __builtin_ia32_rdsspd(__a);
}

+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
+  unsigned int t;
+  return __builtin_ia32_rdsspd(t);
+}
+
#ifdef __x86_64__
static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
  return __builtin_ia32_rdsspq(__a);
}

+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
+  unsigned long long t;
+  return __builtin_ia32_rdsspq(t);
+}
#endif /* __x86_64__ */

#ifdef __x86_64__
3 lib/include/cpuid.h vendored
@@ -195,11 +195,12 @@
#define bit_PCONFIG 0x00040000
#define bit_IBT 0x00100000
#define bit_AMXBF16 0x00400000
+#define bit_AVX512FP16 0x00800000
#define bit_AMXTILE 0x01000000
#define bit_AMXINT8 0x02000000

/* Features in %eax for leaf 7 sub-leaf 1 */
-#define bit_AVXVNNI 0x00000008
+#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_HRESET 0x00400000

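For reference, these feature bits are typically tested with the __get_cpuid_count helper from the same header; a minimal sketch (not part of the diff; the corrected bit_AVXVNNI lives in leaf 7, sub-leaf 1, %eax):

#include <cpuid.h>

/* Returns nonzero when the CPU reports AVX-VNNI support. */
static int has_avxvnni(void) {
  unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(7, 1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (eax & bit_AVXVNNI) != 0;
}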
100 lib/include/crc32intrin.h vendored Normal file
@@ -0,0 +1,100 @@
+/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CRC32INTRIN_H
+#define __CRC32INTRIN_H
+
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
+
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned char operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+  return __builtin_ia32_crc32qi(__C, __D);
+}
+
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned short operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+  return __builtin_ia32_crc32hi(__C, __D);
+}
+
+/// Adds the first unsigned integer operand to the CRC-32C checksum of
+/// the second unsigned integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+  return __builtin_ia32_crc32si(__C, __D);
+}
+
+#ifdef __x86_64__
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned 64-bit integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+  return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CRC32INTRIN_H */
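A short usage sketch for the new header (illustrative only, not part of the diff; assumes a target with the crc32 feature, e.g. -mcrc32 or -msse4.2):

#include <stddef.h>
#include <x86intrin.h>

/* Accumulate a CRC-32C over a byte buffer, one byte at a time. */
static unsigned int crc32c_bytes(unsigned int crc, const unsigned char *p, size_t n) {
  for (size_t i = 0; i < n; ++i)
    crc = _mm_crc32_u8(crc, p[i]);
  return crc;
}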
38 lib/include/emmintrin.h vendored
@@ -10,6 +10,10 @@
#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
@@ -2371,7 +2375,7 @@ _mm_madd_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
-  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+  return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
}

/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
@@ -2391,7 +2395,7 @@ _mm_max_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
-  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+  return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
}

/// Compares corresponding elements of two 128-bit signed [8 x i16]
@@ -2411,7 +2415,7 @@ _mm_max_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{
-  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+  return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
}

/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
@@ -2431,7 +2435,7 @@ _mm_min_epi16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{
-  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+  return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
}

/// Multiplies the corresponding elements of two signed [8 x i16]
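The four functions above keep their documented semantics; only the implementation switches to the generic elementwise builtins. An illustrative sketch of typical use (not part of the diff; SSE2):

#include <emmintrin.h>

/* Clamp each signed 16-bit lane of v into [lo, hi]. */
static __m128i clamp_epi16(__m128i v, __m128i lo, __m128i hi) {
  return _mm_min_epi16(_mm_max_epi16(v, lo), hi);
}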
@@ -2818,10 +2822,10 @@ _mm_xor_si128(__m128i __a, __m128i __b)
/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) \
-  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

#define _mm_bslli_si128(a, imm) \
-  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

/// Left-shifts each 16-bit value in the 128-bit integer vector operand
/// by the specified number of bits. Low-order bits are cleared.
@@ -3035,10 +3039,10 @@ _mm_sra_epi32(__m128i __a, __m128i __count)
/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
-  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

#define _mm_bsrli_si128(a, imm) \
-  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

/// Right-shifts each of 16-bit values in the 128-bit integer vector
/// operand by the specified number of bits. High-order bits are cleared.
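A quick sketch of the whole-register byte shifts above (illustrative, not part of the diff; the shift count must be an immediate):

#include <emmintrin.h>

/* Drop the lowest 4 bytes of v and zero-fill from the top. */
static __m128i drop_low_dword(__m128i v) {
  return _mm_srli_si128(v, 4);
}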
@@ -4356,8 +4360,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
/// integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
-  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
-                                                   (int)(imm))
+  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
+                                                     (int)(imm)))

/// Constructs a 128-bit integer vector by first making a copy of the
/// 128-bit integer vector parameter, and then inserting the lower 16 bits
@@ -4380,8 +4384,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
/// lower 16 bits of \a __b are written.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm) \
-  (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
-                                       (int)(imm))
+  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
+                                         (int)(imm)))

/// Copies the values of the most significant bits from each 8-bit
/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
@@ -4430,7 +4434,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
-  (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@@ -4460,7 +4464,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
-  (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
/// elements of a 128-bit integer vector of [8 x i16], using the immediate
@@ -4490,7 +4494,7 @@ _mm_movemask_epi8(__m128i __a)
/// 11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
-  (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
+  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
@ -4844,8 +4848,8 @@ _mm_movemask_pd(__m128d __a)
|
|||||||
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
|
/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
|
||||||
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
|
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
|
||||||
#define _mm_shuffle_pd(a, b, i) \
|
#define _mm_shuffle_pd(a, b, i) \
|
||||||
(__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
|
((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
|
||||||
(int)(i))
|
(int)(i)))
|
||||||
|
|
||||||
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
|
/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
|
||||||
/// floating-point vector of [4 x float].
|
/// floating-point vector of [4 x float].
|
||||||
|
|||||||
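The emmintrin.h hunks above only add an outer pair of parentheses around each macro expansion. A minimal sketch of why that matters when the macro is used as a sub-expression (illustrative, not from the commit; the helper function name is made up):

    /* With the parenthesized form, the whole expansion is one expression, so
     * it composes safely with other intrinsics, casts and operators. */
    #include <emmintrin.h>

    __m128i shift_and_xor(__m128i v, __m128i mask) {
      /* _mm_slli_si128 now expands to a single parenthesized expression. */
      return _mm_xor_si128(_mm_slli_si128(v, 4), mask);
    }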
8  lib/include/f16cintrin.h  vendored
@@ -66,8 +66,8 @@ _cvtsh_ss(unsigned short __a)
 ///    1XX: Use MXCSR.RC for rounding
 /// \returns The converted 16-bit half-precision float value.
 #define _cvtss_sh(a, imm) \
-  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-                                                     (imm)))[0])
+  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                      (imm)))[0]))
 
 /// Converts a 128-bit vector containing 32-bit float values into a
 ///    128-bit vector containing 16-bit half-precision float values.
@@ -93,7 +93,7 @@ _cvtsh_ss(unsigned short __a)
 ///    values. The lower 64 bits are used to store the converted 16-bit
 ///    half-precision floating-point values.
 #define _mm_cvtps_ph(a, imm) \
-  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))
+  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
 
 /// Converts a 128-bit vector containing 16-bit half-precision float
 ///    values into a 128-bit vector containing 32-bit float values.
@@ -136,7 +136,7 @@ _mm_cvtph_ps(__m128i __a)
 /// \returns A 128-bit vector containing the converted 16-bit half-precision
 ///    float values.
 #define _mm256_cvtps_ph(a, imm) \
-  (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))
+  ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
 
 /// Converts a 128-bit vector containing 16-bit half-precision float
 ///    values into a 256-bit vector of [8 x float].
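The same parenthesization treatment is applied to the F16C conversion macros. A small, hedged usage sketch, assuming the translation unit is built for an F16C-capable target (e.g. -mf16c); the function name is illustrative:

    #include <immintrin.h>

    float round_trip_half(float x) {
      /* bit 2 of the immediate selects MXCSR.RC rounding, per the doc above */
      unsigned short h = _cvtss_sh(x, _MM_FROUND_CUR_DIRECTION);
      return _cvtsh_ss(h);
    }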
21  lib/include/float.h  vendored
@@ -14,10 +14,11 @@
  * additional definitions provided for Windows.
  * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
  *
- * Also fall back on Darwin to allow additional definitions and
+ * Also fall back on Darwin and AIX to allow additional definitions and
  * implementation-defined values.
  */
-#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
+#if (defined(__APPLE__) || defined(__MINGW32__) || defined(_MSC_VER) || \
+     defined(_AIX)) && \
     __STDC_HOSTED__ && __has_include_next(<float.h>)
 
 /* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
@@ -37,7 +38,9 @@
 #  undef FLT_MANT_DIG
 #  undef DBL_MANT_DIG
 #  undef LDBL_MANT_DIG
-#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
+#  if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
+      __cplusplus >= 201103L || \
+      (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
 #    undef DECIMAL_DIG
 #  endif
 #  undef FLT_DIG
@@ -64,7 +67,9 @@
 #  undef FLT_MIN
 #  undef DBL_MIN
 #  undef LDBL_MIN
-#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
+#  if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
+      __cplusplus >= 201703L || \
+      (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
 #    undef FLT_TRUE_MIN
 #    undef DBL_TRUE_MIN
 #    undef LDBL_TRUE_MIN
@@ -87,7 +92,9 @@
 #define DBL_MANT_DIG __DBL_MANT_DIG__
 #define LDBL_MANT_DIG __LDBL_MANT_DIG__
 
-#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || __cplusplus >= 201103L
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
+    __cplusplus >= 201103L || \
+    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
 #  define DECIMAL_DIG __DECIMAL_DIG__
 #endif
 
@@ -123,7 +130,9 @@
 #define DBL_MIN __DBL_MIN__
 #define LDBL_MIN __LDBL_MIN__
 
-#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || __cplusplus >= 201703L
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
+    __cplusplus >= 201703L || \
+    (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
 #  define FLT_TRUE_MIN __FLT_DENORM_MIN__
 #  define DBL_TRUE_MIN __DBL_DENORM_MIN__
 #  define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
63  lib/include/gfniintrin.h  vendored
@@ -28,14 +28,14 @@
 #define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
 
 #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
-  (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
-                                                  (__v16qi)(__m128i)(B), \
-                                                  (char)(I))
+  ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
+                                                   (__v16qi)(__m128i)(B), \
+                                                   (char)(I)))
 
 #define _mm_gf2p8affine_epi64_epi8(A, B, I) \
-  (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
-                                               (__v16qi)(__m128i)(B), \
-                                               (char)(I))
+  ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
+                                                (__v16qi)(__m128i)(B), \
+                                                (char)(I)))
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
@@ -46,14 +46,14 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
 
 #ifdef __AVXINTRIN_H
 #define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
-  (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
-                                                  (__v32qi)(__m256i)(B), \
-                                                  (char)(I))
+  ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
+                                                   (__v32qi)(__m256i)(B), \
+                                                   (char)(I)))
 
 #define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
-  (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
-                                               (__v32qi)(__m256i)(B), \
-                                               (char)(I))
+  ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
+                                                (__v32qi)(__m256i)(B), \
+                                                (char)(I)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
 _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
@@ -65,31 +65,31 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
 
 #ifdef __AVX512BWINTRIN_H
 #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
-  (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
-                                                  (__v64qi)(__m512i)(B), \
-                                                  (char)(I))
+  ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
+                                                   (__v64qi)(__m512i)(B), \
+                                                   (char)(I)))
 
 #define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
-      (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
-      (__v64qi)(__m512i)(S))
+  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+      (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
+      (__v64qi)(__m512i)(S)))
 
 #define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
+  _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
       U, A, B, I)
 
 #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
-  (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
-                                               (__v64qi)(__m512i)(B), \
-                                               (char)(I))
+  ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
+                                                (__v64qi)(__m512i)(B), \
+                                                (char)(I)))
 
 #define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
-      (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \
-      (__v64qi)(__m512i)(S))
+  ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+      (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
+      (__v64qi)(__m512i)(S)))
 
 #define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
+  _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
       U, A, B, I)
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
@@ -117,39 +117,38 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
 
 #ifdef __AVX512VLBWINTRIN_H
 #define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
-      (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
-      (__v16qi)(__m128i)(S))
+  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+      (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
+      (__v16qi)(__m128i)(S)))
 
 #define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
+  _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
       U, A, B, I)
 
 #define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
-      (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
-      (__v32qi)(__m256i)(S))
+  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+      (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
+      (__v32qi)(__m256i)(S)))
 
 #define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
-  (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+  _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
       U, A, B, I)
 
 #define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
-      (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
-      (__v16qi)(__m128i)(S))
+  ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+      (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
+      (__v16qi)(__m128i)(S)))
 
 #define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \
-      U, A, B, I)
+  _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
 
 #define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
-  (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
-      (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
-      (__v32qi)(__m256i)(S))
+  ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+      (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
+      (__v32qi)(__m256i)(S)))
 
 #define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
-  (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+  _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
       U, A, B, I)
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
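The gfniintrin.h changes are again parenthesization plus dropping a redundant cast in the maskz forms. A rough usage sketch of the affine intrinsic touched here; the 8x8 bit-matrix constant below is a placeholder rather than a value from the commit, and GFNI support (e.g. -mgfni) is assumed:

    #include <immintrin.h>

    __m128i affine_bytes(__m128i v) {
      /* example matrix constant only; a real transform supplies a matrix
       * chosen for the desired GF(2) linear map of each byte */
      const __m128i matrix = _mm_set1_epi64x(0x0102040810204080LL);
      return _mm_gf2p8affine_epi64_epi8(v, matrix, 0 /* xor constant */);
    }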
11  lib/include/hexagon_protos.h  vendored
@@ -8003,17 +8003,6 @@
 #define Q6_P_vtrunohb_PP __builtin_HEXAGON_S6_vtrunohb_ppp
 #endif /* __HEXAGON_ARCH___ >= 62 */
 
-#if __HEXAGON_ARCH__ >= 62
-/* ==========================================================================
-   Assembly Syntax:       Vd32=vmem(Rt32):nt
-   C Intrinsic Prototype: HVX_Vector Q6_V_vmem_R_nt(Word32 Rt)
-   Instruction Type:      MAPPING
-   Execution Slots:       SLOT0123
-   ========================================================================== */
-
-#define Q6_V_vmem_R_nt __builtin_HEXAGON_V6_ldntnt0
-#endif /* __HEXAGON_ARCH___ >= 62 */
-
 #if __HEXAGON_ARCH__ >= 65
 /* ==========================================================================
    Assembly Syntax:       Pd4=!any8(vcmpb.eq(Rss32,Rtt32))
32  lib/include/hexagon_types.h  vendored
@@ -1177,37 +1177,6 @@ private:
 
 #endif /* __cplusplus */
 
-// V65 Silver types
-#if __Q6S_ARCH__ >= 65
-// Silver vector types are 128 bytes, and pairs are 256. The vector predicate
-// types are 16 bytes and 32 bytes for pairs.
-typedef long HEXAGON_VecPred128 __attribute__((__vector_size__(16)))
-    __attribute__((aligned(128)));
-
-typedef long HEXAGON_VecPred256 __attribute__((__vector_size__(32)))
-    __attribute__((aligned(128)));
-
-typedef long HEXAGON_Vect1024 __attribute__((__vector_size__(128)))
-    __attribute__((aligned(128)));
-
-typedef long HEXAGON_Vect2048 __attribute__((__vector_size__(256)))
-    __attribute__((aligned(256)));
-
-typedef long HEXAGON_UVect1024 __attribute__((__vector_size__(128)))
-    __attribute__((aligned(4)));
-
-typedef long HEXAGON_UVect2048 __attribute__((__vector_size__(256)))
-    __attribute__((aligned(4)));
-
-#define Q6S_VectorPredPair HEXAGON_VecPred256
-#define Q6S_VectorPred HEXAGON_VecPred128
-#define Q6S_Vector HEXAGON_Vect1024
-#define Q6S_VectorPair HEXAGON_Vect2048
-#define Q6S_UVector HEXAGON_UVect1024
-#define Q6S_UVectorPair HEXAGON_UVect2048
-
-#else /* __Q6S_ARCH__ >= 65 */
-
 // V65 Vector types
 #if __HVX_ARCH__ >= 65
 #if defined __HVX__ && (__HVX_LENGTH__ == 128)
@@ -1256,7 +1225,6 @@ private:
 #endif /* defined __HVX__ && (__HVX_LENGTH__ == 64) */
 #endif /* defined __HVX__ && (__HVX_LENGTH__ == 128) */
 #endif /* __HVX_ARCH__ >= 65 */
-#endif /* __Q6S_ARCH__ >= 65 */
 
 /* Predicates */
 
1609  lib/include/hvx_hexagon_protos.h  vendored
File diff suppressed because it is too large. Load Diff
12  lib/include/ia32intrin.h  vendored
@@ -16,7 +16,7 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-#define __DEFAULT_FN_ATTRS_SSE42 __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
 
 #if defined(__cplusplus) && (__cplusplus >= 201103L)
 #define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
@@ -282,7 +282,7 @@ _castu64_f64(unsigned long long __A) {
  * \returns The result of adding operand \a __C to the CRC-32C checksum of
  *    operand \a __D.
  */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
 __crc32b(unsigned int __C, unsigned char __D)
 {
   return __builtin_ia32_crc32qi(__C, __D);
@@ -303,7 +303,7 @@ __crc32b(unsigned int __C, unsigned char __D)
  * \returns The result of adding operand \a __C to the CRC-32C checksum of
  *    operand \a __D.
  */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
 __crc32w(unsigned int __C, unsigned short __D)
 {
   return __builtin_ia32_crc32hi(__C, __D);
@@ -324,7 +324,7 @@ __crc32w(unsigned int __C, unsigned short __D)
  * \returns The result of adding operand \a __C to the CRC-32C checksum of
  *    operand \a __D.
  */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_SSE42
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
 __crc32d(unsigned int __C, unsigned int __D)
 {
   return __builtin_ia32_crc32si(__C, __D);
@@ -346,7 +346,7 @@ __crc32d(unsigned int __C, unsigned int __D)
  * \returns The result of adding operand \a __C to the CRC-32C checksum of
  *    operand \a __D.
  */
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_SSE42
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
 __crc32q(unsigned long long __C, unsigned long long __D)
 {
   return __builtin_ia32_crc32di(__C, __D);
@@ -435,7 +435,7 @@ __rorq(unsigned long long __X, int __C) {
 
 #undef __DEFAULT_FN_ATTRS
 #undef __DEFAULT_FN_ATTRS_CAST
-#undef __DEFAULT_FN_ATTRS_SSE42
+#undef __DEFAULT_FN_ATTRS_CRC32
 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
 
 #endif /* __IA32INTRIN_H */
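The ia32intrin.h hunk switches the CRC-32 helpers from the sse4.2 target attribute to the new dedicated crc32 feature. A hedged sketch of typical use; it assumes the file is compiled with that feature enabled (for example -mcrc32, or an sse4.2 feature set that implies it), and the function name is made up:

    #include <x86intrin.h>
    #include <stddef.h>

    unsigned int crc32c_bytes(const unsigned char *p, size_t n) {
      unsigned int crc = 0xFFFFFFFFu;
      for (size_t i = 0; i < n; ++i)
        crc = __crc32b(crc, p[i]);   /* CRC-32C, one byte at a time */
      return crc ^ 0xFFFFFFFFu;
    }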
34  lib/include/immintrin.h  vendored
@@ -10,6 +10,10 @@
 #ifndef __IMMINTRIN_H
 #define __IMMINTRIN_H
 
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
 #include <x86gprintrin.h>
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
@@ -210,6 +214,20 @@
 #include <avx512pfintrin.h>
 #endif
 
+/*
+ * FIXME: _Float16 type is legal only when HW support float16 operation.
+ * We use __AVX512FP16__ to identify if float16 is supported or not, so
+ * when float16 is not supported, the related header is not included.
+ *
+ */
+#if defined(__AVX512FP16__)
+#include <avx512fp16intrin.h>
+#endif
+
+#if defined(__AVX512FP16__) && defined(__AVX512VL__)
+#include <avx512vlfp16intrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
     defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
@@ -525,13 +543,13 @@ extern "C" {
 #if defined(__i386__) || defined(__x86_64__)
 static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
+  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
                        : "+r" (_Value), "+m" (*_Target) :: "memory");
   return _Value;
 }
 static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
+  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
                        : "+r" (_Value), "+m" (*_Target) :: "memory");
   return _Value;
 }
@@ -539,13 +557,13 @@ _InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
 #if defined(__x86_64__)
 static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1"
+  __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
                        : "+r" (_Value), "+m" (*_Target) :: "memory");
   return _Value;
 }
 static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1"
+  __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
                        : "+r" (_Value), "+m" (*_Target) :: "memory");
   return _Value;
 }
@@ -557,7 +575,7 @@ _InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
 static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
                               long _Exchange, long _Comparand) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
+  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
                        : "+a" (_Comparand), "+m" (*_Destination)
                        : "r" (_Exchange) : "memory");
   return _Comparand;
@@ -565,7 +583,7 @@ _InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
 static __inline__ long __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange_HLERelease(long volatile *_Destination,
                               long _Exchange, long _Comparand) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
+  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
                        : "+a" (_Comparand), "+m" (*_Destination)
                        : "r" (_Exchange) : "memory");
   return _Comparand;
@@ -575,7 +593,7 @@ _InterlockedCompareExchange_HLERelease(long volatile *_Destination,
 static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
                               __int64 _Exchange, __int64 _Comparand) {
-  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1"
+  __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
                        : "+a" (_Comparand), "+m" (*_Destination)
                        : "r" (_Exchange) : "memory");
   return _Comparand;
@@ -583,7 +601,7 @@ _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
 static __inline__ __int64 __DEFAULT_FN_ATTRS
 _InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
                               __int64 _Exchange, __int64 _Comparand) {
-  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1"
+  __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
                        : "+a" (_Comparand), "+m" (*_Destination)
                        : "r" (_Exchange) : "memory");
   return _Comparand;
lib/include/intrin.h
vendored
40
lib/include/intrin.h
vendored
@ -97,8 +97,9 @@ unsigned long __readcr8(void);
|
|||||||
unsigned int __readdr(unsigned int);
|
unsigned int __readdr(unsigned int);
|
||||||
#ifdef __i386__
|
#ifdef __i386__
|
||||||
unsigned char __readfsbyte(unsigned long);
|
unsigned char __readfsbyte(unsigned long);
|
||||||
unsigned __int64 __readfsqword(unsigned long);
|
|
||||||
unsigned short __readfsword(unsigned long);
|
unsigned short __readfsword(unsigned long);
|
||||||
|
unsigned long __readfsdword(unsigned long);
|
||||||
|
unsigned __int64 __readfsqword(unsigned long);
|
||||||
#endif
|
#endif
|
||||||
unsigned __int64 __readmsr(unsigned long);
|
unsigned __int64 __readmsr(unsigned long);
|
||||||
unsigned __int64 __readpmc(unsigned long);
|
unsigned __int64 __readpmc(unsigned long);
|
||||||
@ -149,10 +150,8 @@ long _InterlockedExchangeAdd_HLEAcquire(long volatile *, long);
|
|||||||
long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
|
long _InterlockedExchangeAdd_HLERelease(long volatile *, long);
|
||||||
__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
|
__int64 _InterlockedExchangeAdd64_HLEAcquire(__int64 volatile *, __int64);
|
||||||
__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
|
__int64 _InterlockedExchangeAdd64_HLERelease(__int64 volatile *, __int64);
|
||||||
void __attribute__((__deprecated__(
|
void _ReadBarrier(void);
|
||||||
"use other intrinsics or C++11 atomics instead"))) _ReadBarrier(void);
|
void _ReadWriteBarrier(void);
|
||||||
void __attribute__((__deprecated__(
|
|
||||||
"use other intrinsics or C++11 atomics instead"))) _ReadWriteBarrier(void);
|
|
||||||
unsigned int _rorx_u32(unsigned int, const unsigned int);
|
unsigned int _rorx_u32(unsigned int, const unsigned int);
|
||||||
int _sarx_i32(int, unsigned int);
|
int _sarx_i32(int, unsigned int);
|
||||||
#if __STDC_HOSTED__
|
#if __STDC_HOSTED__
|
||||||
@ -163,8 +162,7 @@ unsigned int _shrx_u32(unsigned int, unsigned int);
|
|||||||
void _Store_HLERelease(long volatile *, long);
|
void _Store_HLERelease(long volatile *, long);
|
||||||
void _Store64_HLERelease(__int64 volatile *, __int64);
|
void _Store64_HLERelease(__int64 volatile *, __int64);
|
||||||
void _StorePointer_HLERelease(void *volatile *, void *);
|
void _StorePointer_HLERelease(void *volatile *, void *);
|
||||||
void __attribute__((__deprecated__(
|
void _WriteBarrier(void);
|
||||||
"use other intrinsics or C++11 atomics instead"))) _WriteBarrier(void);
|
|
||||||
unsigned __int32 xbegin(void);
|
unsigned __int32 xbegin(void);
|
||||||
void _xend(void);
|
void _xend(void);
|
||||||
|
|
||||||
@ -457,7 +455,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst,
|
|||||||
:
|
:
|
||||||
: "memory");
|
: "memory");
|
||||||
#else
|
#else
|
||||||
__asm__ __volatile__("xchg %%esi, %1\nrep movsb\nxchg %%esi, %1"
|
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
|
||||||
|
"rep movsb\n"
|
||||||
|
"xchg {%%esi, %1|%1, esi}"
|
||||||
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
||||||
:
|
:
|
||||||
: "memory");
|
: "memory");
|
||||||
@ -467,12 +467,14 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst,
|
|||||||
unsigned long const *__src,
|
unsigned long const *__src,
|
||||||
size_t __n) {
|
size_t __n) {
|
||||||
#if defined(__x86_64__)
|
#if defined(__x86_64__)
|
||||||
__asm__ __volatile__("rep movsl"
|
__asm__ __volatile__("rep movs{l|d}"
|
||||||
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
: "+D"(__dst), "+S"(__src), "+c"(__n)
|
||||||
:
|
:
|
||||||
: "memory");
|
: "memory");
|
||||||
#else
|
#else
|
||||||
__asm__ __volatile__("xchg %%esi, %1\nrep movsl\nxchg %%esi, %1"
|
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
|
||||||
|
"rep movs{l|d}\n"
|
||||||
|
"xchg {%%esi, %1|%1, esi}"
|
||||||
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
||||||
:
|
:
|
||||||
: "memory");
|
: "memory");
|
||||||
@ -487,7 +489,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
|
|||||||
:
|
:
|
||||||
: "memory");
|
: "memory");
|
||||||
#else
|
#else
|
||||||
__asm__ __volatile__("xchg %%esi, %1\nrep movsw\nxchg %%esi, %1"
|
__asm__ __volatile__("xchg {%%esi, %1|%1, esi}\n"
|
||||||
|
"rep movsw\n"
|
||||||
|
"xchg {%%esi, %1|%1, esi}"
|
||||||
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
: "+D"(__dst), "+r"(__src), "+c"(__n)
|
||||||
:
|
:
|
||||||
: "memory");
|
: "memory");
|
||||||
@ -496,7 +500,7 @@ static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst,
|
|||||||
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
|
static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst,
|
||||||
unsigned long __x,
|
unsigned long __x,
|
||||||
size_t __n) {
|
size_t __n) {
|
||||||
__asm__ __volatile__("rep stosl"
|
__asm__ __volatile__("rep stos{l|d}"
|
||||||
: "+D"(__dst), "+c"(__n)
|
: "+D"(__dst), "+c"(__n)
|
||||||
: "a"(__x)
|
: "a"(__x)
|
||||||
: "memory");
|
: "memory");
|
||||||
@ -538,9 +542,9 @@ static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst,
|
|||||||
#else
|
#else
|
||||||
/* x86-64 uses %rbx as the base register, so preserve it. */
|
/* x86-64 uses %rbx as the base register, so preserve it. */
|
||||||
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
|
#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
|
||||||
__asm("xchgq %%rbx,%q1\n" \
|
__asm("xchg{q} {%%rbx, %q1|%q1, rbx}\n" \
|
||||||
"cpuid\n" \
|
"cpuid\n" \
|
||||||
"xchgq %%rbx,%q1" \
|
"xchg{q} {%%rbx, %q1|%q1, rbx}" \
|
||||||
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
|
: "=a"(__eax), "=r"(__ebx), "=c"(__ecx), "=d"(__edx) \
|
||||||
: "0"(__leaf), "2"(__count))
|
: "0"(__leaf), "2"(__count))
|
||||||
#endif
|
#endif
|
||||||
@ -600,13 +604,17 @@ __readmsr(unsigned long __register) {
|
|||||||
|
|
||||||
static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
|
static __inline__ unsigned __LPTRINT_TYPE__ __DEFAULT_FN_ATTRS __readcr3(void) {
|
||||||
unsigned __LPTRINT_TYPE__ __cr3_val;
|
unsigned __LPTRINT_TYPE__ __cr3_val;
|
||||||
__asm__ __volatile__ ("mov %%cr3, %0" : "=r"(__cr3_val) : : "memory");
|
__asm__ __volatile__(
|
||||||
|
"mov {%%cr3, %0|%0, cr3}"
|
||||||
|
: "=r"(__cr3_val)
|
||||||
|
:
|
||||||
|
: "memory");
|
||||||
return __cr3_val;
|
return __cr3_val;
|
||||||
}
|
}
|
||||||
|
|
||||||
static __inline__ void __DEFAULT_FN_ATTRS
|
static __inline__ void __DEFAULT_FN_ATTRS
|
||||||
__writecr3(unsigned __INTPTR_TYPE__ __cr3_val) {
|
__writecr3(unsigned __INTPTR_TYPE__ __cr3_val) {
|
||||||
__asm__ ("mov %0, %%cr3" : : "r"(__cr3_val) : "memory");
|
__asm__ ("mov {%0, %%cr3|cr3, %0}" : : "r"(__cr3_val) : "memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
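Most of the intrin.h churn rewrites inline assembly with dialect alternatives, {AT&T|Intel}, so the same source assembles under both -masm=att and -masm=intel. An illustrative stand-alone sketch of the syntax (this function is not part of the header; the first alternative is used for AT&T output, the second for Intel):

    static inline unsigned long swap_with_memory(unsigned long *p,
                                                 unsigned long v) {
      __asm__ __volatile__("xchg {%0, %1|%1, %0}"
                           : "+r"(v), "+m"(*p)
                           :
                           : "memory");
      return v;   /* previous value stored at *p */
    }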
10  lib/include/keylockerintrin.h  vendored
@@ -99,7 +99,7 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
 }
 
 /// Wrap a 128-bit AES key from __key into a key handle and output in
-/// ((__m128i*)__h) to ((__m128i*)__h) + 5 and a 32-bit value as return.
+/// ((__m128i*)__h) to ((__m128i*)__h) + 2 and a 32-bit value as return.
 /// The explicit source operand __htype specifies handle restrictions.
 ///
 /// \headerfile <x86intrin.h>
@@ -120,9 +120,6 @@ _mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
 /// MEM[__h+127:__h] := Handle[127:0] // AAD
 /// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
 /// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
-/// MEM[__h+511:__h+384] := 0 // Reserved for future usage
-/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
-/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
 /// OF := 0
 /// SF := 0
 /// ZF := 0
@@ -136,7 +133,7 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
 }
 
 /// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then
-/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and
+/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 3 and
 /// a 32-bit value as return.
 /// The explicit source operand __htype specifies handle restrictions.
 ///
@@ -160,9 +157,6 @@ _mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
 /// MEM[__h+255:__h+128] := Handle[255:128] // Tag
 /// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
 /// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
-/// MEM[__h+639:__h+512] := 0 // Reserved for future usage
-/// MEM[__h+767:__h+640] := 0 // Reserved for future usage
-/// MEM[__h+895:__h+768] := 0 Integrity// Reserved for future usage
 /// OF := 0
 /// SF := 0
 /// ZF := 0
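The keylockerintrin.h edits only fix the documented handle layout: a wrapped 128-bit key occupies three 128-bit blocks, not six. A hedged sketch of sizing the output buffer accordingly; the htype value 0 (meaning no extra handle restrictions) and the -mkl build flag are assumptions, not taken from the commit:

    #include <immintrin.h>

    unsigned int wrap_key(__m128i key, __m128i handle_out[3]) {
      /* per the corrected comment, the handle spans ((__m128i*)h) .. +2,
       * i.e. 48 bytes, so a three-element buffer is sufficient */
      return _mm_encodekey128_u32(0, key, handle_out);
    }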
20  lib/include/limits.h  vendored
@@ -62,6 +62,26 @@
 
 #define CHAR_BIT __CHAR_BIT__
 
+/* C2x 5.2.4.2.1 */
+/* FIXME: This is using the placeholder dates Clang produces for these macros
+   in C2x mode; switch to the correct values once they've been published. */
+#if __STDC_VERSION__ >= 202000L
+#define BOOL_WIDTH   __BOOL_WIDTH__
+#define CHAR_WIDTH   CHAR_BIT
+#define SCHAR_WIDTH  CHAR_BIT
+#define UCHAR_WIDTH  CHAR_BIT
+#define USHRT_WIDTH  __SHRT_WIDTH__
+#define SHRT_WIDTH   __SHRT_WIDTH__
+#define UINT_WIDTH   __INT_WIDTH__
+#define INT_WIDTH    __INT_WIDTH__
+#define ULONG_WIDTH  __LONG_WIDTH__
+#define LONG_WIDTH   __LONG_WIDTH__
+#define ULLONG_WIDTH __LLONG_WIDTH__
+#define LLONG_WIDTH  __LLONG_WIDTH__
+
+#define BITINT_MAXWIDTH __BITINT_MAXWIDTH__
+#endif
+
 #ifdef __CHAR_UNSIGNED__ /* -funsigned-char */
 #define CHAR_MIN 0
 #define CHAR_MAX UCHAR_MAX
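limits.h gains the C2x *_WIDTH macros. A minimal sketch of querying them, assuming the file is compiled in a C2x-capable standard mode (per the __STDC_VERSION__ >= 202000L guard above):

    #include <limits.h>
    #include <stdio.h>

    int main(void) {
    #if defined(INT_WIDTH) && defined(ULLONG_WIDTH)
      /* widths come straight from the new macros instead of being derived
       * from the *_MAX constants */
      printf("int is %d bits, unsigned long long is %d bits\n",
             INT_WIDTH, ULLONG_WIDTH);
    #endif
      return 0;
    }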
4  lib/include/mmintrin.h  vendored
@@ -10,6 +10,10 @@
 #ifndef __MMINTRIN_H
 #define __MMINTRIN_H
 
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
 typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
 
 typedef long long __v1di __attribute__((__vector_size__(8)));
4  lib/include/nmmintrin.h  vendored
@@ -10,6 +10,10 @@
 #ifndef __NMMINTRIN_H
 #define __NMMINTRIN_H
 
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
 /* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
    just include it now then. */
 #include <smmintrin.h>
78  lib/include/opencl-c-base.h  vendored
@@ -12,8 +12,8 @@
 // Define extension macros
 
 #if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
-// For SPIR all extensions are supported.
-#if defined(__SPIR__)
+// For SPIR and SPIR-V all extensions are supported.
+#if defined(__SPIR__) || defined(__SPIRV__)
 #define cl_khr_subgroup_extended_types 1
 #define cl_khr_subgroup_non_uniform_vote 1
 #define cl_khr_subgroup_ballot 1
@@ -25,12 +25,31 @@
 #define cl_khr_integer_dot_product 1
 #define __opencl_c_integer_dot_product_input_4x8bit 1
 #define __opencl_c_integer_dot_product_input_4x8bit_packed 1
+#define cl_ext_float_atomics 1
+#ifdef cl_khr_fp16
+#define __opencl_c_ext_fp16_global_atomic_load_store 1
+#define __opencl_c_ext_fp16_local_atomic_load_store 1
+#define __opencl_c_ext_fp16_global_atomic_add 1
+#define __opencl_c_ext_fp16_local_atomic_add 1
+#define __opencl_c_ext_fp16_global_atomic_min_max 1
+#define __opencl_c_ext_fp16_local_atomic_min_max 1
+#endif
+#ifdef cl_khr_fp64
+#define __opencl_c_ext_fp64_global_atomic_add 1
+#define __opencl_c_ext_fp64_local_atomic_add 1
+#define __opencl_c_ext_fp64_global_atomic_min_max 1
+#define __opencl_c_ext_fp64_local_atomic_min_max 1
+#endif
+#define __opencl_c_ext_fp32_global_atomic_add 1
+#define __opencl_c_ext_fp32_local_atomic_add 1
+#define __opencl_c_ext_fp32_global_atomic_min_max 1
+#define __opencl_c_ext_fp32_local_atomic_min_max 1
 
-#endif // defined(__SPIR__)
+#endif // defined(__SPIR__) || defined(__SPIRV__)
 #endif // (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ >= 200)
 
 // Define feature macros for OpenCL C 2.0
-#if (defined(__OPENCL_CPP_VERSION__) || __OPENCL_C_VERSION__ == 200)
+#if (__OPENCL_CPP_VERSION__ == 100 || __OPENCL_C_VERSION__ == 200)
 #define __opencl_c_pipes 1
 #define __opencl_c_generic_address_space 1
 #define __opencl_c_work_group_collective_functions 1
@@ -45,12 +64,19 @@
 #endif
 
 // Define header-only feature macros for OpenCL C 3.0.
-#if (__OPENCL_C_VERSION__ == 300)
-// For the SPIR target all features are supported.
-#if defined(__SPIR__)
+#if (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
+// For the SPIR and SPIR-V target all features are supported.
+#if defined(__SPIR__) || defined(__SPIRV__)
 #define __opencl_c_atomic_scope_all_devices 1
+#define __opencl_c_read_write_images 1
 #endif // defined(__SPIR__)
-#endif // (__OPENCL_C_VERSION__ == 300)
+#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
+
+#if !defined(__opencl_c_generic_address_space)
+// Internal feature macro to provide named (global, local, private) address
+// space overloads for builtin functions that take a pointer argument.
+#define __opencl_c_named_address_space_builtins 1
+#endif // !defined(__opencl_c_generic_address_space)
 
 // built-in scalar data types:
 
@@ -329,11 +355,17 @@ typedef enum memory_scope {
   memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
 #if defined(__opencl_c_atomic_scope_all_devices)
   memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0)
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
   memory_scope_all_devices = memory_scope_all_svm_devices,
-#endif // __OPENCL_C_VERSION__ >= CL_VERSION_3_0
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
 #endif // defined(__opencl_c_atomic_scope_all_devices)
-#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+/**
+ * Subgroups have different requirements on forward progress, so just test
+ * all the relevant macros.
+ * CL 3.0 sub-groups "they are not guaranteed to make independent forward progress"
+ * KHR subgroups "Subgroups within a workgroup are independent, make forward progress with respect to each other"
+ */
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || defined(__opencl_c_subgroups)
   memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
 #endif
 } memory_scope;
@@ -473,12 +505,14 @@ typedef int clk_profiling_info;
 
 #define MAX_WORK_DIM 3
 
+#ifdef __opencl_c_device_enqueue
 typedef struct {
   unsigned int workDimension;
   size_t globalWorkOffset[MAX_WORK_DIM];
   size_t globalWorkSize[MAX_WORK_DIM];
   size_t localWorkSize[MAX_WORK_DIM];
 } ndrange_t;
+#endif // __opencl_c_device_enqueue
 
 #endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
 
@@ -572,6 +606,28 @@ typedef struct {
 #define as_intptr_t(x) __builtin_astype((x), intptr_t)
 #define as_uintptr_t(x) __builtin_astype((x), uintptr_t)
 
+// C++ for OpenCL - __remove_address_space
+#if defined(__OPENCL_CPP_VERSION__)
+template <typename _Tp> struct __remove_address_space { using type = _Tp; };
+#if defined(__opencl_c_generic_address_space)
+template <typename _Tp> struct __remove_address_space<__generic _Tp> {
+  using type = _Tp;
+};
+#endif
+template <typename _Tp> struct __remove_address_space<__global _Tp> {
+  using type = _Tp;
+};
+template <typename _Tp> struct __remove_address_space<__private _Tp> {
+  using type = _Tp;
+};
+template <typename _Tp> struct __remove_address_space<__local _Tp> {
+  using type = _Tp;
+};
+template <typename _Tp> struct __remove_address_space<__constant _Tp> {
+  using type = _Tp;
+};
+#endif
+
 // OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
 
 #define __kernel_exec(X, typen) __kernel \
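opencl-c-base.h now advertises cl_ext_float_atomics and its fine-grained feature macros for SPIR/SPIR-V targets. An illustrative OpenCL C preprocessor fragment showing how a kernel might gate its use of fp32 global atomic add on those macros (the fallback strategy mentioned in the comments is an assumption, not something the commit prescribes):

    #if defined(cl_ext_float_atomics) && \
        defined(__opencl_c_ext_fp32_global_atomic_add)
      /* safe to use the fp32 global atomic add built-ins from
       * cl_ext_float_atomics in this kernel */
    #else
      /* fall back, e.g. to a compare-and-swap loop or integer accumulation */
    #endif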
2162  lib/include/opencl-c.h  vendored
File diff suppressed because it is too large. Load Diff
13  lib/include/openmp_wrappers/complex  vendored
@@ -17,9 +17,18 @@
 // We require std::math functions in the complex builtins below.
 #include <cmath>
 
+#ifdef __NVPTX__
 #define __OPENMP_NVPTX__
 #include <__clang_cuda_complex_builtins.h>
 #undef __OPENMP_NVPTX__
+#endif // __NVPTX__
+
+#ifdef __AMDGCN__
+#define __OPENMP_AMDGCN__
+#include <__clang_cuda_complex_builtins.h>
+#undef __OPENMP_AMDGCN__
+#endif // __AMDGCN__
 
 #endif
 
 // Grab the host header too.
@@ -36,11 +45,11 @@
 #ifndef _LIBCPP_STD_VER
 
 #pragma omp begin declare variant match( \
-    device = {arch(nvptx, nvptx64)}, \
+    device = {arch(amdgcn, nvptx, nvptx64)}, \
     implementation = {extension(match_any, allow_templates)})
 
 #include <complex_cmath.h>
 
 #pragma omp end declare variant
 
-#endif
+#endif // _LIBCPP_STD_VER
9
lib/include/openmp_wrappers/complex.h
vendored
9
lib/include/openmp_wrappers/complex.h
vendored
@ -17,10 +17,19 @@
// We require math functions in the complex builtins below.
#include <math.h>

#ifdef __NVPTX__
#define __OPENMP_NVPTX__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_NVPTX__
#endif

#ifdef __AMDGCN__
#define __OPENMP_AMDGCN__
#include <__clang_cuda_complex_builtins.h>
#undef __OPENMP_AMDGCN__
#endif

#endif

// Grab the host header too.
#include_next <complex.h>
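A minimal usage sketch (not part of the diff) of what these wrappers enable: complex arithmetic inside an OpenMP target region in plain C. The file name, function name, and offload flags below are assumptions.

/* sketch.c -- hypothetical example, assuming an OpenMP offload toolchain,
   e.g. clang -fopenmp -fopenmp-targets=nvptx64 sketch.c (assumption). */
#include <complex.h>

double complex device_mul(double complex a, double complex b) {
  double complex p = 0;
#pragma omp target map(to : a, b) map(from : p)
  {
    /* Complex multiply on the device; the __muldc3-style helpers come from
       __clang_cuda_complex_builtins.h via this wrapper. */
    p = a * b;
  }
  return p;
}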
4 lib/include/pmmintrin.h vendored
@ -10,6 +10,10 @@
#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <emmintrin.h>

/* Define the default attributes for the functions in this file. */
5 lib/include/ppc_wrappers/emmintrin.h vendored
@ -35,7 +35,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>

@ -2319,6 +2319,7 @@ _mm_castsi128_pd(__m128i __A)

#else
#include_next <emmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
          */

#endif /* EMMINTRIN_H_ */
2 lib/include/ppc_wrappers/mm_malloc.h vendored
@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <stdlib.h>
5 lib/include/ppc_wrappers/mmintrin.h vendored
@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
@ -1445,6 +1445,7 @@ extern __inline __m64

#else
#include_next <mmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
          */

#endif /* _MMINTRIN_H_INCLUDED */
5 lib/include/ppc_wrappers/pmmintrin.h vendored
@ -38,7 +38,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

/* We need definitions from the SSE2 and SSE header files*/
#include <emmintrin.h>
@ -145,6 +145,7 @@ _mm_lddqu_si128 (__m128i const *__P)

#else
#include_next <pmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
          */

#endif /* PMMINTRIN_H_ */
7 lib/include/ppc_wrappers/smmintrin.h vendored
@ -29,10 +29,10 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
#include <emmintrin.h>
#include <tmmintrin.h>

extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
@ -104,6 +104,7 @@ extern __inline __m128i

#else
#include_next <smmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
          */

#endif /* _SMMINTRIN_H_ */
5 lib/include/ppc_wrappers/tmmintrin.h vendored
@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>

@ -490,6 +490,7 @@ _mm_mulhrs_pi16 (__m64 __A, __m64 __B)

#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
          */

#endif /* TMMINTRIN_H_ */
5 lib/include/ppc_wrappers/xmmintrin.h vendored
@ -34,7 +34,7 @@
#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

#if defined(__linux__) && defined(__ppc64__)
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

/* Define four value permute mask */
#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
@ -1838,6 +1838,7 @@ do { \

#else
#include_next <xmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
          */

#endif /* _XMMINTRIN_H_INCLUDED */
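A hedged sketch of what the widened guard permits: the same SSE2-style code can now go through these Altivec-backed wrappers on powerpc64 FreeBSD as well as Linux. The file name and compiler flags are assumptions, not part of the diff.

/* sum4.c -- hypothetical example; on ppc64 something like
   clang -maltivec -mvsx -DNO_WARN_X86_INTRINSICS sum4.c (assumption). */
#include <emmintrin.h>

int sum4(const int *p) {
  __m128i v = _mm_loadu_si128((const __m128i *)p); /* load 4 x i32 */
  v = _mm_add_epi32(v, _mm_srli_si128(v, 8));      /* add high pair onto low pair */
  v = _mm_add_epi32(v, _mm_srli_si128(v, 4));      /* fold remaining lane */
  return _mm_cvtsi128_si32(v);                     /* total in the low lane */
}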
7 lib/include/prfchwintrin.h vendored
@ -47,9 +47,12 @@ _m_prefetch(void *__P)
/// \param __P
///    A pointer specifying the memory address to be prefetched.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_m_prefetchw(void *__P)
_m_prefetchw(volatile const void *__P)
{
  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wcast-qual"
  __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
#pragma clang diagnostic pop
}

#endif /* __PRFCHWINTRIN_H */
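A small sketch (names illustrative, not from the diff) of what the new `volatile const void *` parameter buys: a caller holding a pointer to const data no longer needs a cast, and the cast-qual warning is suppressed inside the header instead.

/* prefetch_demo.c -- hypothetical; build with e.g. clang -mprfchw (assumption). */
#include <x86intrin.h>

long sum_with_write_hint(const long *table, int n) {
  long acc = 0;
  for (int i = 0; i + 1 < n; ++i) {
    _m_prefetchw(&table[i + 1]); /* const pointer now accepted directly */
    acc += table[i];
  }
  return acc;
}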
218900 lib/include/riscv_vector.h vendored
File diff suppressed because it is too large
225 lib/include/smmintrin.h vendored
@ -10,6 +10,10 @@
#ifndef __SMMINTRIN_H
#define __SMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <tmmintrin.h>

/* Define the default attributes for the functions in this file. */
@ -231,7 +235,7 @@
///    11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_round_ps(X, M) \
  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
@ -272,8 +276,8 @@
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_round_ss(X, Y, M) \
  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
  (__v4sf)(__m128)(Y), (M))
  (__v4sf)(__m128)(Y), (M)))

/// Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
@ -306,7 +310,7 @@
///    11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_round_pd(X, M) \
  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
@ -347,8 +351,8 @@
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_round_sd(X, Y, M) \
  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
  (__v2df)(__m128d)(Y), (M))
  (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics. */
/// Returns a 128-bit vector of [2 x double] where the values are
@ -376,8 +380,8 @@
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_blend_pd(V1, V2, M) \
  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
  (__v2df)(__m128d)(V2), (int)(M))
  (__v2df)(__m128d)(V2), (int)(M)))

/// Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
@ -404,8 +408,8 @@
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_blend_ps(V1, V2, M) \
  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
  (__v4sf)(__m128)(V2), (int)(M))
  (__v4sf)(__m128)(V2), (int)(M)))

/// Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
@ -513,8 +517,8 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
#define _mm_blend_epi16(V1, V2, M) \
  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
  (__v8hi)(__m128i)(V2), (int)(M))
  (__v8hi)(__m128i)(V2), (int)(M)))

/* SSE4 Dword Multiply Instructions. */
/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
@ -590,8 +594,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
///    in the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [4 x float] containing the dot product.
#define _mm_dp_ps(X, Y, M) \
  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
  ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
  (__v4sf)(__m128)(Y), (M))
  (__v4sf)(__m128)(Y), (M)))

/// Computes the dot product of the two 128-bit vectors of [2 x double]
///    and returns it in the elements of the 128-bit result vector of
@ -625,8 +629,8 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
///    each [2 x double] vector. If a bit is set, the dot product is returned in
///    the corresponding element; otherwise that element is set to zero.
#define _mm_dp_pd(X, Y, M) \
  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
  ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
  (__v2df)(__m128d)(Y), (M))
  (__v2df)(__m128d)(Y), (M)))

/* SSE4 Streaming Load Hint Instruction. */
/// Loads integer values from a 128-bit aligned memory location to a
@ -664,7 +668,7 @@ _mm_stream_load_si128 (__m128i const *__V)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
  return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -683,7 +687,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi8 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
  return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -702,7 +706,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
  return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -721,7 +725,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu16 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
  return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -740,7 +744,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
  return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -759,7 +763,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
  return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -778,7 +782,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
  return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
}

/// Compares the corresponding elements of two 128-bit vectors of
@ -797,7 +801,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu32 (__m128i __V1, __m128i __V2)
{
  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
  return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
}

/* SSE4 Insertion and Extraction from XMM Register Instructions. */
@ -865,15 +869,13 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__ \
#define _mm_extract_ps(X, N) \
  ({ union { int __i; float __f; } __t; \
  __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
  __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
  __t.__i;}))

/* Miscellaneous insert and extract macros. */
/* Extract a single-precision float from X at index N into D. */
#define _MM_EXTRACT_FLOAT(D, X, N) \
  { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
  do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps. */
@ -925,8 +927,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
  (int)(I), (int)(N))
  (int)(I), (int)(N)))

/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
@ -957,8 +959,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
  (int)(I), (int)(N))
  (int)(I), (int)(N)))

#ifdef __x86_64__
/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
@ -988,8 +990,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
///    1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
  (long long)(I), (int)(N))
  (long long)(I), (int)(N)))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index. This returns the element
@ -1031,8 +1033,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
#define _mm_extract_epi8(X, N) \
  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
  (int)(N))
  (int)(N)))

/// Extracts a 32-bit element from the 128-bit integer vector of
///    [4 x i32], using the immediate value parameter \a N as a selector.
@ -1057,7 +1059,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
/// \returns An integer, whose lower 32 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi32(X, N) \
  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
  ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))

#ifdef __x86_64__
/// Extracts a 64-bit element from the 128-bit integer vector of
@ -1080,7 +1082,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
///    1: Bits [127:64] are returned. \n
/// \returns A 64-bit integer.
#define _mm_extract_epi64(X, N) \
  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
  ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif /* __x86_64 */

/* SSE4 128-bit Packed Integer Comparisons. */
@ -1514,8 +1516,8 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M) \
  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
  ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
  (__v16qi)(__m128i)(Y), (M))
  (__v16qi)(__m128i)(Y), (M)))

/// Finds the minimum unsigned 16-bit element in the input 128-bit
///    vector of [8 x u16] and returns it and along with its index.
@ -1624,8 +1626,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M) \
  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
@ -1678,8 +1680,8 @@ _mm_minpos_epu16(__m128i __V)
///    1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -1738,9 +1740,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -1797,9 +1799,9 @@ _mm_minpos_epu16(__m128i __V)
///    1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
/// Uses the immediate operand \a M to perform a comparison of string
@ -1849,8 +1851,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
@ -1898,8 +1900,8 @@ _mm_minpos_epu16(__m128i __V)
///    to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
@ -1946,8 +1948,8 @@ _mm_minpos_epu16(__m128i __V)
///    to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
@ -1996,8 +1998,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
@ -2046,8 +2048,8 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
  (__v16qi)(__m128i)(B), (int)(M))
  (__v16qi)(__m128i)(B), (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -2100,9 +2102,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -2154,9 +2156,9 @@ _mm_minpos_epu16(__m128i __V)
///    to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -2207,9 +2209,9 @@ _mm_minpos_epu16(__m128i __V)
///    to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -2262,9 +2264,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/// Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
@ -2316,9 +2318,9 @@ _mm_minpos_epu16(__m128i __V)
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
  (__v16qi)(__m128i)(B), (int)(LB), \
  (int)(M))
  (int)(M)))

/* SSE4.2 Compare Packed Data -- Greater Than. */
/// Compares each of the corresponding 64-bit values of the 128-bit
@ -2340,91 +2342,10 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
}

/* SSE4.2 Accumulate CRC32. */
/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned char operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u8(unsigned int __C, unsigned char __D)
{
  return __builtin_ia32_crc32qi(__C, __D);
}

/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned short operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u16(unsigned int __C, unsigned short __D)
{
  return __builtin_ia32_crc32hi(__C, __D);
}

/// Adds the first unsigned integer operand to the CRC-32C checksum of
///    the second unsigned integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_mm_crc32_u32(unsigned int __C, unsigned int __D)
{
  return __builtin_ia32_crc32si(__C, __D);
}

#ifdef __x86_64__
/// Adds the unsigned integer operand to the CRC-32C checksum of the
///    unsigned 64-bit integer operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
///
/// \param __C
///    An unsigned integer operand to add to the CRC-32C checksum of operand
///    \a __D.
/// \param __D
///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
/// \returns The result of adding operand \a __C to the CRC-32C checksum of
///    operand \a __D.
static __inline__ unsigned long long __DEFAULT_FN_ATTRS
_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
{
  return __builtin_ia32_crc32di(__C, __D);
}
#endif /* __x86_64__ */

#undef __DEFAULT_FN_ATTRS

#include <popcntintrin.h>

#include <crc32intrin.h>

#endif /* __SMMINTRIN_H */
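One reason for the do/while and extra-parentheses changes above, shown as a hedged sketch (file name and function are illustrative): the statement-like macro now composes safely with if/else, and the expression macros can sit inside larger expressions without surprises.

/* macro_hygiene.c -- hypothetical example; build with -msse4.1 (assumption). */
#include <smmintrin.h>

float first_or_zero(__m128 v, int want_first) {
  float f;
  if (want_first)
    _MM_EXTRACT_FLOAT(f, v, 0); /* do { ... } while (0) keeps the else legal */
  else
    f = 0.0f;
  return f;
}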
15 lib/include/stdatomic.h vendored
@ -12,8 +12,12 @@

/* If we're hosted, fall back to the system's stdatomic.h. FreeBSD, for
 * example, already has a Clang-compatible stdatomic.h header.
 *
 * Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
 * explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
 * to the clang resource header until that is fully supported.
 */
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>)
#if __STDC_HOSTED__ && __has_include_next(<stdatomic.h>) && !defined(_MSC_VER)
# include_next <stdatomic.h>
#else

@ -40,6 +44,11 @@ extern "C" {
/* 7.17.2 Initialization */

#define ATOMIC_VAR_INIT(value) (value)
#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
    !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
#endif
#define atomic_init __c11_atomic_init

/* 7.17.3 Order and consistency */
@ -149,6 +158,10 @@ typedef _Atomic(uintmax_t) atomic_uintmax_t;
typedef struct atomic_flag { atomic_bool _Value; } atomic_flag;

#define ATOMIC_FLAG_INIT { 0 }
#if __cplusplus >= 202002L && !defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_FLAG_INIT was deprecated in C++20 but is not deprecated in C. */
#pragma clang deprecated(ATOMIC_FLAG_INIT)
#endif

/* These should be provided by the libc implementation. */
#ifdef __cplusplus
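A hedged sketch of how C code adapts to the deprecations above: static initialization no longer needs ATOMIC_VAR_INIT, while ATOMIC_FLAG_INIT stays fine in C (the pragma only fires for C++20). Names below are illustrative.

/* atomics_demo.c -- hypothetical example, plain C11/C17. */
#include <stdatomic.h>

static atomic_int counter = 0;              /* instead of ATOMIC_VAR_INIT(0) */
static atomic_flag busy = ATOMIC_FLAG_INIT; /* not deprecated in C */

void bump(void) {
  if (!atomic_flag_test_and_set(&busy)) {
    atomic_fetch_add_explicit(&counter, 1, memory_order_relaxed);
    atomic_flag_clear(&busy);
  }
}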
168 lib/include/stdint.h vendored
@ -461,6 +461,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT64_MAX INT64_C( 9223372036854775807)
# define INT64_MIN (-INT64_C( 9223372036854775807)-1)
# define UINT64_MAX UINT64_C(18446744073709551615)
/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT64_WIDTH 64
# define INT64_WIDTH UINT64_WIDTH

# define __UINT_LEAST64_WIDTH UINT64_WIDTH
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __STDC_VERSION__ */

# define __INT_LEAST64_MIN INT64_MIN
# define __INT_LEAST64_MAX INT64_MAX
# define __UINT_LEAST64_MAX UINT64_MAX
@ -482,6 +494,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST64_MIN __INT_LEAST64_MIN
# define INT_FAST64_MAX __INT_LEAST64_MAX
# define UINT_FAST64_MAX __UINT_LEAST64_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_FAST64_WIDTH UINT_FAST64_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST64_MIN */

@ -495,6 +516,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MIN INT56_MIN
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX

# define __INT_LEAST32_MIN INT56_MIN
# define __INT_LEAST32_MAX INT56_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
@ -504,6 +526,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT56_MIN
# define __INT_LEAST8_MAX INT56_MAX
# define __UINT_LEAST8_MAX UINT56_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */

@ -517,6 +553,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MIN INT48_MIN
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX

# define __INT_LEAST32_MIN INT48_MIN
# define __INT_LEAST32_MAX INT48_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
@ -526,6 +563,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT48_MIN
# define __INT_LEAST8_MAX INT48_MAX
# define __UINT_LEAST8_MAX UINT48_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */

@ -539,6 +590,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MIN INT40_MIN
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX

# define __INT_LEAST32_MIN INT40_MIN
# define __INT_LEAST32_MAX INT40_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
@ -548,6 +600,20 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT40_MIN
# define __INT_LEAST8_MAX INT40_MAX
# define __UINT_LEAST8_MAX UINT40_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */

@ -555,6 +621,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MAX INT32_C(2147483647)
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)

# define __INT_LEAST32_MIN INT32_MIN
# define __INT_LEAST32_MAX INT32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
@ -564,6 +631,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define __INT_LEAST8_MIN INT32_MIN
# define __INT_LEAST8_MAX INT32_MAX
# define __UINT_LEAST8_MAX UINT32_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */

#ifdef __INT_LEAST32_MIN
@ -573,6 +650,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST32_MIN __INT_LEAST32_MIN
# define INT_FAST32_MAX __INT_LEAST32_MAX
# define UINT_FAST32_MAX __UINT_LEAST32_MAX

/* FIXME: This is using the placeholder dates Clang produces for these macros
   in C2x mode; switch to the correct values once they've been published. */
#if __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_FAST32_WIDTH UINT_FAST32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT_LEAST32_MIN */
@ -586,12 +672,26 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
# define INT_FAST24_MIN INT24_MIN
|
# define INT_FAST24_MIN INT24_MIN
|
||||||
# define INT_FAST24_MAX INT24_MAX
|
# define INT_FAST24_MAX INT24_MAX
|
||||||
# define UINT_FAST24_MAX UINT24_MAX
|
# define UINT_FAST24_MAX UINT24_MAX
|
||||||
|
|
||||||
# define __INT_LEAST16_MIN INT24_MIN
|
# define __INT_LEAST16_MIN INT24_MIN
|
||||||
# define __INT_LEAST16_MAX INT24_MAX
|
# define __INT_LEAST16_MAX INT24_MAX
|
||||||
# define __UINT_LEAST16_MAX UINT24_MAX
|
# define __UINT_LEAST16_MAX UINT24_MAX
|
||||||
# define __INT_LEAST8_MIN INT24_MIN
|
# define __INT_LEAST8_MIN INT24_MIN
|
||||||
# define __INT_LEAST8_MAX INT24_MAX
|
# define __INT_LEAST8_MAX INT24_MAX
|
||||||
# define __UINT_LEAST8_MAX UINT24_MAX
|
# define __UINT_LEAST8_MAX UINT24_MAX
|
||||||
|
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
# define UINT24_WIDTH 24
|
||||||
|
# define INT24_WIDTH UINT24_WIDTH
|
||||||
|
# define UINT_LEAST24_WIDTH UINT24_WIDTH
|
||||||
|
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
|
||||||
|
# define UINT_FAST24_WIDTH UINT24_WIDTH
|
||||||
|
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
|
||||||
|
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
|
||||||
|
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
|
||||||
|
#endif /* __STDC_VERSION__ */
|
||||||
#endif /* __INT24_TYPE__ */
|
#endif /* __INT24_TYPE__ */
|
||||||
|
|
||||||
|
|
||||||
@ -599,12 +699,22 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
#define INT16_MAX INT16_C(32767)
|
#define INT16_MAX INT16_C(32767)
|
||||||
#define INT16_MIN (-INT16_C(32767)-1)
|
#define INT16_MIN (-INT16_C(32767)-1)
|
||||||
#define UINT16_MAX UINT16_C(65535)
|
#define UINT16_MAX UINT16_C(65535)
|
||||||
|
|
||||||
# define __INT_LEAST16_MIN INT16_MIN
|
# define __INT_LEAST16_MIN INT16_MIN
|
||||||
# define __INT_LEAST16_MAX INT16_MAX
|
# define __INT_LEAST16_MAX INT16_MAX
|
||||||
# define __UINT_LEAST16_MAX UINT16_MAX
|
# define __UINT_LEAST16_MAX UINT16_MAX
|
||||||
# define __INT_LEAST8_MIN INT16_MIN
|
# define __INT_LEAST8_MIN INT16_MIN
|
||||||
# define __INT_LEAST8_MAX INT16_MAX
|
# define __INT_LEAST8_MAX INT16_MAX
|
||||||
# define __UINT_LEAST8_MAX UINT16_MAX
|
# define __UINT_LEAST8_MAX UINT16_MAX
|
||||||
|
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
# define UINT16_WIDTH 16
|
||||||
|
# define INT16_WIDTH UINT16_WIDTH
|
||||||
|
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
|
||||||
|
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
|
||||||
|
#endif /* __STDC_VERSION__ */
|
||||||
#endif /* __INT16_TYPE__ */
|
#endif /* __INT16_TYPE__ */
|
||||||
|
|
||||||
#ifdef __INT_LEAST16_MIN
|
#ifdef __INT_LEAST16_MIN
|
||||||
@ -614,6 +724,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
# define INT_FAST16_MIN __INT_LEAST16_MIN
|
# define INT_FAST16_MIN __INT_LEAST16_MIN
|
||||||
# define INT_FAST16_MAX __INT_LEAST16_MAX
|
# define INT_FAST16_MAX __INT_LEAST16_MAX
|
||||||
# define UINT_FAST16_MAX __UINT_LEAST16_MAX
|
# define UINT_FAST16_MAX __UINT_LEAST16_MAX
|
||||||
|
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
|
||||||
|
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
|
||||||
|
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
|
||||||
|
# define INT_FAST16_WIDTH UINT_FAST16_WIDTH
|
||||||
|
#endif /* __STDC_VERSION__ */
|
||||||
#endif /* __INT_LEAST16_MIN */
|
#endif /* __INT_LEAST16_MIN */
|
||||||
|
|
||||||
|
|
||||||
@ -621,9 +740,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
# define INT8_MAX INT8_C(127)
|
# define INT8_MAX INT8_C(127)
|
||||||
# define INT8_MIN (-INT8_C(127)-1)
|
# define INT8_MIN (-INT8_C(127)-1)
|
||||||
# define UINT8_MAX UINT8_C(255)
|
# define UINT8_MAX UINT8_C(255)
|
||||||
|
|
||||||
# define __INT_LEAST8_MIN INT8_MIN
|
# define __INT_LEAST8_MIN INT8_MIN
|
||||||
# define __INT_LEAST8_MAX INT8_MAX
|
# define __INT_LEAST8_MAX INT8_MAX
|
||||||
# define __UINT_LEAST8_MAX UINT8_MAX
|
# define __UINT_LEAST8_MAX UINT8_MAX
|
||||||
|
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
# define UINT8_WIDTH 8
|
||||||
|
# define INT8_WIDTH UINT8_WIDTH
|
||||||
|
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
|
||||||
|
#endif /* __STDC_VERSION__ */
|
||||||
#endif /* __INT8_TYPE__ */
|
#endif /* __INT8_TYPE__ */
|
||||||
|
|
||||||
#ifdef __INT_LEAST8_MIN
|
#ifdef __INT_LEAST8_MIN
|
||||||
@ -633,6 +761,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
# define INT_FAST8_MIN __INT_LEAST8_MIN
|
# define INT_FAST8_MIN __INT_LEAST8_MIN
|
||||||
# define INT_FAST8_MAX __INT_LEAST8_MAX
|
# define INT_FAST8_MAX __INT_LEAST8_MAX
|
||||||
# define UINT_FAST8_MAX __UINT_LEAST8_MAX
|
# define UINT_FAST8_MAX __UINT_LEAST8_MAX
|
||||||
|
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
|
||||||
|
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
|
||||||
|
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH
|
||||||
|
# define INT_FAST8_WIDTH UINT_FAST8_WIDTH
|
||||||
|
#endif /* __STDC_VERSION__ */
|
||||||
#endif /* __INT_LEAST8_MIN */
|
#endif /* __INT_LEAST8_MIN */
|
||||||
|
|
||||||
/* Some utility macros */
|
/* Some utility macros */
|
||||||
@ -652,6 +789,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
#define PTRDIFF_MAX __PTRDIFF_MAX__
|
#define PTRDIFF_MAX __PTRDIFF_MAX__
|
||||||
#define SIZE_MAX __SIZE_MAX__
|
#define SIZE_MAX __SIZE_MAX__
|
||||||
|
|
||||||
|
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
/* NB: The C standard requires that these be the same value, but the compiler
|
||||||
|
exposes separate internal width macros. */
|
||||||
|
#define INTPTR_WIDTH __INTPTR_WIDTH__
|
||||||
|
#define UINTPTR_WIDTH __UINTPTR_WIDTH__
|
||||||
|
#endif
|
||||||
|
|
||||||
/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
|
/* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
|
||||||
* is enabled. */
|
* is enabled. */
|
||||||
#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
|
#if defined(__STDC_WANT_LIB_EXT1__) && __STDC_WANT_LIB_EXT1__ >= 1
|
||||||
@ -663,6 +810,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
#define INTMAX_MAX __INTMAX_MAX__
|
#define INTMAX_MAX __INTMAX_MAX__
|
||||||
#define UINTMAX_MAX __UINTMAX_MAX__
|
#define UINTMAX_MAX __UINTMAX_MAX__
|
||||||
|
|
||||||
|
/* C2x 7.20.2.5 Width of greatest-width integer types. */
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
/* NB: The C standard requires that these be the same value, but the compiler
|
||||||
|
exposes separate internal width macros. */
|
||||||
|
#define INTMAX_WIDTH __INTMAX_WIDTH__
|
||||||
|
#define UINTMAX_WIDTH __UINTMAX_WIDTH__
|
||||||
|
#endif
|
||||||
|
|
||||||
/* C99 7.18.3 Limits of other integer types. */
|
/* C99 7.18.3 Limits of other integer types. */
|
||||||
#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
|
#define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
|
||||||
#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
|
#define SIG_ATOMIC_MAX __INTN_MAX(__SIG_ATOMIC_WIDTH__)
|
||||||
@ -689,5 +846,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
|
|||||||
#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
|
#define INTMAX_C(v) __int_c(v, __INTMAX_C_SUFFIX__)
|
||||||
#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
|
#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
|
||||||
|
|
||||||
|
/* C2x 7.20.3.x Width of other integer types. */
|
||||||
|
/* FIXME: This is using the placeholder dates Clang produces for these macros
|
||||||
|
in C2x mode; switch to the correct values once they've been published. */
|
||||||
|
#if __STDC_VERSION__ >= 202000L
|
||||||
|
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
|
||||||
|
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
|
||||||
|
#define SIZE_WIDTH __SIZE_WIDTH__
|
||||||
|
#define WCHAR_WIDTH __WCHAR_WIDTH__
|
||||||
|
#define WINT_WIDTH __WINT_WIDTH__
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif /* __STDC_HOSTED__ */
|
#endif /* __STDC_HOSTED__ */
|
||||||
#endif /* __CLANG_STDINT_H */
|
#endif /* __CLANG_STDINT_H */
|
||||||
|
|||||||
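The new *_WIDTH macros above only become visible when the header is compiled in C2x mode (__STDC_VERSION__ >= 202000L). A minimal sketch of how they might be exercised, assuming clang is invoked with -std=c2x; the file name and assertions are illustrative only:

/* width_check.c -- illustrative; build with: clang -std=c2x -c width_check.c */
#include <stdint.h>

/* Each width macro reports the bit width of the corresponding type. */
_Static_assert(INT8_WIDTH == 8, "int8_t is 8 bits");
_Static_assert(UINT16_WIDTH == 16, "uint16_t is 16 bits");
/* The pointer-width macros are required by the standard to agree. */
_Static_assert(INTPTR_WIDTH == UINTPTR_WIDTH, "pointer widths must match");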
16 lib/include/tmmintrin.h vendored
@ -10,6 +10,10 @@
#ifndef __TMMINTRIN_H
#define __TMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <pmmintrin.h>

/* Define the default attributes for the functions in this file. */
@ -49,7 +53,7 @@ _mm_abs_pi8(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)
{
  return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
  return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
}

/// Computes the absolute value of each of the packed 16-bit signed
@ -85,7 +89,7 @@ _mm_abs_pi16(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)
{
  return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
  return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
}

/// Computes the absolute value of each of the packed 32-bit signed
@ -121,7 +125,7 @@ _mm_abs_pi32(__m64 __a)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)
{
  return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
  return (__m128i)__builtin_elementwise_abs((__v4si)__a);
}

/// Concatenates the two 128-bit integer vector operands, and
@ -145,8 +149,8 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 128-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_epi8(a, b, n) \
  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n))
                                      (__v16qi)(__m128i)(b), (n)))

/// Concatenates the two 64-bit integer vector operands, and right-shifts
/// the result by the number of bytes specified in the immediate operand.
@ -168,7 +172,7 @@ _mm_abs_epi32(__m128i __a)
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) \
  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
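Two kinds of change appear in this file: the SSSE3 absolute-value intrinsics are now lowered through the generic __builtin_elementwise_abs instead of target-specific builtins, and the function-like macros gain one extra set of outer parentheses so that an expansion always parses as a single expression. A small usage sketch, illustrative only and assuming a translation unit built with -mssse3 (the helper name is made up):

#include <tmmintrin.h>

/* Concatenate a:b, take the 16 bytes starting at byte offset 4, then
   compute the per-lane absolute value of the 16-bit elements. */
static inline __m128i abs_of_shifted(__m128i a, __m128i b) {
  __m128i r = _mm_alignr_epi8(a, b, 4); /* macro now expands to ((__m128i)...) */
  return _mm_abs_epi16(r);              /* same behaviour, new builtin underneath */
}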
3 lib/include/unwind.h vendored
@ -172,7 +172,8 @@ typedef enum {
  _UVRSC_CORE = 0, /* integer register */
  _UVRSC_VFP = 1, /* vfp */
  _UVRSC_WMMXD = 3, /* Intel WMMX data register */
  _UVRSC_WMMXC = 4 /* Intel WMMX control register */
  _UVRSC_WMMXC = 4, /* Intel WMMX control register */
  _UVRSC_PSEUDO = 5 /* Special purpose pseudo register */
} _Unwind_VRS_RegClass;

typedef enum {
2 lib/include/vaesintrin.h vendored
@ -82,4 +82,4 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS_F
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_F

#endif
#endif // __VAESINTRIN_H
8 lib/include/vpclmulqdqintrin.h vendored
@ -15,15 +15,15 @@
#define __VPCLMULQDQINTRIN_H

#define _mm256_clmulepi64_epi128(A, B, I) \
  (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
  ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
                                        (__v4di)(__m256i)(B), \
                                        (char)(I))
                                        (char)(I)))

#ifdef __AVX512FINTRIN_H
#define _mm512_clmulepi64_epi128(A, B, I) \
  (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
  ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
                                        (__v8di)(__m512i)(B), \
                                        (char)(I))
                                        (char)(I)))
#endif // __AVX512FINTRIN_H

#endif /* __VPCLMULQDQINTRIN_H */
191 lib/include/wasm_simd128.h vendored
@ -276,12 +276,28 @@ wasm_i8x16_make(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3, int8_t __c4,
                           __c12, __c13, __c14, __c15};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u8x16_make(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3,
                uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7,
                uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11,
                uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15) {
  return (v128_t)(__u8x16){__c0, __c1, __c2,  __c3, __c4,  __c5,
                           __c6, __c7, __c8,  __c9, __c10, __c11,
                           __c12, __c13, __c14, __c15};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i16x8_make(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
                int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7) {
  return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u16x8_make(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3,
                uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7) {
  return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0,
                                                             int32_t __c1,
                                                             int32_t __c2,
@ -289,11 +305,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0,
  return (v128_t)(__i32x4){__c0, __c1, __c2, __c3};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_make(uint32_t __c0,
                                                             uint32_t __c1,
                                                             uint32_t __c2,
                                                             uint32_t __c3) {
  return (v128_t)(__u32x4){__c0, __c1, __c2, __c3};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_make(int64_t __c0,
                                                             int64_t __c1) {
  return (v128_t)(__i64x2){__c0, __c1};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_make(uint64_t __c0,
                                                             uint64_t __c1) {
  return (v128_t)(__u64x2){__c0, __c1};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_make(float __c0,
                                                             float __c1,
                                                             float __c2,
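The unsigned element types now get their own constructors, mirroring the existing signed wasm_i*_make functions. A possible use, sketched under the assumption that the file is compiled with -msimd128 (the helper name is illustrative):

#include <wasm_simd128.h>

/* Build a vector of eight 16-bit counters without casting through int16_t. */
static inline v128_t counters_start(void) {
  return wasm_u16x8_make(0, 1, 2, 3, 4, 5, 6, 7);
}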
@ -324,6 +352,24 @@ wasm_i8x16_const(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3,
                           __c12, __c13, __c14, __c15};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u8x16_const(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3,
                 uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7,
                 uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11,
                 uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
        __REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4)
            __REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6)
                __REQUIRE_CONSTANT(__c7) __REQUIRE_CONSTANT(__c8)
                    __REQUIRE_CONSTANT(__c9) __REQUIRE_CONSTANT(__c10)
                        __REQUIRE_CONSTANT(__c11) __REQUIRE_CONSTANT(__c12)
                            __REQUIRE_CONSTANT(__c13) __REQUIRE_CONSTANT(__c14)
                                __REQUIRE_CONSTANT(__c15) {
  return (v128_t)(__u8x16){__c0, __c1, __c2,  __c3, __c4,  __c5,
                           __c6, __c7, __c8,  __c9, __c10, __c11,
                           __c12, __c13, __c14, __c15};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
                 int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7)
@ -334,6 +380,16 @@ wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3,
  return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u16x8_const(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3,
                 uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
        __REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4)
            __REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6)
                __REQUIRE_CONSTANT(__c7) {
  return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
@ -341,12 +397,25 @@ wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3)
  return (v128_t)(__i32x4){__c0, __c1, __c2, __c3};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_u32x4_const(uint32_t __c0, uint32_t __c1, uint32_t __c2, uint32_t __c3)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
        __REQUIRE_CONSTANT(__c3) {
  return (v128_t)(__u32x4){__c0, __c1, __c2, __c3};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const(int64_t __c0,
                                                              int64_t __c1)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) {
  return (v128_t)(__i64x2){__c0, __c1};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const(uint64_t __c0,
                                                              uint64_t __c1)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) {
  return (v128_t)(__u64x2){__c0, __c1};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
wasm_f32x4_const(float __c0, float __c1, float __c2, float __c3)
    __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2)
@ -366,21 +435,42 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_const_splat(int8_t __c)
                           __c, __c, __c, __c, __c, __c, __c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_const_splat(uint8_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__u8x16){__c, __c, __c, __c, __c, __c, __c, __c,
                           __c, __c, __c, __c, __c, __c, __c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_const_splat(int16_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__i16x8){__c, __c, __c, __c, __c, __c, __c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_const_splat(uint16_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__u16x8){__c, __c, __c, __c, __c, __c, __c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_const_splat(int32_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__i32x4){__c, __c, __c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_const_splat(uint32_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__u32x4){__c, __c, __c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const_splat(int64_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__i64x2){__c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const_splat(uint64_t __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__u64x2){__c, __c};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_const_splat(float __c)
    __REQUIRE_CONSTANT(__c) {
  return (v128_t)(__f32x4){__c, __c, __c, __c};
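The new wasm_u*_const_splat helpers require a compile-time constant argument (enforced by __REQUIRE_CONSTANT), matching the existing signed versions. A sketch of intended use, under the same -msimd128 assumption and with an illustrative function name:

#include <wasm_simd128.h>

/* 0xFF in every byte lane, built from a constant rather than a runtime splat. */
static inline v128_t all_ones_bytes(void) {
  return wasm_u8x16_const_splat(0xFF);
}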
@ -396,6 +486,11 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) {
                           __a, __a, __a, __a, __a, __a, __a, __a};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_splat(uint8_t __a) {
  return (v128_t)(__u8x16){__a, __a, __a, __a, __a, __a, __a, __a,
                           __a, __a, __a, __a, __a, __a, __a, __a};
}

static __inline__ int8_t __DEFAULT_FN_ATTRS wasm_i8x16_extract_lane(v128_t __a,
                                                                    int __i)
    __REQUIRE_CONSTANT(__i) {
@ -417,10 +512,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_replace_lane(v128_t __a,
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_replace_lane(v128_t __a,
                                                                     int __i,
                                                                     uint8_t __b)
    __REQUIRE_CONSTANT(__i) {
  __u8x16 __v = (__u8x16)__a;
  __v[__i] = __b;
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) {
  return (v128_t)(__i16x8){__a, __a, __a, __a, __a, __a, __a, __a};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_splat(uint16_t __a) {
  return (v128_t)(__u16x8){__a, __a, __a, __a, __a, __a, __a, __a};
}

static __inline__ int16_t __DEFAULT_FN_ATTRS wasm_i16x8_extract_lane(v128_t __a,
                                                                     int __i)
    __REQUIRE_CONSTANT(__i) {
@ -441,16 +549,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_replace_lane(v128_t __a,
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_replace_lane(
    v128_t __a, int __i, uint16_t __b) __REQUIRE_CONSTANT(__i) {
  __u16x8 __v = (__u16x8)__a;
  __v[__i] = __b;
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_splat(int32_t __a) {
  return (v128_t)(__i32x4){__a, __a, __a, __a};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_splat(uint32_t __a) {
  return (v128_t)(__u32x4){__a, __a, __a, __a};
}

static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_extract_lane(v128_t __a,
                                                                     int __i)
    __REQUIRE_CONSTANT(__i) {
  return ((__i32x4)__a)[__i];
}

static __inline__ uint32_t __DEFAULT_FN_ATTRS
wasm_u32x4_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) {
  return ((__u32x4)__a)[__i];
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a,
                                                                    int __i,
                                                                    int32_t __b)
@ -460,16 +584,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a,
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_replace_lane(
    v128_t __a, int __i, uint32_t __b) __REQUIRE_CONSTANT(__i) {
  __u32x4 __v = (__u32x4)__a;
  __v[__i] = __b;
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_splat(int64_t __a) {
  return (v128_t)(__i64x2){__a, __a};
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_splat(uint64_t __a) {
  return (v128_t)(__u64x2){__a, __a};
}

static __inline__ int64_t __DEFAULT_FN_ATTRS wasm_i64x2_extract_lane(v128_t __a,
                                                                     int __i)
    __REQUIRE_CONSTANT(__i) {
  return ((__i64x2)__a)[__i];
}

static __inline__ uint64_t __DEFAULT_FN_ATTRS
wasm_u64x2_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) {
  return ((__u64x2)__a)[__i];
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a,
                                                                    int __i,
                                                                    int64_t __b)
@ -479,6 +619,13 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a,
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_replace_lane(
    v128_t __a, int __i, uint64_t __b) __REQUIRE_CONSTANT(__i) {
  __u64x2 __v = (__u64x2)__a;
  __v[__i] = __b;
  return (v128_t)__v;
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_splat(float __a) {
  return (v128_t)(__f32x4){__a, __a, __a, __a};
}
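The unsigned lane accessors added above behave like their signed counterparts except for the element type: extract returns a uint*_t and replace takes one, and the lane index must still be a constant. An illustrative sketch (helper name invented):

#include <wasm_simd128.h>

/* Copy lane 0 into lane 3, reading and writing through the unsigned view. */
static inline v128_t duplicate_low_lane(v128_t v) {
  uint32_t lo = wasm_u32x4_extract_lane(v, 0);
  return wasm_u32x4_replace_lane(v, 3, lo);
}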
@ -804,7 +951,7 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i8x16_all_true(v128_t __a) {
  return __builtin_wasm_all_true_i8x16((__i8x16)__a);
}

static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) {
  return __builtin_wasm_bitmask_i8x16((__i8x16)__a);
}
@ -813,17 +960,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) {
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i8x16)__a << __b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i8x16)__a >> __b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__u8x16)__a >> __b);
}
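Across the i8x16/i16x8/i32x4/i64x2 families, the bitmask helpers now return uint32_t and the shift intrinsics take their count as uint32_t; the count applies uniformly to every lane. A short sketch, illustrative only, still assuming -msimd128:

#include <wasm_simd128.h>

/* One bit per lane with the sign bit set, now typed uint32_t. */
static inline uint32_t sign_bits(v128_t v) {
  return wasm_i8x16_bitmask(v);
}

/* Uniform left shift by two; the count parameter is a uint32_t. */
static inline v128_t times_four(v128_t v) {
  return wasm_i8x16_shl(v, 2);
}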
@ -894,22 +1041,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i16x8_all_true(v128_t __a) {
  return __builtin_wasm_all_true_i16x8((__i16x8)__a);
}

static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) {
  return __builtin_wasm_bitmask_i16x8((__i16x8)__a);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shl(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i16x8)__a << __b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i16x8)__a >> __b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__u16x8)__a >> __b);
}
@ -985,22 +1132,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i32x4_all_true(v128_t __a) {
  return __builtin_wasm_all_true_i32x4((__i32x4)__a);
}

static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) {
  return __builtin_wasm_bitmask_i32x4((__i32x4)__a);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shl(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i32x4)__a << __b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i32x4)__a >> __b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__u32x4)__a >> __b);
}
@ -1056,22 +1203,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i64x2_all_true(v128_t __a) {
  return __builtin_wasm_all_true_i64x2((__i64x2)__a);
}

static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) {
static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) {
  return __builtin_wasm_bitmask_i64x2((__i64x2)__a);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i64x2)__a << (int64_t)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__i64x2)__a >> (int64_t)__b);
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_shr(v128_t __a,
                                                            int32_t __b) {
                                                            uint32_t __b) {
  return (v128_t)((__u64x2)__a >> (int64_t)__b);
}
@ -1150,14 +1297,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_max(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a,
                                                             v128_t __b) {
  __i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a);
  return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b);
  return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a,
                                                             v128_t __b) {
  __i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b);
  return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b);
  return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask));
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) {
@ -1220,14 +1365,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_max(v128_t __a,

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a,
                                                             v128_t __b) {
  __i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a);
  return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b);
  return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
}

static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a,
                                                             v128_t __b) {
  __i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b);
  return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b);
  return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask));
}

static __inline__ v128_t __DEFAULT_FN_ATTRS
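wasm_f32x4_pmin/pmax and the f64x2 versions previously open-coded the pseudo-minimum/maximum as a compare-and-select; they now call dedicated builtins. Judging from the old mask expression in the diff, the per-lane result of pmin is b < a ? b : a, so when the comparison is false (including NaN operands) the first argument is returned. An illustrative sketch, with an invented helper name:

#include <wasm_simd128.h>

/* Per lane: x < limit ? x : limit, i.e. a non-IEEE "pseudo" upper clamp. */
static inline v128_t clamp_upper(v128_t x, v128_t limit) {
  return wasm_f32x4_pmin(limit, x);
}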
4 lib/include/wmmintrin.h vendored
@ -10,6 +10,10 @@
#ifndef __WMMINTRIN_H
#define __WMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <emmintrin.h>

#include <__wmmintrin_aes.h>
12 lib/include/x86gprintrin.h vendored
@ -20,4 +20,16 @@
#include <uintrintrin.h>
#endif

#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
    defined(__CRC32__)
#include <crc32intrin.h>
#endif

#define __SSC_MARK(Tag) \
  __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
                       "mov {%0, %%ebx|ebx, %0}; " \
                       ".byte 0x64, 0x67, 0x90; " \
                       "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
                       : "%eax");

#endif /* __X86GPRINTRIN_H */
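The new __SSC_MARK macro emits the mov/.byte 0x64, 0x67, 0x90 sequence with the tag as an immediate operand, which analysis tools such as Intel SDE treat as a region marker. A sketch of intended use; the tag values and function name are arbitrary and shown only for illustration:

#include <x86gprintrin.h>

void region_of_interest(void) {
  __SSC_MARK(0x111); /* begin marker */
  /* ... code to be traced or simulated ... */
  __SSC_MARK(0x222); /* end marker */
}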
14 lib/include/xmmintrin.h vendored
@ -10,6 +10,10 @@
#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <mmintrin.h>

typedef int __v4si __attribute__((__vector_size__(16)));
@ -2181,7 +2185,7 @@ void _mm_sfence(void);
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
#define _mm_extract_pi16(a, n) \
  (int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))

/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
@ -2212,7 +2216,7 @@ void _mm_sfence(void);
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
#define _mm_insert_pi16(a, d, n) \
  (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))

/// Compares each of the corresponding packed 16-bit integer values of
/// the 64-bit integer vectors, and writes the greater value to the
@ -2359,7 +2363,7 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
/// 11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))

/// Conditionally copies the values from each 8-bit element in the first
/// 64-bit integer vector operand to the specified memory location, as
@ -2601,8 +2605,8 @@ void _mm_setcsr(unsigned int __i);
/// 11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask))
                                 (int)(mask)))

/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
56 lib/include/xopintrin.h vendored
@ -225,16 +225,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B)
}

#define _mm_roti_epi8(A, N) \
  (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))
  ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)))

#define _mm_roti_epi16(A, N) \
  (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))
  ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)))

#define _mm_roti_epi32(A, N) \
  (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))
  ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)))

#define _mm_roti_epi64(A, N) \
  (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))
  ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)))

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shl_epi8(__m128i __A, __m128i __B)
@ -285,36 +285,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B)
}

#define _mm_com_epu8(A, B, N) \
  (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
                                   (__v16qi)(__m128i)(B), (N))
                                   (__v16qi)(__m128i)(B), (N)))

#define _mm_com_epu16(A, B, N) \
  (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
                                   (__v8hi)(__m128i)(B), (N))
                                   (__v8hi)(__m128i)(B), (N)))

#define _mm_com_epu32(A, B, N) \
  (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
                                   (__v4si)(__m128i)(B), (N))
                                   (__v4si)(__m128i)(B), (N)))

#define _mm_com_epu64(A, B, N) \
  (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
                                   (__v2di)(__m128i)(B), (N))
                                   (__v2di)(__m128i)(B), (N)))

#define _mm_com_epi8(A, B, N) \
  (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
                                  (__v16qi)(__m128i)(B), (N))
                                  (__v16qi)(__m128i)(B), (N)))

#define _mm_com_epi16(A, B, N) \
  (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
                                  (__v8hi)(__m128i)(B), (N))
                                  (__v8hi)(__m128i)(B), (N)))

#define _mm_com_epi32(A, B, N) \
  (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
                                 (__v4si)(__m128i)(B), (N))
                                 (__v4si)(__m128i)(B), (N)))

#define _mm_com_epi64(A, B, N) \
  (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
  ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
                                 (__v2di)(__m128i)(B), (N))
                                 (__v2di)(__m128i)(B), (N)))

#define _MM_PCOMCTRL_LT 0
#define _MM_PCOMCTRL_LE 1
@ -710,23 +710,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B)
}

#define _mm_permute2_pd(X, Y, C, I) \
  (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
  ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
                                      (__v2df)(__m128d)(Y), \
                                      (__v2di)(__m128i)(C), (I))
                                      (__v2di)(__m128i)(C), (I)))

#define _mm256_permute2_pd(X, Y, C, I) \
  (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
  ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
                                         (__v4df)(__m256d)(Y), \
                                         (__v4di)(__m256i)(C), (I))
                                         (__v4di)(__m256i)(C), (I)))

#define _mm_permute2_ps(X, Y, C, I) \
  (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
  ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
                                     (__v4si)(__m128i)(C), (I))
                                     (__v4si)(__m128i)(C), (I)))

#define _mm256_permute2_ps(X, Y, C, I) \
  (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
  ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
                                        (__v8sf)(__m256)(Y), \
                                        (__v8si)(__m256i)(C), (I))
                                        (__v8si)(__m256i)(C), (I)))

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_frcz_ss(__m128 __A)